%%latex
\tableofcontents

In [1]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 300
import random
import csv
import pandas as pd
import sklearn
import time
# import torch
# from torch import nn
# from torch.utils.data import Dataset, DataLoader
# from torchvision.transforms import ToTensor 
# import matplotlib.cm as cm

# Introduction

In this notebook, we investigate different methods for performing the conservative-to-primitive (C2P) transformation. We look at random forest regression and least-squares support vector machines (LSSVMs).

# Data and methods

In [2]:
# Define the three functions determining the output
def eos(rho, eps, Gamma = 5/3):
    """Analytical gamma-law equation of state.

    Parameters
    ----------
    rho, eps : numeric
        The variables rho and epsilon.
    Gamma : numeric, optional
        The Gamma appearing in the gamma law (default 5/3).

    Returns
    -------
    The pressure p = (Gamma - 1) * rho * eps.
    """
    gamma_minus_one = Gamma - 1
    return gamma_minus_one * rho * eps

def h(rho, eps, v):
    """Enthalpy, h = 1 + eps + p / rho.

    The pressure p comes from ``eos(rho, eps)`` with its default Gamma.
    ``v`` is not used in the computation.
    """
    return 1 + eps + eos(rho, eps) / rho

def W(rho, eps, v):
    """Lorentz factor W = (1 - v**2)**(-1/2), with v = v_x (1D case).

    ``rho`` and ``eps`` are not used in the computation.
    """
    one_minus_v_squared = 1 - v**2
    return one_minus_v_squared ** (-1/2)

def D(rho, eps, v):
    """Quantity D = rho * W (see eq. 2 of the paper)."""
    lorentz_factor = W(rho, eps, v)
    return rho * lorentz_factor

def S(rho, eps, v):
    """Quantity S = rho * h * W**2 * v (see eq. 2 of the paper).

    Note: 1D only for now, so v is the single velocity component.
    """
    enthalpy = h(rho, eps, v)
    lorentz_squared = W(rho, eps, v) ** 2
    return rho * enthalpy * lorentz_squared * v

def tau(rho, eps, v):
    """Quantity tau = rho * h * W**2 - p - D (see eq. 2 of the paper)."""
    enthalpy = h(rho, eps, v)
    lorentz_squared = W(rho, eps, v) ** 2
    pressure = eos(rho, eps)
    return rho * enthalpy * lorentz_squared - pressure - D(rho, eps, v)

We generate the training set by sampling each parameter uniformly at random from the following ranges:
- $\rho \in (0, 10.1)$,
- $\epsilon \in (0, 2.02)$, 
- $v_x \in (0, 0.721)$.

In [2]:
# Lower and upper sampling bounds for each parameter (see paper Section 2.1)
rho_min, rho_max = 0, 10.1
eps_min, eps_max = 0, 2.02
v_min, v_max = 0, 0.721

Note: the commented-out code below was used to generate the data. The data is now stored separately in a folder called "data".

In [3]:
# number_of_datapoints = 10000
# data = []

# for i in range(number_of_datapoints):
#     rho = random.uniform(rho_min, rho_max)
#     eps = random.uniform(eps_min, eps_max)
#     v     = random.uniform(v_min, v_max)
    
#     p               = eos(rho, eps)
#     Dvalue    = D(rho, eps, v)
#     Svalue     = S(rho, eps, v)
#     tauvalue = tau(rho, eps, v)
    
#     new_row = [rho, eps, v, p, Dvalue, Svalue, tauvalue]
    
#     data.append(new_row)

Save the data in a CSV file:

In [4]:
# header = ['rho', 'eps', 'v', 'p', 'D', 'S', 'tau']

# with open('data/NNC2P_data_test.csv', 'w', newline = '') as file:
#     writer = csv.writer(file)
#     # write header
#     writer.writerow(header)
#     # write data
#     writer.writerows(data)

In [5]:
# Load the pre-generated training and test sets from the "data" folder
data_train = pd.read_csv("data/NNC2P_data_train.csv")
data_test = pd.read_csv("data/NNC2P_data_test.csv")

# Report how many instances each set contains
print(f"The training data has {len(data_train)} instances")
print(f"The test data has {len(data_test)} instances")

# Last expression of the cell: display the training DataFrame
data_train

The training data has 80000 instances
The test data has 10000 instances


Unnamed: 0,rho,eps,v,p,D,S,tau
0,0.662984,0.084146,0.218802,0.037192,0.679448,0.173724,0.077335
1,8.565808,0.205945,0.657351,1.176059,11.366755,13.318537,7.718100
2,4.387112,1.598809,0.021593,4.676103,4.388135,0.347321,7.020631
3,5.337054,0.530803,0.351307,1.888615,5.700396,4.031171,3.885760
4,1.133895,0.786717,0.079475,0.594703,1.137493,0.209600,0.905115
...,...,...,...,...,...,...,...
79995,8.101834,0.428605,0.616897,2.314990,10.294002,13.832316,9.813427
79996,7.841014,1.125480,0.209087,5.883268,8.018242,4.930289,9.678536
79997,4.628822,0.194190,0.237759,0.599248,4.765476,1.544018,1.129323
79998,9.913117,1.152242,0.477216,7.614874,11.280468,17.889657,18.592193


## Rework data into appropriate format

In [6]:
# Targets: the p column of the training set
y_train = list(data_train["p"].values)

# Inputs: the D, S and tau columns of the training set
D_values = data_train["D"].values
S_values = data_train["S"].values
tau_values = data_train["tau"].values

# One [D, S, tau] row per training instance
X_train = [list(row) for row in zip(D_values, S_values, tau_values)]

In [7]:
# Targets: the p column of the test set
y_test = list(data_test["p"].values)

# Inputs: the D, S and tau columns of the test set
D_values = data_test["D"].values
S_values = data_test["S"].values
tau_values = data_test["tau"].values

# One [D, S, tau] row per test instance
X_test = [list(row) for row in zip(D_values, S_values, tau_values)]

In [8]:
# Preview the first three feature rows, each of the form [D, S, tau]
X_train[:3]

[[0.6794479873399221, 0.1737236543879239, 0.0773353397338715],
 [11.366754576378616, 13.318537432696782, 7.718099642903207],
 [4.388134954254378, 0.3473214062039617, 7.020631287593812]]

In [9]:
# Preview the first three targets (values of the p column)
y_train[:3]

[0.0371918424194553, 1.1760592057471289, 4.676103122386751]

# Random forest regression

__NOTE:__ the predictions are poor. We need to read more about this method in order to understand how to tune its hyperparameters.

Taken from [this documentation page](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html).

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

In [11]:
# Fit a random forest regressor mapping [D, S, tau] -> p.
# A fixed random_state makes the fit (and hence the reported MSE) reproducible
# under Restart Kernel -> Run All; all other hyperparameters keep their defaults.
regr = RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)

RandomForestRegressor()

In [12]:
# Predict p for the test inputs with the fitted random forest
y_pred = regr.predict(X_test)

# Mean squared error between the true and predicted values on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

0.005007216202530824


# LSSVMs

Can use [this repo](https://github.com/DannyVanpoucke/LSSVMlib/blob/master/examples/LSSVMlib-example_sklearn.ipynb).

In [19]:
from LSSVMlib.LSSVMRegression import LSSVMRegression
from sklearn.metrics import r2_score

In order to start working with the module, it can be useful to read [this example](https://github.com/DannyVanpoucke/LSSVMlib/blob/master/examples/LSSVMlib-example_sklearn.ipynb). 

Note: the cost of training an SVM grows at least as $\mathcal{O}(N^2)$ with the number of training instances $N$, so we limit the amount of training data.

In [40]:
# Number of instances kept from both the training and the test set
subsize_data = 500

# --- Limit the DataFrames to their first `subsize_data` rows
sub_data_train = data_train.iloc[:subsize_data]
sub_data_test = data_test.iloc[:subsize_data]

# --- Apply the same limit to the list-based feature/target representation
sub_X_train = X_train[:subsize_data]
sub_y_train = y_train[:subsize_data]
sub_X_test = X_test[:subsize_data]
sub_y_test = y_test[:subsize_data]

In [41]:
# Convert the list-based subsets into NumPy arrays
train_features, train_labels = np.array(sub_X_train), np.array(sub_y_train)
test_features, test_labels = np.array(sub_X_test), np.array(sub_y_test)

# Sanity check: show the first three training samples
print("Features:", train_features[:3], sep="\n")
print("Labels:", train_labels[:3], sep="\n")

Features:
[[ 0.67944799  0.17372365  0.07733534]
 [11.36675458 13.31853743  7.71809964]
 [ 4.38813495  0.34732141  7.02063129]]
Labels:
[0.03719184 1.17605921 4.67610312]


## RBF kernel:

In [42]:
# Fit an LSSVM regressor with an RBF kernel and time the training step.
ndata = train_features.shape[0]
clf = LSSVMRegression(
        gamma=100,     # gamma hyperparameter of the LSSVM
        kernel='rbf',  # use the RBF kernel
        sigma=1.0,     # sigma hyperparameter (presumably the RBF width -- confirm in LSSVMlib docs)
        c=0.01,        # presumably only used by the polynomial kernel -- TODO confirm
        d=2)           # presumably only used by the polynomial kernel -- TODO confirm

# Time only the fit itself, using a monotonic high-resolution clock, so that
# model construction and printing do not contribute to the reported duration.
start = time.perf_counter()
clf.fit(train_features, train_labels) # train our model, aka solve the set of linear equations
end = time.perf_counter()
elapsed = end-start

print("b = ", clf.intercept_)
print("Trained on %d instances and took %0.2f seconds" % (ndata, elapsed))
# print("a_i = ", clf.coef_)

b =  5.024197009415371
Trained on 500 instances and took 0.10 seconds


In [43]:
# Predict for test set
predictions  =clf.predict(test_features)

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(test_labels, predictions))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(test_labels, predictions))

Mean squared error: 1.15
Coefficient of determination: 0.87


## Poly kernel:

In [44]:
# Fit an LSSVM regressor with a polynomial kernel and time the training step.
ndata = train_features.shape[0]
clf = LSSVMRegression(
        gamma=100,      # gamma hyperparameter of the LSSVM
        kernel='poly',  # use the polynomial kernel
        sigma=1.0,      # presumably only used by the RBF kernel -- TODO confirm
        c=0.01,         # polynomial-kernel parameter c (see LSSVMlib docs)
        d=2)            # polynomial-kernel parameter d (presumably the degree -- confirm)

# Time only the fit itself, using a monotonic high-resolution clock, so that
# model construction and printing do not contribute to the reported duration.
start = time.perf_counter()
clf.fit(train_features, train_labels) # train our model, aka solve the set of linear equations
end = time.perf_counter()
elapsed = end-start

print("b = ", clf.intercept_)
print("Trained on %d instances and took %0.2f seconds" % (ndata, elapsed))
# print("a_i = ", clf.coef_)

b =  0.007781134495329835
Trained on 500 instances and took 0.07 seconds


In [45]:
# Predict for test set
predictions  =clf.predict(test_features)

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(test_labels, predictions))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(test_labels, predictions))

Mean squared error: 0.03
Coefficient of determination: 1.00


## Grid search:

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
# Candidate values, one per decade from 1e-3 to 1e3, tried for both gamma and sigma
decade_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Search space: RBF kernel only, all (gamma, sigma) combinations
parameters = {
    'kernel': ['rbf'],
    'gamma': list(decade_grid),
    'sigma': list(decade_grid),
}

# Grid search with GridSearchCV's default settings, fitted on the training subset
lssvm = LSSVMRegression()
clf = GridSearchCV(lssvm, parameters)
clf.fit(train_features, train_labels)

GridSearchCV(estimator=LSSVMRegression(kernel='rbf'),
             param_grid={'gamma': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                         'kernel': ['rbf'],
                         'sigma': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]})