In [1]:
import csv
import numpy as np

import sklearn
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import preprocessing

from plotting_tools import plotting_feats

In [2]:
from sklearn.gaussian_process.kernels import DotProduct, RationalQuadratic, RBF, ConstantKernel, WhiteKernel

In [3]:
def extractData(filename):
    lst=[]
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            lst.append(row)
    data=np.array(lst, dtype=float)
    return data

def writeResult(filename, data):
    with open(filename, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        for row in data:
            spamwriter.writerow(row)

## Training

In [4]:
#Do this for every set and version of the csv available
xtrain = extractData('../NewRealistic/v0c0train_x.csv')
ytrain = extractData('../NewRealistic/v0c0train_y.csv')
xtest = extractData('../NewRealistic/v0c0test_x.csv')
ytest = extractData('../NewRealistic/v0c0test_y.csv')

In [5]:
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)

In [6]:
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [7]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=3, copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)

CPU times: user 4h 29min 38s, sys: 7min 27s, total: 4h 37min 6s
Wall time: 1h 15min 59s


0.944156936020389

In [8]:
writeResult('GPR_v0c0.csv',predicted_data)

### More

In [9]:
xtrain = extractData('../NewRealistic/v0c1train_x.csv')
ytrain = extractData('../NewRealistic/v0c1train_y.csv')
xtest = extractData('../NewRealistic/v0c1test_x.csv')
ytest = extractData('../NewRealistic/v0c1test_y.csv')
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [10]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=3, copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)

CPU times: user 5h 25min 3s, sys: 9min 30s, total: 5h 34min 33s
Wall time: 1h 32min 41s


0.9409232152964251

In [11]:
writeResult('GPR_v0c1.csv',predicted_data)

In [12]:
xtrain = extractData('../NewRealistic/v1c0train_x.csv')
ytrain = extractData('../NewRealistic/v1c0train_y.csv')
xtest = extractData('../NewRealistic/v1c0test_x.csv')
ytest = extractData('../NewRealistic/v1c0test_y.csv')
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [13]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=3, copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)

CPU times: user 4h 8min 23s, sys: 6min 37s, total: 4h 15min 1s
Wall time: 1h 9min 22s


0.7330537424775515

In [14]:
writeResult('GPR_v1c0.csv',predicted_data)

In [15]:
xtrain = extractData('../NewRealistic/v1c1train_x.csv')
ytrain = extractData('../NewRealistic/v1c1train_y.csv')
xtest = extractData('../NewRealistic/v1c1test_x.csv')
ytest = extractData('../NewRealistic/v1c1test_y.csv')
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [16]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=3, copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)

CPU times: user 3h 50min 23s, sys: 6min 28s, total: 3h 56min 52s
Wall time: 1h 5min 4s


0.7274342676559726

In [17]:
writeResult('GPR_v1c1.csv',predicted_data)

In [18]:
xtrain = extractData('../NewRealistic/v2c0train_x.csv')
ytrain = extractData('../NewRealistic/v2c0train_y.csv')
xtest = extractData('../NewRealistic/v2c0test_x.csv')
ytest = extractData('../NewRealistic/v2c0test_y.csv')
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [19]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=3, copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)



CPU times: user 1h 6min 59s, sys: 1min 58s, total: 1h 8min 58s
Wall time: 19min 9s


0.41474493342413776

In [20]:
writeResult('GPR_v2c0.csv',predicted_data)

In [21]:
xtrain = extractData('../NewRealistic/v2c1train_x.csv')
ytrain = extractData('../NewRealistic/v2c1train_y.csv')
xtest = extractData('../NewRealistic/v2c1test_x.csv')
ytest = extractData('../NewRealistic/v2c1test_y.csv')
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [22]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=3, copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)



CPU times: user 1h 48min 7s, sys: 2min 58s, total: 1h 51min 6s
Wall time: 30min 17s


0.3910912018368599

In [23]:
writeResult('GPR_v2c1.csv',predicted_data)

In [None]:
#then test the xtest and put the results in an array
writeResult('GPR_set1_v0.csv',predicted_data) #put the name of your method (nn, svr, gpr), and change the set that you use and the version v0,v1,v2

In [None]:
 RBF(length_scale=[1e-3] * 11,length_scale_bounds=[(1e-4, 1e+2)] * 11)

In [None]:
%load_ext autoreload
%aimport plotting_tools
%autoreload 1
from plotting_tools import plotting_feats
plotting_feats(ytest_scaled, predicted_data, 'DP+RC')

In [None]:
xtrain = extractData('../DataFullProcess/set2_v0_x.csv')
ytrain = extractData('../DataFullProcess/set2_v0_y.csv')
xtest = extractData('../DataFullProcess/predict_v0_x.csv')
ytest = extractData('../DataFullProcess/verify_v0_y.csv')
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain_scaled = scaler.transform(xtrain)
scaler = preprocessing.StandardScaler().fit(xtest)
xtest_scaled = scaler.transform(xtest)
scaler = preprocessing.StandardScaler().fit(ytrain)
ytrain_scaled = scaler.transform(ytrain)
scaler = preprocessing.StandardScaler().fit(ytest)
ytest_scaled = scaler.transform(ytest)
xtrain_scaled = xtrain_scaled[:10000,:]
ytrain_scaled = ytrain_scaled[:10000,:]
xtest_scaled = xtest_scaled[:2000,:]
ytest_scaled = ytest_scaled[:2000,:]

In [None]:
%%time
kernel = DotProduct() + RationalQuadratic(length_scale=1.0, alpha=2, length_scale_bounds=(.1, 10), alpha_bounds=(1e-4, 4))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-7, optimizer='fmin_l_bfgs_b',copy_X_train=False)
gpr.fit(xtrain_scaled, ytrain_scaled)
predicted_data, std = gpr.predict(xtest_scaled, return_std=True)
gpr.score(xtest_scaled, ytest_scaled)

In [None]:
#then test the xtest and put the results in an array
writeResult('GPR_set2_v0.csv',predicted_data) #put the name of your method (nn, svr, gpr), and change the set that you use and the version v0,v1,v2

## Plotting

In [None]:
predicted_data_scaled = extractData('GPR_results/GPR_set1_v2.csv')
ytest = extractData('../DataFullProcess/verify_v2_y.csv')
scaler = preprocessing.StandardScaler().fit(ytest)
predicted_data = scaler.inverse_transform(predicted_data_scaled)
plotting_feats(ytest, predicted_data, 'DP+RC', 'set1_v2')