In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import itertools
from matplotlib.pyplot import figure

In [2]:
train = pd.read_csv('train.csv')
val=pd.read_csv('val.csv')

In [3]:
df=pd.concat([train, val], axis=0)

In [4]:
test = pd.read_csv('test.csv')

In [5]:
y_tot=df['data_IMDBscore']
y_train=train['data_IMDBscore']
y_test=test['data_IMDBscore']

In [6]:
df.drop('data_IMDBscore', inplace=True, axis=1)
test.drop('data_IMDBscore', inplace=True, axis=1)
train.drop('data_IMDBscore', inplace=True, axis=1)

In [7]:
from sklearn.svm import LinearSVR

In [8]:
eps = 5
svr = LinearSVR(epsilon=eps, C=0.01, fit_intercept=True,verbose=1)

In [9]:
svr.fit(train,y_train)

[LibLinear]

LinearSVR(C=0.01, epsilon=5, verbose=1)

In [10]:
def svr_results(y_test, X_test, fitted_svr_model):
    
    print("C: {}".format(fitted_svr_model.C))
    print("Epsilon: {}".format(fitted_svr_model.epsilon))
    
    print("Intercept: {:,.3f}".format(fitted_svr_model.intercept_[0]))
    print("Coefficient: {:,.3f}".format(fitted_svr_model.coef_[0]))
    
    mae = mean_squared_error(y_test, fitted_svr_model.predict(X_test))
    print("MSE = ",mae)
    print("RMSE= ",mae**0.5)
    
    perc_within_eps = 100*np.sum(y_test - fitted_svr_model.predict(X_test) < eps) / len(y_test)
    print("Percentage within Epsilon = {:,.2f}%".format(perc_within_eps))
   

In [11]:
from sklearn.metrics import mean_squared_error
svr_results(y_test, test, svr)

C: 0.01
Epsilon: 5
Intercept: 2.067
Coefficient: 0.065
MSE =  11.444897084288295
RMSE=  3.383030754262854
Percentage within Epsilon = 98.38%


In [12]:
svr = LinearSVR(epsilon=eps, C=1, fit_intercept=True,verbose=1)
svr.fit(train,y_train)
svr_results(y_test, test, svr)

[LibLinear]C: 1
Epsilon: 5
Intercept: 2.313
Coefficient: 0.121
MSE =  10.257289553040286
RMSE=  3.2027003533019265
Percentage within Epsilon = 99.48%


In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:

grid = {
    'C': np.linspace(0.01, 10),
    'epsilon': np.linspace(0.01, 10)
}
    
svr_gridsearch =  LinearSVR(fit_intercept=True, max_iter=1000)

In [15]:
grid_svr = GridSearchCV(svr_gridsearch, grid, scoring='neg_mean_squared_error', cv=2,n_jobs=-1,verbose=1)

In [16]:
y_val=val['data_IMDBscore']
val.drop('data_IMDBscore', inplace=True, axis=1)
grid_svr.fit(val, y_val)

Fitting 2 folds for each of 2500 candidates, totalling 5000 fits


GridSearchCV(cv=2, estimator=LinearSVR(), n_jobs=-1,
             param_grid={'C': array([ 0.01      ,  0.21387755,  0.4177551 ,  0.62163265,  0.8255102 ,
        1.02938776,  1.23326531,  1.43714286,  1.64102041,  1.84489796,
        2.04877551,  2.25265306,  2.45653061,  2.66040816,  2.86428571,
        3.06816327,  3.27204082,  3.47591837,  3.67979592,  3.88367347,
        4.08755102,  4.29142857,  4.49530612,  4.69918367,  4.90306122,
        5.10693878...
        4.08755102,  4.29142857,  4.49530612,  4.69918367,  4.90306122,
        5.10693878,  5.31081633,  5.51469388,  5.71857143,  5.92244898,
        6.12632653,  6.33020408,  6.53408163,  6.73795918,  6.94183673,
        7.14571429,  7.34959184,  7.55346939,  7.75734694,  7.96122449,
        8.16510204,  8.36897959,  8.57285714,  8.77673469,  8.98061224,
        9.1844898 ,  9.38836735,  9.5922449 ,  9.79612245, 10.        ])},
             scoring='neg_mean_squared_error', verbose=1)

In [17]:
best_grid_svr_mae = grid_svr.best_estimator_
print(grid_svr.best_estimator_)
best_grid_svr_mae.fit(train, y_train)
svr_results(y_test, test, best_grid_svr_mae)

LinearSVR(C=0.21387755102040817, epsilon=1.029387755102041)
C: 0.21387755102040817
Epsilon: 1.029387755102041
Intercept: 5.850
Coefficient: 0.137
MSE =  0.7821028580968622
RMSE=  0.884365794282469
Percentage within Epsilon = 100.00%


In [18]:
svr_results(y_train, train, best_grid_svr_mae)

C: 0.21387755102040817
Epsilon: 1.029387755102041
Intercept: 5.850
Coefficient: 0.137
MSE =  0.5784278890005977
RMSE=  0.7605444687857493
Percentage within Epsilon = 100.00%


In [20]:
svr_results(y_val, val, best_grid_svr_mae)

C: 0.21387755102040817
Epsilon: 1.029387755102041
Intercept: 5.850
Coefficient: 0.137
MSE =  0.7981994694230311
RMSE=  0.8934200968318494
Percentage within Epsilon = 100.00%
