# Importations

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm import tnrange
from tqdm.notebook import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor

# Creation of train_set and test_set

In [3]:
train_set = pd.read_csv('data/train_set.csv')
test_set = pd.read_csv('data/test_set.csv')

In [4]:
def create_sets(dataset) :

    X = dataset.drop(['Y_0','Y_1', 'Y_2', 'Y_3', 'Y_4', 'Y_5', 'Y_6', 'Y_7', 'Y_8', 'Y_9', 'Y_10', 'Y_11','Y_12',
                              'Y_13','Y_14', 'Y_15', 'Y_16', 'Y_17', 'Y_18', 'Y_19', 'Y_20', 'Y_21', 'Y_22', 'Y_23'], axis=1)
    
    Y = dataset.drop(['X_0', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10', 'X_11','X_12',
                              'X_13','X_14', 'X_15', 'X_16', 'X_17', 'X_18', 'X_19', 'X_20', 'X_21', 'X_22', 'X_23'], axis=1)


    return (X,Y)

In [5]:
X_train, Y_train = create_sets(train_set)
X_test, Y_test = create_sets(test_set) 

In [6]:
X_train.describe()

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_14,X_15,X_16,X_17,X_18,X_19,X_20,X_21,X_22,X_23
count,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,...,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0
mean,1.146616,0.507799,0.0,0.0,0.070799,2.311949,4.87183,6.250237,7.718703,7.356571,...,5.579276,5.566255,5.898277,6.679506,6.848773,6.482843,4.873864,3.650075,3.202496,3.282517
std,1.059367,0.621134,0.0,0.0,0.238223,1.897051,2.527264,3.482964,4.092989,3.750953,...,2.905553,2.90658,3.131703,4.030889,4.10416,3.995929,2.644393,1.891158,1.722928,1.697661
min,0.0,0.0,0.0,0.0,0.0,0.0,0.792079,0.792079,1.188119,2.138614,...,1.158416,1.158416,1.158416,1.158416,1.158416,0.792079,0.772277,0.772277,0.732673,0.752475
25%,0.0,0.0,0.0,0.0,0.0,0.891089,3.089109,3.336634,4.871287,4.871287,...,2.613861,2.613861,2.613861,3.089109,3.059406,3.059406,2.970297,2.19802,1.663366,2.138614
50%,0.891089,0.0,0.0,0.0,0.0,1.782178,4.514851,5.940594,7.247525,6.772277,...,5.346535,5.346535,6.29703,6.772277,7.128713,6.623762,4.514851,3.564356,3.564356,3.564356
75%,2.257426,0.891089,0.0,0.0,0.0,3.564356,5.940594,7.722772,8.673267,8.19802,...,6.772277,6.772277,7.247525,8.19802,9.148515,8.19802,5.940594,4.514851,3.564356,3.564356
max,4.039604,2.039604,0.0,0.0,0.861386,7.188119,11.405941,15.089109,17.346535,16.871287,...,12.118812,12.118812,13.544554,16.39604,16.39604,16.39604,10.772277,8.554455,7.128713,7.128713


In [7]:
X_test.describe()

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_14,X_15,X_16,X_17,X_18,X_19,X_20,X_21,X_22,X_23
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,...,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,1.72642,0.313184,0.0,0.0,0.025013,3.328817,5.616467,7.572173,8.798854,8.332465,...,6.456488,6.456488,6.781657,7.732152,7.782178,7.253257,5.433038,3.979156,3.701928,3.82074
std,1.011933,0.519267,0.0,0.0,0.109029,1.696394,2.488284,3.527842,4.186151,3.716426,...,2.541127,2.541127,2.899757,4.102023,4.114561,3.819546,2.666342,1.768392,1.643902,1.723347
min,0.0,0.0,0.0,0.0,0.0,0.0,2.19802,2.19802,3.534653,3.534653,...,3.772277,3.772277,3.29703,2.346535,2.346535,2.336634,1.564356,1.564356,1.485149,1.207921
25%,1.049505,0.0,0.0,0.0,0.0,2.673267,3.861386,5.465347,6.831683,5.985149,...,5.346535,5.346535,4.871287,5.584158,5.821782,5.168317,4.277228,3.564356,3.564356,3.564356
50%,1.782178,0.0,0.0,0.0,0.0,3.148515,4.990099,6.831683,7.247525,6.772277,...,5.346535,5.346535,6.29703,7.247525,7.247525,6.772277,4.514851,3.564356,3.564356,3.564356
75%,2.257426,0.747525,0.0,0.0,0.0,4.009901,6.178218,8.940594,9.415842,8.910891,...,6.772277,6.772277,7.722772,9.148515,9.148515,8.435644,7.425743,4.277228,3.564356,4.039604
max,4.039604,1.782178,0.0,0.0,0.475248,7.663366,10.990099,15.50495,17.346535,16.39604,...,12.118812,12.118812,13.544554,16.39604,16.39604,15.50495,10.39604,8.079208,7.128713,7.60396


# Linear Model

In [8]:
model = LinearRegression()
model.fit(X_train, Y_train)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

score MSE on the train_set :  0.19172069399492322
score MSE on the test_set :  0.8708515038988245


# Ridge

In [9]:
model = MultiOutputRegressor(Ridge())
model.fit(X_train, Y_train)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

score MSE on the train_set :  0.22296139320128236
score MSE on the test_set :  0.367754094322364


In [10]:
hyper_params={'estimator__fit_intercept' : [True, False],
              'estimator__alpha' : [0.01,0.1, 1, 10],
              'estimator__tol' : [0.0001,0.001,0.01],
              'estimator__max_iter' : [100, 200, 1000]
             }


model = GridSearchCV(MultiOutputRegressor(Ridge()), param_grid=hyper_params,
                                        scoring='neg_root_mean_squared_error', cv=5)

model.fit(X_train, Y_train)

print(model.best_params_)


pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))


{'estimator__alpha': 1, 'estimator__fit_intercept': False, 'estimator__max_iter': 100, 'estimator__tol': 0.0001}
score MSE on the train_set :  0.23823728598994234
score MSE on the test_set :  0.3497778488769704


# RandomForest

In [11]:
model = RandomForestRegressor()
model.fit(X_train, Y_train)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

score MSE on the train_set :  0.10078502739725952
score MSE on the test_set :  0.870514947368425


In [12]:
hyper_params={'n_estimators' : [200,400,800],
              'min_samples_split': [2,3],
              'max_features' : [0.4,0.8],
              'max_depth' : [4,6,10],
              'min_samples_leaf' : [1,2],
              'bootstrap' : [False, True],
             }


model = GridSearchCV(RandomForestRegressor(n_jobs=1), param_grid=hyper_params,
                                        scoring='neg_root_mean_squared_error', cv=5)

model.fit(X_train, Y_train)

print(model.best_params_)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

{'bootstrap': False, 'max_depth': 10, 'max_features': 0.4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
score MSE on the train_set :  0.003575178322772086
score MSE on the test_set :  0.8391574353654975


# SVM

In [13]:
model = MultiOutputRegressor(svm.SVR())
model.fit(X_train, Y_train)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

score MSE on the train_set :  0.7639282187484986
score MSE on the test_set :  0.9628736535640292


In [14]:
hyper_params={'estimator__kernel' : ['linear','rbf'],
              'estimator__epsilon' : [0.001,0.01],
              'estimator__C' : [0.01, 0.1,1],
              'estimator__gamma' : ['auto', 'scale'],
             }


model = GridSearchCV(MultiOutputRegressor(svm.SVR()), param_grid=hyper_params,
                     scoring='neg_root_mean_squared_error', cv=5)

model.fit(X_train, Y_train)

print(model.best_params_)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

{'estimator__C': 0.1, 'estimator__epsilon': 0.01, 'estimator__gamma': 'auto', 'estimator__kernel': 'linear'}
score MSE on the train_set :  0.27652590966572116
score MSE on the test_set :  0.36802575278701727


# Gradient Boosting

In [15]:
model = MultiOutputRegressor(GradientBoostingRegressor())

model.fit(X_train, Y_train)


pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

score MSE on the train_set :  0.004204198531889425
score MSE on the test_set :  0.7667222740024954


In [16]:
hyper_params={'estimator__n_estimators' : [500,1000],
              'estimator__max_depth' : [2,3,4],
              'estimator__learning_rate' : [0.05, 0.1],
              'estimator__subsample' : [0.5,1],
              'estimator__init' : [Ridge(), RandomForestRegressor(), svm.SVR()]
             }




model = GridSearchCV(MultiOutputRegressor(GradientBoostingRegressor(init=Ridge())), param_grid=hyper_params,
                     scoring='neg_root_mean_squared_error', cv=5)

model.fit(X_train, Y_train)

print(model.best_params_)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

{'estimator__init': Ridge(), 'estimator__learning_rate': 0.05, 'estimator__max_depth': 2, 'estimator__n_estimators': 500, 'estimator__subsample': 0.5}
score MSE on the train_set :  0.006503495029480505
score MSE on the test_set :  0.2740283409694716


In [17]:
hyper_params_ridge={'fit_intercept' : [True,False],
              'alpha' : [0.1, 1, 10],
              'tol' : [0.0001,0.01],
              'max_iter' : [100, 200,500]
             }



ridge = GridSearchCV(Ridge(), param_grid=hyper_params_ridge,
                                        scoring='neg_root_mean_squared_error', cv=5)

ridge.fit(X_train, Y_train)

hyper_params={'estimator__n_estimators' : [500,1000],
              'estimator__max_depth' : [2,4],
              'estimator__learning_rate' : [0.05, 0.1],
              'estimator__subsample' : [0.5,1],
              'estimator__min_samples_leaf' : [1,2]
             }


model = GridSearchCV(MultiOutputRegressor(GradientBoostingRegressor(init=ridge)), param_grid=hyper_params,
                     scoring='neg_root_mean_squared_error', cv=5)
model.fit(X_train, Y_train)

print(model.best_params_)

pred = model.predict(X_train)
print("score MSE on the train_set : ",(np.linalg.norm(pred-Y_train)**2/(Y_train.shape[0])))

pred = model.predict(X_test)
print("score MSE on the test_set : ", (np.linalg.norm(pred-Y_test)**2/(Y_test.shape[0])))

{'estimator__learning_rate': 0.05, 'estimator__max_depth': 4, 'estimator__min_samples_leaf': 1, 'estimator__n_estimators': 500, 'estimator__subsample': 0.5}
score MSE on the train_set :  7.559931152345061e-05
score MSE on the test_set :  0.2556659574055599
