In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error as mse
from sklearn.metrics import make_scorer
from sklearn.linear_model import *
import pickle

sns.set(style="darkgrid", context="notebook")
rand_seed = 1009
np.random.seed(rand_seed)
xsize, ysize = 12.0, 8.0

import warnings
warnings.filterwarnings('ignore')

In [2]:
X_train = pd.read_csv("../data/X_train.csv", index_col=False)
X_test = pd.read_csv("../data/X_test.csv", index_col=False)
y_train = np.genfromtxt("../data/y_train.txt")
train_id = np.genfromtxt("../data/train_id.txt").astype(int)
test_id = np.genfromtxt("../data/test_id.txt").astype(int)

In [3]:
def log_rmse(y_true, y_pred):
    y_pred[y_pred < 0.0] = 0.0
    return np.sqrt(mse(y_true, y_pred))

log_rmse_scorer = make_scorer(log_rmse, greater_is_better=False)

In [4]:
lin_reg = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=None)
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [5]:
y_pred = lin_reg.predict(X_train)
print("RMSE: "+str("%.5f"%log_rmse(y_train, y_pred)))

RMSE: 0.11098


In [6]:
file = open("../models/linear_regression.pkl", "wb")
file.write(pickle.dumps(lin_reg))
file.close()

In [7]:
ridge_reg = RidgeCV(alphas=np.geomspace(1e-4, 1e4, 500), fit_intercept=True, normalize=False, 
                    scoring=log_rmse_scorer, cv=4, gcv_mode=None, store_cv_values=False)
ridge_reg.fit(X_train, y_train)

RidgeCV(alphas=array([1.00000e-04, 1.03761e-04, ..., 9.63758e+03, 1.00000e+04]),
    cv=4, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=make_scorer(log_rmse, greater_is_better=False),
    store_cv_values=False)

In [8]:
ridge_reg.alpha_

22.63129568399525

In [9]:
y_pred = ridge_reg.predict(X_train)
print("RMSE: "+str("%.5f"%log_rmse(y_train, y_pred)))

RMSE: 0.12754


In [10]:
file = open("../models/ridge_regression.pkl", "wb")
file.write(pickle.dumps(ridge_reg))
file.close()

In [26]:
elastic_net_reg = ElasticNetCV(l1_ratio=np.linspace(0.0, 1.0, 50), eps=0.0001, n_alphas=100, alphas=np.geomspace(1e-4, 1e4, 50), 
                               fit_intercept=True, normalize=False, precompute="auto", max_iter=5000, 
                               tol=0.00001, cv=4, copy_X=True, verbose=0, n_jobs=1, positive=False, 
                               random_state=None, selection="cyclic")
elastic_net_reg.fit(X_train, y_train)

ElasticNetCV(alphas=array([1.00000e-04, 1.45635e-04, 2.12095e-04, 3.08884e-04, 4.49843e-04,
       6.55129e-04, 9.54095e-04, 1.38950e-03, 2.02359e-03, 2.94705e-03,
       4.29193e-03, 6.25055e-03, 9.10298e-03, 1.32571e-02, 1.93070e-02,
       2.81177e-02, 4.09492e-02, 5.96362e-02, 8.68511e-02, 1.26486e-01,
     ..., 1.04811e+03, 1.52642e+03,
       2.22300e+03, 3.23746e+03, 4.71487e+03, 6.86649e+03, 1.00000e+04]),
       copy_X=True, cv=4, eps=0.0001, fit_intercept=True,
       l1_ratio=array([0.     , 0.02041, 0.04082, 0.06122, 0.08163, 0.10204, 0.12245,
       0.14286, 0.16327, 0.18367, 0.20408, 0.22449, 0.2449 , 0.26531,
       0.28571, 0.30612, 0.32653, 0.34694, 0.36735, 0.38776, 0.40816,
       0.42857, 0.44898, 0.46939, 0.4898 , 0.5102 , 0.53061, 0.55102,
       0.57...633, 0.83673,
       0.85714, 0.87755, 0.89796, 0.91837, 0.93878, 0.95918, 0.97959,
       1.     ]),
       max_iter=5000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_

In [27]:
elastic_net_reg.alpha_

159.98587196060572

In [29]:
elastic_net_reg.l1_ratio_

1.0

In [28]:
y_pred = elastic_net_reg.predict(X_train)
print("RMSE: "+str("%.5f"%log_rmse(y_train, y_pred)))

RMSE: 0.13842


In [30]:
elastic_net_reg = ElasticNetCV(l1_ratio=np.linspace(0.95, 1.0, 6), eps=0.0001, n_alphas=100, alphas=np.geomspace(75.0, 275.0, 50), 
                               fit_intercept=True, normalize=False, precompute="auto", max_iter=5000, 
                               tol=0.00001, cv=4, copy_X=True, verbose=0, n_jobs=1, positive=False, 
                               random_state=None, selection="cyclic")
elastic_net_reg.fit(X_train, y_train)

ElasticNetCV(alphas=array([ 75.     ,  77.0153 ,  79.08475,  81.20981,  83.39197,  85.63277,
        87.93378,  90.29661,  92.72294,  95.21447,  97.77295, 100.40017,
       103.09799, 105.8683 , 108.71305, 111.63424, 114.63392, 117.71421,
       120.87727, 124.12532, 127.46065, 130.8856 , 134.40258, 138.01407,
 ...     228.41388, 234.55151, 240.85406, 247.32597, 253.97178, 260.79617,
       267.80393, 275.     ]),
       copy_X=True, cv=4, eps=0.0001, fit_intercept=True,
       l1_ratio=array([0.95, 0.96, 0.97, 0.98, 0.99, 1.  ]), max_iter=5000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic', tol=1e-05,
       verbose=0)

In [31]:
elastic_net_reg.alpha_

170.62761141922633

In [32]:
elastic_net_reg.l1_ratio_

1.0

In [33]:
y_pred = elastic_net_reg.predict(X_train)
print("RMSE: "+str("%.5f"%log_rmse(y_train, y_pred)))

RMSE: 0.14145


In [34]:
elastic_net_reg = ElasticNetCV(l1_ratio=1.0, eps=0.0001, n_alphas=100, alphas=np.geomspace(75.0, 275.0, 500), 
                               fit_intercept=True, normalize=False, precompute="auto", max_iter=5000, 
                               tol=0.00001, cv=4, copy_X=True, verbose=0, n_jobs=1, positive=False, 
                               random_state=None, selection="cyclic")
elastic_net_reg.fit(X_train, y_train)

ElasticNetCV(alphas=array([ 75.     ,  75.19554, ..., 274.28489, 275.     ]),
       copy_X=True, cv=4, eps=0.0001, fit_intercept=True, l1_ratio=1.0,
       max_iter=5000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=1e-05, verbose=0)

In [35]:
elastic_net_reg.alpha_

168.99437837340759

In [36]:
y_pred = elastic_net_reg.predict(X_train)
print("RMSE: "+str("%.5f"%log_rmse(y_train, y_pred)))

RMSE: 0.14105


In [37]:
file = open("../models/elastic_net_regression.pkl", "wb")
file.write(pickle.dumps(elastic_net_reg))
file.close()

In [38]:
theil_sen_reg = TheilSenRegressor(fit_intercept=True, copy_X=True, max_subpopulation=10000.0, n_subsamples=None, 
                                  max_iter=1000, tol=0.00001, random_state=None, n_jobs=None, verbose=False)
theil_sen_reg.fit(X_train, y_train)

TheilSenRegressor(copy_X=True, fit_intercept=True, max_iter=1000,
         max_subpopulation=10000, n_jobs=None, n_subsamples=None,
         random_state=None, tol=1e-05, verbose=False)

In [39]:
y_pred = theil_sen_reg.predict(X_train)
print("RMSE: "+str("%.5f"%log_rmse(y_train, y_pred)))

RMSE: 0.12845


In [40]:
file = open("../models/theil_sen_regression.pkl", "wb")
file.write(pickle.dumps(theil_sen_reg))
file.close()