# Training an Elastic Net model on the database

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('final_df.csv')

In [3]:
X = df.drop(['SalePrice'], axis=1) # features dataframe
y = df['SalePrice'] #label

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101) #split data into training and testing

In [6]:
from sklearn.preprocessing import StandardScaler #normalizing the data to make sure that training and test data is not skewed
scaler = StandardScaler() 
scaler.fit(X_train)
scaler.transform(X_train)
scaler.transform(X_test)

array([[ 1.24836191, -0.02221126, -0.22654344, ..., -0.09030979,
         0.25610904, -1.34857677],
       [-1.32323065, -0.02221126, -0.22654344, ..., -0.09030979,
        -1.21109191,  0.93877786],
       [-1.57483801, -0.02221126, -0.22654344, ..., -0.09030979,
        -1.57789214,  1.70122941],
       ...,
       [-0.43728926, -0.02221126, -0.22654344, ..., -0.09030979,
        -1.94469238,  0.17632632],
       [-1.19683635, -0.02221126, -0.22654344, ..., -0.09030979,
        -0.1106912 ,  0.93877786],
       [ 0.60575908, -0.02221126, -0.22654344, ..., -0.09030979,
         1.72330999, -0.58612523]])

In [7]:
from sklearn.linear_model import ElasticNet #importing the ElasticNet model
base_elastic_model = ElasticNet()

In [8]:
param_grid = {'alpha': [0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10, 15, 20, 25, 50, 100], 'l1_ratio': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]} 
#parameter grid for the ElasticNet model, alpha is the regularization parameter
#l1_ratio is the ratio of the L1 regularization to L2 regularization

In [9]:
from sklearn.model_selection import GridSearchCV
#using grid search to find the best parameters for the ElasticNet model

In [10]:
grid_model = GridSearchCV(base_elastic_model, param_grid, cv=10, verbose=1)

In [11]:
grid_model.fit(X_train, y_train)

Fitting 10 folds for each of 130 candidates, totalling 1300 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

GridSearchCV(cv=10, estimator=ElasticNet(),
             param_grid={'alpha': [0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10, 15, 20,
                                   25, 50, 100],
                         'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                      0.9, 1]},
             verbose=1)

In [12]:
pd.DataFrame(grid_model.cv_results_) # results of the grid search

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.478835,0.449205,0.003778,0.000980,0.1,0.1,"{'alpha': 0.1, 'l1_ratio': 0.1}",0.910559,0.910236,0.887830,0.892999,0.894105,0.905901,0.910912,0.913649,0.926691,0.923136,0.907602,0.012075,27
1,0.223175,0.014235,0.003808,0.000794,0.1,0.2,"{'alpha': 0.1, 'l1_ratio': 0.2}",0.911423,0.911253,0.889021,0.893680,0.895224,0.907014,0.912481,0.915065,0.927516,0.923943,0.908662,0.012052,26
2,0.224603,0.015678,0.003533,0.000646,0.1,0.3,"{'alpha': 0.1, 'l1_ratio': 0.3}",0.912387,0.912317,0.890314,0.894379,0.896448,0.908229,0.914215,0.916593,0.928381,0.924806,0.909807,0.012031,23
3,0.214384,0.005086,0.003199,0.000581,0.1,0.4,"{'alpha': 0.1, 'l1_ratio': 0.4}",0.913481,0.913435,0.891732,0.895096,0.897795,0.909568,0.916151,0.918260,0.929295,0.925734,0.911055,0.012016,22
4,0.219218,0.013535,0.003073,0.000538,0.1,0.5,"{'alpha': 0.1, 'l1_ratio': 0.5}",0.914745,0.914612,0.893309,0.895827,0.899295,0.911062,0.918338,0.920098,0.930276,0.926737,0.912430,0.012011,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,0.237810,0.010534,0.003166,0.000313,100,0.6,"{'alpha': 100, 'l1_ratio': 0.6}",0.850782,0.809874,0.798799,0.822251,0.822666,0.815599,0.827690,0.805766,0.851500,0.852257,0.825719,0.018704,124
126,0.236219,0.010822,0.003732,0.000590,100,0.7,"{'alpha': 100, 'l1_ratio': 0.7}",0.851547,0.811201,0.799861,0.822755,0.823524,0.817139,0.827969,0.807053,0.852175,0.853126,0.826635,0.018520,121
127,0.235645,0.014291,0.003703,0.000550,100,0.8,"{'alpha': 100, 'l1_ratio': 0.8}",0.852895,0.813451,0.801731,0.823911,0.825026,0.819603,0.828915,0.809262,0.853532,0.854676,0.828300,0.018254,117
128,0.239075,0.011923,0.003962,0.000645,100,0.9,"{'alpha': 100, 'l1_ratio': 0.9}",0.856423,0.819105,0.806605,0.827534,0.828978,0.825348,0.832522,0.814996,0.857462,0.858799,0.832777,0.017694,103


In [13]:
grid_model.best_estimator_ #values of the best parameters corresponding to the best model

ElasticNet(alpha=20, l1_ratio=1)

In [14]:
y_pred = grid_model.predict(X_test) #predicting the test data

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error #getting the MSE and MAE
mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

23334.222801635566

In [16]:
mae = mean_absolute_error(y_test, y_pred)
mae

15105.40457310991

In [17]:
df['SalePrice'].mean() #mean of the label

180005.08143547273

## the results of using Elastic Net and Grid Search for fitting an estimator to our dataset
    - MAE = 15105.40457310991
    - MSE = 23334.222801635566
    - Mean of the label = 180005.08143547273

In [19]:
from joblib import dump #saving the model
dump(grid_model, 'grid_model.joblib')   
dump(scaler, 'scaler.joblib')

['scaler.joblib']