In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import random

import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr

%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

## Read the Cleaned Data 
First we load the data we cleaned in the previous part.

In [2]:
X_train_complete = pd.read_csv("TrainClean.csv")
X_test = pd.read_csv("TestClean.csv")
X_train_complete.head()

Unnamed: 0.1,Unnamed: 0,IsHoliday,HasPromotions,NearestCompetitor,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,Max_Dew_PointC,Max_Humidity,...,Fog,Wind0,Wind1,Wind2,Wind3,Wind4,Wind5,Wind6,Wind7,NumberOfSales
0,0,0,0,326,9643,17130,2770,1000.0,1,100.0,...,0,1,0,0,0,0,0,0,0,2.266354
1,1,0,0,326,9643,17130,2770,1000.0,0,87.0,...,0,0,1,0,0,0,0,0,0,2.302695
2,2,0,0,326,9643,17130,2770,1000.0,0,81.0,...,0,1,0,0,0,0,0,0,0,2.304995
3,3,0,0,326,9643,17130,2770,1000.0,-3,80.0,...,0,0,0,1,0,0,0,0,0,2.290064
4,5,0,1,326,9643,17130,2770,1000.0,-2,93.0,...,0,0,1,0,0,0,0,0,0,2.324482


In [3]:
X_train = X_train_complete.loc[:,'IsHoliday':'Wind7']
y = X_train_complete['NumberOfSales']

In [4]:
X_train

Unnamed: 0,IsHoliday,HasPromotions,NearestCompetitor,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,Max_Dew_PointC,Max_Humidity,Max_Sea_Level_PressurehPa,...,Rain,Fog,Wind0,Wind1,Wind2,Wind3,Wind4,Wind5,Wind6,Wind7
0,0,0,326,9643,17130,2770,1000.0,1,100.0,1032,...,1,0,1,0,0,0,0,0,0,0
1,0,0,326,9643,17130,2770,1000.0,0,87.0,1030,...,0,0,0,1,0,0,0,0,0,0
2,0,0,326,9643,17130,2770,1000.0,0,81.0,1026,...,1,0,1,0,0,0,0,0,0,0
3,0,0,326,9643,17130,2770,1000.0,-3,80.0,1027,...,0,0,0,0,1,0,0,0,0,0
4,0,1,326,9643,17130,2770,1000.0,-2,93.0,1024,...,0,0,0,1,0,0,0,0,0,0
5,0,1,326,9643,17130,2770,1000.0,-1,87.0,1017,...,0,0,0,1,0,0,0,0,0,0
6,0,1,326,9643,17130,2770,1000.0,2,93.0,1009,...,0,0,0,1,0,0,0,0,0,0
7,0,1,326,9643,17130,2770,1000.0,8,87.0,1002,...,1,0,0,1,0,0,0,0,0,0
8,0,1,326,9643,17130,2770,1000.0,8,93.0,1002,...,1,0,1,0,0,0,0,0,0,0
9,0,0,326,9643,17130,2770,1000.0,10,100.0,1002,...,1,0,0,0,0,0,1,0,0,0


## Linear Regression Models
Now we are going to use plain linear regressio and regularized models (both l_1 Lasso and l_2 Ridge) from the scikit learn module. I'll also define a function that returns the cross-validation rmse error so we can evaluate our models and pick the best tuning par

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, ElasticNet, Lasso, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


def r2_cv(model, X_train, y, random_state):
    r2= cross_val_score(model, X_train, y, scoring="r2", cv =KFold(10, shuffle=True, random_state=random_state))
    return(r2)

def rmse_cv(model, X_train, y, random_state):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv =KFold(10, shuffle=True, random_state=random_state)))
    return(rmse)

In [6]:
model_simple = LinearRegression()
model_simple.fit(X_train, y)
yp = model_simple.predict(X_train)

# compute random_state, the same for both R2 and RMSE
random_state = random.randrange(99999999)

# compute R2 for train and using crossvalidation
r2_simple_train = r2_score(y,yp)
r2_xval_simple =  r2_cv(model_simple, X_train, y, random_state)

# compute RMSE for train and using crossvalidation
rmse_simple_train = mean_squared_error(y,yp,multioutput='raw_values')
rmse_xval_simple =  rmse_cv(model_simple, X_train, y, random_state)

print("Linear Regression")
print("==================================================")
print("\t                  Train R2=%.3f"%(r2_simple_train))
print("\t10-fold Crossvalidation R2=%.3f"%(r2_xval_simple.mean()))
print("\t                  Train RMSE=%.3f"%(rmse_simple_train))
print("\t10-fold Crossvalidation RMSE=%.3f"%(rmse_xval_simple.mean()))

Linear Regression
	                  Train R2=0.232
	10-fold Crossvalidation R2=0.232
	                  Train RMSE=0.002
	10-fold Crossvalidation RMSE=0.044
