In [13]:
# Loading the packages to be used
from __future__ import print_function  # Python 2 and 3
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy import stats
import math
%matplotlib inline

In [14]:
# Import data; for linear regression, either import encoded datasets or dummify and then oneHotEncode in python
houses_train = pd.read_csv('../Data/Xencoded_houses_train.csv')
houses_test = pd.read_csv('../Data/Xencoded_houses_test.csv')
private_train = pd.read_csv('../Data/Xencoded_private_train.csv')
private_test = pd.read_csv('../Data/Xencoded_private_test.csv')

In [15]:
private_train.shape

(1168, 207)

In [19]:
# Define data frame of predictors and BoxCox response variable
X_housetrain_linear = houses_train.drop('SalePrice', axis = 1)
X_privtrain_linear = private_train.drop('SalePrice', axis = 1)
X_privtest_linear = private_test.drop('SalePrice', axis = 1)
#X_privtrain_linear = X_privtrain_linear.values

private_trainBC, private_lambda = stats.boxcox(private_train[['SalePrice']]) # T indicates transformation; ndarray output
houses_trainBC, houses_lambda = stats.boxcox(houses_train[['SalePrice']]) # T indicates transformation; ndarray output

In [20]:
# Turn ndarray of BoxCoxed response variable into a data frame
y_housetrain_linear = pd.DataFrame(houses_trainBC)
y_privtrain_linear = pd.DataFrame(private_trainBC)

y_housetrain_linear.columns = ['SalePrice_BC']
y_privtrain_linear.columns = ['SalePrice_BC']

private_test[['SalePrice']]= np.log(private_test[['SalePrice']].values)

y_privtest_linear = pd.DataFrame(private_test[['SalePrice']])


In [21]:
#cross validate elastic net regularization for parameters alpha
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import ElasticNetCV


y_privtrain_linear=np.ravel(y_privtrain_linear)

alphas = np.linspace(0.01,1,20)
l1_params= np.linspace(0.01,1,20)

ENCV= ElasticNetCV(alphas= alphas,l1_ratio= l1_params, n_alphas= 1000,cv=5, random_state=0)

ENCV.fit(X_privtrain_linear, y_privtrain_linear)

predictions_train=ENCV.predict(X_privtest_linear)


#transform predictions boxcox to real values
predictions_transformed= np.log(np.power((predictions_train * private_lambda) + 1, 1 / private_lambda))


#rmse for test data
rmse_test=math.sqrt(mse(predictions_transformed, y_privtest_linear))


#optimum alpha value and l1 ratio value
print('rmse_test alpha\t l1_ratio score')
print("{:0.5f}\t {:0.4f}\t {:0.4f}\t {:0.4f}".format(rmse_test, ENCV.alpha_, ENCV.l1_ratio_, ENCV.score(X_privtrain_linear, y_privtrain_linear)))




rmse_test alpha	 l1_ratio score
9.54522	 0.1663	 0.0100	 0.9048


In [38]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.cross_validation import KFold

t_rmse = np.array([])
cv_rmse = np.array([])
y_privtrain_linear=np.ravel(y_privtrain_linear)

alphas = np.linspace(0.01,1,30)



print('model\t alpha\t RMSE_train\t RMSE_5cv \t R square\n')

for a in alphas:
    models=[('lasso', Lasso( fit_intercept=True, alpha=a)),
            ('ridge', Ridge(fit_intercept=True, alpha=a)),
            ('el_net', ElasticNet(fit_intercept=True, alpha=a))]
    
    
    print('\n')
    for name, method in models:
              
        
        method.fit(X_privtrain_linear, y_privtrain_linear)
        
        score= method.score(X_privtrain_linear, y_privtrain_linear)
    
        # computing the RMSE on training data
        
        p = method.predict(X_privtrain_linear)
        err = p- y_privtrain_linear
        total_error_squared = np.dot(err,err)
        rmse_train = np.sqrt(total_error_squared/len(p))

        # computing RMSE using 5-fold cross validation
        kf = KFold(len(X_privtrain_linear), n_folds=5, random_state=0)

        xval_err_squared = 0
        for train, test in kf:
            method.fit(X_privtrain_linear, y_privtrain_linear)
            p = method.predict(X_privtest_linear)
            err = p - y_privtest_linear['SalePrice']
            xval_err_squared += np.dot(err,err)
        rmse_5cv = np.sqrt(xval_err_squared/len(X_privtest_linear))

        t_rmse = np.append(t_rmse, [rmse_train])
        cv_rmse = np.append(cv_rmse, [rmse_5cv])
        
        
        print('{}\t {:0.3f}\t {:.5f}\t {:.6f}\t{:.5f}'.format(name,a,rmse_train,rmse_5cv, score))

        #print('%s \t %0.3f \t %.5f \t %.6f \t %.5f'%(name,a,rmse_train,rmse_5cv, score))



model	 alpha	 RMSE_train	 RMSE_5cv 	 R square



lasso	 0.010	 0.06834	 13.302033	0.87104
ridge	 0.010	 0.05154	 13.302740	0.92665
el_net	 0.010	 0.06262	 13.303979	0.89171


lasso	 0.044	 0.09543	 13.295552	0.74851
ridge	 0.044	 0.05176	 13.302353	0.92601
el_net	 0.044	 0.07895	 13.298230	0.82787


lasso	 0.078	 0.12569	 13.292953	0.56379
ridge	 0.078	 0.05181	 13.302254	0.92589
el_net	 0.078	 0.09175	 13.296357	0.76756


lasso	 0.112	 0.15478	 13.293443	0.33847
ridge	 0.112	 0.05183	 13.302208	0.92582
el_net	 0.112	 0.10641	 13.294423	0.68733


lasso	 0.147	 0.18096	 13.294091	0.09576
ridge	 0.147	 0.05185	 13.302185	0.92576
el_net	 0.147	 0.12252	 13.293097	0.58551


lasso	 0.181	 0.19030	 13.294403	0.00000
ridge	 0.181	 0.05187	 13.302173	0.92572
el_net	 0.181	 0.13835	 13.292629	0.47145


lasso	 0.215	 0.19030	 13.294403	0.00000
ridge	 0.215	 0.05188	 13.302168	0.92568
el_net	 0.215	 0.15337	 13.292778	0.35051


lasso	 0.249	 0.19030	 13.294403	0.00000
ridge	 0.249	 0.05190	 13.30

In [40]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.cross_validation import KFold

values= np.array([])


y_privtrain_linear=np.ravel(y_privtrain_linear)

alphas = np.linspace(0.01,1,30)

l1_params= np.linspace(0.01,1,20)

print('model\t l1\t alpha\t RMSE_train\t RMSE_5cv \t R square\n')
print('\n')
for l1 in l1_params:  
    
                      
    for a in alphas:
        EN= ElasticNet(fit_intercept= True, alpha=a, l1_ratio= l1, random_state=0)
        
        EN.fit(X_privtrain_linear, y_privtrain_linear)
        
        score= EN.score(X_privtrain_linear, y_privtrain_linear)
    
        # computing the RMSE on training data
        
        p = EN.predict(X_privtrain_linear)
        err = p- y_privtrain_linear
        total_error_squared = np.dot(err,err)
        rmse_train = np.sqrt(total_error_squared/len(p))

        # computing RMSE using 5-fold cross validation
        kf = KFold(len(X_privtrain_linear), n_folds=5, random_state=0)

        xval_err_squared = 0
        for train, test in kf:
            EN.fit(X_privtrain_linear, y_privtrain_linear)
            p = EN.predict(X_privtest_linear)
            err = p - y_privtest_linear['SalePrice']
            xval_err_squared += np.dot(err,err)
        rmse_5cv = np.sqrt(xval_err_squared/len(X_privtest_linear))

        values = np.append(values,[l1,a,rmse_train, rmse_5cv, score])
        
        
        print('{:0.3f}\t {:0.3f}\t {:.5f}\t {:.6f}\t{:.5f}'.format(l1,a,rmse_train,rmse_5cv, score))

        #print('%s \t %0.3f \t %.5f \t %.6f \t %.5f'%(name,a,rmse_train,rmse_5cv, score))

model	 l1	 alpha	 RMSE_train	 RMSE_5cv 	 R square



0.010	 0.010	 0.05318	 13.302090	0.92192
0.010	 0.044	 0.05468	 13.301741	0.91745
0.010	 0.078	 0.05590	 13.301833	0.91370
0.010	 0.112	 0.05708	 13.302237	0.91002
0.010	 0.147	 0.05818	 13.302429	0.90655
0.010	 0.181	 0.05909	 13.302505	0.90358
0.010	 0.215	 0.05990	 13.302370	0.90093
0.010	 0.249	 0.06069	 13.302196	0.89829
0.010	 0.283	 0.06144	 13.301976	0.89576
0.010	 0.317	 0.06212	 13.301954	0.89343
0.010	 0.351	 0.06281	 13.301934	0.89106
0.010	 0.386	 0.06350	 13.301911	0.88867
0.010	 0.420	 0.06418	 13.301864	0.88626
0.010	 0.454	 0.06486	 13.301747	0.88385
0.010	 0.488	 0.06551	 13.301596	0.88149
0.010	 0.522	 0.06618	 13.301449	0.87907
0.010	 0.556	 0.06683	 13.301336	0.87668
0.010	 0.590	 0.06748	 13.301227	0.87427
0.010	 0.624	 0.06813	 13.301120	0.87182
0.010	 0.659	 0.06879	 13.301013	0.86933
0.010	 0.693	 0.06944	 13.300861	0.86685
0.010	 0.727	 0.07008	 13.300666	0.86438
0.010	 0.761	 0.07072	 13.300479	0.86189
0.01

0.323	 0.693	 0.19030	 13.294403	0.00000
0.323	 0.727	 0.19030	 13.294403	0.00000
0.323	 0.761	 0.19030	 13.294403	0.00000
0.323	 0.795	 0.19030	 13.294403	0.00000
0.323	 0.829	 0.19030	 13.294403	0.00000
0.323	 0.863	 0.19030	 13.294403	0.00000
0.323	 0.898	 0.19030	 13.294403	0.00000
0.323	 0.932	 0.19030	 13.294403	0.00000
0.323	 0.966	 0.19030	 13.294403	0.00000
0.323	 1.000	 0.19030	 13.294403	0.00000
0.375	 0.010	 0.06084	 13.303522	0.89778
0.375	 0.044	 0.07486	 13.298927	0.84525
0.375	 0.078	 0.08405	 13.297708	0.80494
0.375	 0.112	 0.09451	 13.296076	0.75336
0.375	 0.147	 0.10581	 13.294672	0.69088
0.375	 0.181	 0.11784	 13.293531	0.61657
0.375	 0.215	 0.13012	 13.292912	0.53251
0.375	 0.249	 0.14215	 13.292629	0.44200
0.375	 0.283	 0.15371	 13.292491	0.34760
0.375	 0.317	 0.16388	 13.293433	0.25837
0.375	 0.351	 0.17289	 13.293854	0.17465
0.375	 0.386	 0.18139	 13.294105	0.09146
0.375	 0.420	 0.18982	 13.294386	0.00511
0.375	 0.454	 0.19030	 13.294403	0.00000
0.375	 0.488	 0.

0.687	 0.454	 0.19030	 13.294403	0.00000
0.687	 0.488	 0.19030	 13.294403	0.00000
0.687	 0.522	 0.19030	 13.294403	0.00000
0.687	 0.556	 0.19030	 13.294403	0.00000
0.687	 0.590	 0.19030	 13.294403	0.00000
0.687	 0.624	 0.19030	 13.294403	0.00000
0.687	 0.659	 0.19030	 13.294403	0.00000
0.687	 0.693	 0.19030	 13.294403	0.00000
0.687	 0.727	 0.19030	 13.294403	0.00000
0.687	 0.761	 0.19030	 13.294403	0.00000
0.687	 0.795	 0.19030	 13.294403	0.00000
0.687	 0.829	 0.19030	 13.294403	0.00000
0.687	 0.863	 0.19030	 13.294403	0.00000
0.687	 0.898	 0.19030	 13.294403	0.00000
0.687	 0.932	 0.19030	 13.294403	0.00000
0.687	 0.966	 0.19030	 13.294403	0.00000
0.687	 1.000	 0.19030	 13.294403	0.00000
0.739	 0.010	 0.06560	 13.303225	0.88119
0.739	 0.044	 0.08642	 13.297313	0.79376
0.739	 0.078	 0.10746	 13.294115	0.68114
0.739	 0.112	 0.13067	 13.292802	0.52853
0.739	 0.147	 0.15272	 13.293161	0.35598
0.739	 0.181	 0.17157	 13.293818	0.18723
0.739	 0.215	 0.19030	 13.294403	0.00000
0.739	 0.249	 0.

In [41]:
df_EN=pd.DataFrame(values.reshape(600,5), columns= ["l1", "alpha", "rmse_train", "rmse_5cv", "score"])

df_EN[df_EN["rmse_5cv"]==min(df_EN["rmse_5cv"])]

Unnamed: 0,l1,alpha,rmse_train,rmse_5cv,score
483,0.843684,0.112414,0.141233,13.292384,0.449213


In [None]:
#best parameters l1=0.01 alpha= 0.18069

In [42]:
X_privtrain_linear.shape

(1168, 206)

In [43]:
y_privtrain_linear.shape

(1168,)

In [44]:
p.shape

(292,)

In [45]:
private_test[['SalePrice']].shape

(292, 1)

In [48]:
EN_tuned= ElasticNet(fit_intercept= True, alpha=0.18069, l1_ratio= 0.01, random_state=0)
        
EN_tuned.fit(X_privtrain_linear, y_privtrain_linear)
        
score= EN_tuned.score(X_privtrain_linear, y_privtrain_linear)
    
# computing the RMSE on training data
        
p = EN_tuned.predict(X_privtest_linear)
p_transformed=np.log(np.power((p * private_lambda) + 1, 1 / private_lambda))
#rmse_test=math.sqrt(mse(p_transformed,private_test[['SalePrice']]))

print(rmse_test,score)

9.545218250257365 0.903583006291


In [50]:
ridge_tuned= Ridge(fit_intercept= True, alpha=1, random_state=0)
        
ridge_tuned.fit(X_privtrain_linear, y_privtrain_linear)
        
score= ridge_tuned.score(X_privtrain_linear, y_privtrain_linear)
    
# computing the RMSE on training data
        
p = ridge_tuned.predict(X_privtest_linear)
p_transformed=np.log(np.power((p * private_lambda) + 1, 1 / private_lambda))
rmse_test=math.sqrt(mse(p_transformed,y_privtest_linear['SalePrice']))

print(rmse_test, score)

9.54563927160587 0.925001151379


In [51]:
lasso_tuned= Lasso(fit_intercept= True, alpha=0.01, random_state=0)
        
lasso_tuned.fit(X_privtrain_linear, y_privtrain_linear)
        
score= lasso_tuned.score(X_privtrain_linear, y_privtrain_linear)
    
# computing the RMSE on training data
        
p = lasso_tuned.predict(X_privtest_linear)
p_transformed=np.log(np.power((p * private_lambda) + 1, 1 / private_lambda))
rmse_test=math.sqrt(mse(p_transformed,y_privtest_linear['SalePrice']))

print(rmse_test, score)

9.544346952943346 0.871041712481
