In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data(data_path="/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/Ames_data.csv",
              test_ids_path="/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/project1_testIDs.dat"):
    """Read the housing data set and the table of test-row ids.

    Parameters
    ----------
    data_path : str
        CSV file with the full data set.  The default is the location the
        notebook was originally run from; pass another path to run elsewhere.
    test_ids_path : str
        Space-separated file with ten columns of row ids.
        ``created_train_test`` uses one column per train/test split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, testIds)`` in that order.
    """
    # Passing names= makes pandas treat the first line of the ids file as data
    # and label its ten columns with these names.
    columns = ["Col1","Col 2","Col 3","Col 4","Col 5","Col 6", "Col 7", "Col 8","Col 9","Col 10"]
    data = pd.read_csv(data_path)
    testIds = pd.read_csv(test_ids_path, sep=" ", names=columns)

    return data, testIds

def created_train_test(data,testIds,j):
    """Split ``data`` into train and test frames using column ``j`` of ``testIds``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set.
    testIds : pandas.DataFrame or 2-D array-like
        Table of row ids; each column describes one split.
    j : int
        1-based number of the ``testIds`` column to use.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``.  ``test`` holds the rows at the listed positions,
        ``train`` holds every other row in its original order.
    """
    # NOTE(review): the ids are used as 0-based row positions; confirm that
    # the ids file is not 1-based.
    test_positions = np.asarray(testIds)[:, j - 1]
    test = data.iloc[test_positions]

    # Remove the same rows by position as well.  Dropping by label only
    # matches the positional selection when the index is 0..n-1.
    keep = np.ones(len(data), dtype=bool)
    keep[test_positions] = False
    train = data.iloc[keep]
    return train,test

def onehot_encoding(x_train,x_test):
    """Integer-encode the categorical columns of the train and test features.

    Despite the name, no one-hot (dummy) expansion is performed: every
    object-dtype column is replaced by the integer codes of a
    ``LabelEncoder``.  Train and test are stacked first so a category gets
    the same code in both frames, then split apart again at the original
    train length.  The column ``Garage_Yr_Blt`` is removed from both outputs.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames with identical columns, including ``Garage_Yr_Blt``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Encoded ``(x_train, x_test)``.
    """
    # Number of training records, used to split the stacked frame back apart.
    train_num = x_train.shape[0]

    # Merge train and test (train rows first).
    df_train_test = pd.concat([x_train, x_test])

    # Take the categorical columns from the frame passed in, not from a
    # notebook-level variable, so the function works on its own arguments.
    categorical_cols = x_train.columns[x_train.dtypes == 'object']
    for col_name in categorical_cols:
        # A fresh encoder per column; fit_transform refits on each call anyway.
        df_train_test[col_name] = LabelEncoder().fit_transform(df_train_test[col_name])

    # Drop the column "Garage_Yr_Blt" for now, as it has NaN.
    df_train_test = df_train_test.drop(['Garage_Yr_Blt'],axis=1)
    x_train = df_train_test.iloc[0:train_num]
    x_test = df_train_test.iloc[train_num:]

    return x_train,x_test


def winsorization(x_train,x_test,winsorization_cols=None,quan_val=0.95):
    """Cap the upper tail of selected numeric columns at a training quantile.

    For each listed column the ``quan_val`` quantile is computed on
    ``x_train`` only, and every value above it, in both ``x_train`` and
    ``x_test``, is replaced by that quantile.  The inputs are not modified;
    capped copies are returned.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames that contain every column in ``winsorization_cols``.
    winsorization_cols : list of str, optional
        Columns to cap.  Defaults to the area / square-footage columns
        listed below.
    quan_val : float, optional
        Quantile in [0, 1] used as the cap (default 0.95).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Capped copies ``(x_train, x_test)``.
    """
    if winsorization_cols is None:
        winsorization_cols = ['Lot_Frontage', 'Lot_Area', 'Mas_Vnr_Area', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
                              'Total_Bsmt_SF', 'Second_Flr_SF', 'First_Flr_SF', 'Gr_Liv_Area', 'Garage_Area',
                              'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch',
                              'Screen_Porch', 'Misc_Val']

    # Work on copies of the arguments instead of notebook-level frames, so the
    # caller's data is left untouched and the result does not depend on globals.
    x_train = x_train.copy()
    x_test = x_test.copy()

    for winso_columns in winsorization_cols:
        # The cap comes from the training data only and is reused for the test data.
        col_quant_value = np.quantile(x_train[winso_columns],quan_val)
        # mask() replaces only values strictly above the cap.  A NaN cap makes
        # the comparison false everywhere, so the column is left unchanged.
        x_train[winso_columns] = x_train[winso_columns].mask(
            x_train[winso_columns] > col_quant_value, col_quant_value)
        x_test[winso_columns] = x_test[winso_columns].mask(
            x_test[winso_columns] > col_quant_value, col_quant_value)

    return x_train,x_test
        
def lasso_model(x_train,y_train,x_test,y_test,alpha=0.5):
    """Fit a Lasso regression and predict the test features.

    The estimator is trained on ``(x_train, y_train)`` with penalty
    ``alpha``.  ``y_test`` is accepted but not used.

    Returns
    -------
    tuple
        ``(fitted_estimator, predictions_for_x_test)``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    fitted = Lasso(alpha=alpha).fit(x_train, y_train)
    return fitted, fitted.predict(x_test)

In [3]:
# Build the modelling matrices: load the data, make the first train/test
# split, take the log of the target, encode and cap the features, then drop
# the columns that are not used for modelling.
data, testIds = load_data()
# j=1 selects the first column of testIds as the list of test rows.
train,test = created_train_test(data,testIds,j=1)

# Print the shape (rows, columns) of each split
print ("Number of elements in the Training Set: {}".format(train.shape))
print ("Number of elements in the Test Set: {}".format(test.shape))

# Write both splits to train.csv / test.csv (row index not written)
train.to_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/train.csv",index=False)
test.to_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/test.csv",index=False)

# The target is the natural log of Sale_Price; the features are every other column.
y_train = np.log(train['Sale_Price'])
x_train = train.drop(['Sale_Price'],axis=1)
y_test = np.log(test['Sale_Price'])
x_test = test.drop(['Sale_Price'],axis=1)

# Encode the categorical columns (this also drops Garage_Yr_Blt), then cap
# the upper tail of the columns listed inside winsorization().
x_train_transformed,x_test_transformed = onehot_encoding(x_train,x_test)
x_train_winsor,x_test_winsor = winsorization(x_train_transformed,x_test_transformed)

# Print the shape of the encoded feature frames
print ("Number of elements in the Train Transformed Set: {}".format(x_train_transformed.shape))
print ("Number of elements in the Test Transformed Set: {}".format(x_test_transformed.shape))

# Columns excluded from the model matrices; the same list is removed from
# train and test so both keep identical columns.
drop_columns = ['PID','Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 
                'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude',
                'Latitude'
                ]
x_train_final = x_train_winsor.drop(drop_columns,axis = 1)
x_test_final = x_test_winsor.drop(drop_columns,axis = 1)


Number of elements in the Training Set: (2051, 83)
Number of elements in the Test Set: (879, 83)
Number of elements in the Train Transformed Set: (2051, 81)
Number of elements in the Test Transformed Set: (879, 81)


In [4]:
#------------------------------------------------------------------------
#
# Shrinking Methods (Lasso)
#
#------------------------------------------------------------------------
# For each candidate penalty in alpha_, run split_-fold cross-validation on
# the training features.  Every fold model predicts (a) its held-out fold and
# (b) the whole external test set; the per-penalty test prediction is the
# mean over the fold models.
#
# Meaning of the printed labels:
#   "Fold: k - Error Test" : RMSE on held-out fold k (a validation error)
#   "Error Train"          : RMSE of the pooled out-of-fold predictions
#   "Error Test :"         : RMSE of the fold-averaged predictions on x_test_final
# All errors are on the scale of y_train / y_test.

split_ = 10
lamda_count_ = 0
# NOTE(review): alpha=0 makes Lasso an unpenalised least-squares fit, which
# scikit-learn advises fitting with LinearRegression instead.
alpha_ = np.array([0,1,0.00001,0.0001,0.0002,0.0003])
# Alternative, wider grid:
#alpha_ = np.array([1,0.00001,0.0001,0.0002,0.0003,0.001,0.002,0.003,0.01,0.02])

# KFold is used without shuffling, so the folds are consecutive row blocks and
# the out-of-fold values appended below end up in the same row order as y_train.
kf = KFold(n_splits=split_)

# One column per penalty value.
y_train_actual_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_train_predict_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_test_predict_lamda = np.zeros((x_test_final.shape[0],alpha_.shape[0]))

for lamda in alpha_:
    print ("Processing for lamda:{}".format(lamda))

    fold_ = 0
    # One column of external-test predictions per fold model.
    y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

    y_train_predict_array = np.array(())
    y_train_actual_array = np.array(())
    for (train_idx,test_idx) in kf.split(x_train_final,y_train):
        # Note: this rebinds the notebook-level name lasso_model, replacing the
        # helper function of the same name defined earlier in the notebook.
        lasso_model = Lasso(alpha=lamda)
        lasso_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = lasso_model.predict(x_train_final.iloc[test_idx])
        y_test_predict  = lasso_model.predict(x_test_final)

        y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
        y_train_predict_array = np.append(y_train_predict_array,y_train_predict)

        y_test_predict_kfold[:,fold_] =  y_test_predict

        # RMSE is computed as sqrt(MSE): the `squared=False` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        fold_rmse = np.sqrt(mean_squared_error(y_train.iloc[test_idx], y_train_predict))
        print ("Lamda: {} - Fold: {} - Error Test: {:.3f}".format(lamda,fold_+1,fold_rmse))

        fold_ = fold_ + 1

    y_train_actual_lamda[:,lamda_count_] = y_train_actual_array
    y_train_predict_lamda[:,lamda_count_] = y_train_predict_array

    # Average the fold models' predictions on the external test set.
    y_test_predict_lamda[:,lamda_count_] = np.mean(y_test_predict_kfold,axis=1)

    cv_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict_lamda[:,lamda_count_]))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predict_lamda[:,lamda_count_]))
    print ("Lamda: {} - Error Train: {:.3f}".format(lamda,cv_rmse))
    print ("Lamda: {} - Error Test : {:.3f}".format(lamda,test_rmse))

    lamda_count_ = lamda_count_ + 1
    

Processing for lamda:0.0
Lamda: 0.0 - Fold: 1 - Error Test: 0.184
Lamda: 0.0 - Fold: 2 - Error Test: 0.136
Lamda: 0.0 - Fold: 3 - Error Test: 0.163
Lamda: 0.0 - Fold: 4 - Error Test: 0.114
Lamda: 0.0 - Fold: 5 - Error Test: 0.142
Lamda: 0.0 - Fold: 6 - Error Test: 0.158
Lamda: 0.0 - Fold: 7 - Error Test: 0.153
Lamda: 0.0 - Fold: 8 - Error Test: 0.151
Lamda: 0.0 - Fold: 9 - Error Test: 0.108
Lamda: 0.0 - Fold: 10 - Error Test: 0.162
Lamda: 0.0 - Error Train: 0.149
Lamda: 0.0 - Error Test : 0.146
Processing for lamda:1.0
Lamda: 1.0 - Fold: 1 - Error Test: 0.218
Lamda: 1.0 - Fold: 2 - Error Test: 0.169
Lamda: 1.0 - Fold: 3 - Error Test: 0.200
Lamda: 1.0 - Fold: 4 - Error Test: 0.139
Lamda: 1.0 - Fold: 5 - Error Test: 0.168
Lamda: 1.0 - Fold: 6 - Error Test: 0.176
Lamda: 1.0 - Fold: 7 - Error Test: 0.180
Lamda: 1.0 - Fold: 8 - Error Test: 0.185
Lamda: 1.0 - Fold: 9 - Error Test: 0.140
Lamda: 1.0 - Fold: 10 - Error Test: 0.195
Lamda: 1.0 - Error Train: 0.179
Lamda: 1.0 - Error Test : 0.180


In [5]:
#------------------------------------------------------------------------
#
# Random Forest Regressor
#
#------------------------------------------------------------------------
# For each candidate max_depth, run split_-fold cross-validation on the
# training features.  Every fold model predicts (a) its held-out fold and
# (b) the whole external test set; the per-depth test prediction is the mean
# over the fold models.
#
# Meaning of the printed labels:
#   "Fold: k - Error Test" : RMSE on held-out fold k (a validation error)
#   "Error Train"          : RMSE of the pooled out-of-fold predictions
#   "Error Test :"         : RMSE of the fold-averaged predictions on x_test_final
split_ = 10
max_depth_count_ = 0
max_depth_ = np.array([16,18,24,28,30,90,120])

# KFold is used without shuffling, so the folds are consecutive row blocks and
# the out-of-fold values appended below end up in the same row order as y_train.
kf = KFold(n_splits=split_)

# One column per max_depth value.
y_train_actual_max_depth = np.zeros((x_train_final.shape[0],max_depth_.shape[0]))
y_train_predict_max_depth = np.zeros((x_train_final.shape[0],max_depth_.shape[0]))
y_test_predict_max_depth = np.zeros((x_test_final.shape[0],max_depth_.shape[0]))

for depth in max_depth_:
    print ("Processing for max_depth:{}".format(depth))

    fold_ = 0
    # One column of external-test predictions per fold model.
    y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

    y_train_predict_array = np.array(())
    y_train_actual_array = np.array(())
    for (train_idx,test_idx) in kf.split(x_train_final,y_train):
        # A fixed random_state keeps the forest reproducible between runs.
        rf_model = RandomForestRegressor(max_depth=depth,random_state=125247)
        rf_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = rf_model.predict(x_train_final.iloc[test_idx])
        y_test_predict  = rf_model.predict(x_test_final)

        y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
        y_train_predict_array = np.append(y_train_predict_array,y_train_predict)

        y_test_predict_kfold[:,fold_] =  y_test_predict

        # RMSE is computed as sqrt(MSE): the `squared=False` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        fold_rmse = np.sqrt(mean_squared_error(y_train.iloc[test_idx], y_train_predict))
        print ("Max_Depth: {} - Fold: {} - Error Test: {:.3f}".format(depth,fold_+1,fold_rmse))

        fold_ = fold_ + 1

    y_train_actual_max_depth[:,max_depth_count_] = y_train_actual_array
    y_train_predict_max_depth[:,max_depth_count_] = y_train_predict_array

    # Average the fold models' predictions on the external test set.
    y_test_predict_max_depth[:,max_depth_count_] = np.mean(y_test_predict_kfold,axis=1)

    cv_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict_max_depth[:,max_depth_count_]))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predict_max_depth[:,max_depth_count_]))
    print ("Max Depth: {} - Error Train: {:.3f}".format(depth,cv_rmse))
    print ("Max Depth: {} - Error Test : {:.3f}".format(depth,test_rmse))

    max_depth_count_ = max_depth_count_ + 1
    

Processing for max_depth:16
Max_Depth: 16 - Fold: 1 - Error Test: 0.191
Max_Depth: 16 - Fold: 2 - Error Test: 0.141
Max_Depth: 16 - Fold: 3 - Error Test: 0.172
Max_Depth: 16 - Fold: 4 - Error Test: 0.108
Max_Depth: 16 - Fold: 5 - Error Test: 0.137
Max_Depth: 16 - Fold: 6 - Error Test: 0.143
Max_Depth: 16 - Fold: 7 - Error Test: 0.152
Max_Depth: 16 - Fold: 8 - Error Test: 0.159
Max_Depth: 16 - Fold: 9 - Error Test: 0.102
Max_Depth: 16 - Fold: 10 - Error Test: 0.157
Max Depth: 16 - Error Train: 0.148
Max Depth: 16 - Error Test : 0.144
Processing for max_depth:18
Max_Depth: 18 - Fold: 1 - Error Test: 0.191
Max_Depth: 18 - Fold: 2 - Error Test: 0.140
Max_Depth: 18 - Fold: 3 - Error Test: 0.170
Max_Depth: 18 - Fold: 4 - Error Test: 0.110
Max_Depth: 18 - Fold: 5 - Error Test: 0.137
Max_Depth: 18 - Fold: 6 - Error Test: 0.143
Max_Depth: 18 - Fold: 7 - Error Test: 0.151
Max_Depth: 18 - Fold: 8 - Error Test: 0.157
Max_Depth: 18 - Fold: 9 - Error Test: 0.102
Max_Depth: 18 - Fold: 10 - Error Test

In [8]:
#------------------------------------------------------------------------
#
# Elastic Nets
#
#------------------------------------------------------------------------
# For each candidate penalty in alpha_, run split_-fold cross-validation on
# the training features.  Every fold model predicts (a) its held-out fold and
# (b) the whole external test set; the per-penalty test prediction is the
# mean over the fold models.  Only alpha is varied; every other ElasticNet
# setting is left at the estimator default.
#
# Meaning of the printed labels:
#   "Fold: k - Error Test" : RMSE on held-out fold k (a validation error)
#   "Error Train"          : RMSE of the pooled out-of-fold predictions
#   "Error Test :"         : RMSE of the fold-averaged predictions on x_test_final

split_ = 10
lamda_count_ = 0
# NOTE(review): alpha=0 removes the penalty entirely, which scikit-learn
# advises fitting with LinearRegression instead.
alpha_ = np.array([0,1,0.2,0.02,0.0002,0.00002,0.000002])
# Alternative grid:
#alpha_ = np.array([1,0.00001,0.0001,0.0002,0.0003,0.001,0.002,0.003,0.01,0.02])

# KFold is used without shuffling, so the folds are consecutive row blocks and
# the out-of-fold values appended below end up in the same row order as y_train.
kf = KFold(n_splits=split_)

# One column per penalty value.
y_train_actual_en_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_train_predict_en_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_test_predict_en_lamda = np.zeros((x_test_final.shape[0],alpha_.shape[0]))

for lamda in alpha_:
    print ("Processing for lamda:{}".format(lamda))

    fold_ = 0
    # One column of external-test predictions per fold model.
    y_test_predict_en_kfold = np.zeros((x_test_final.shape[0],split_))

    y_train_predict_en_array = np.array(())
    y_train_actual_en_array = np.array(())
    for (train_idx,test_idx) in kf.split(x_train_final,y_train):
        en_model = ElasticNet(alpha=lamda)
        en_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = en_model.predict(x_train_final.iloc[test_idx])
        y_test_predict  = en_model.predict(x_test_final)

        y_train_actual_en_array = np.append(y_train_actual_en_array,y_train.iloc[test_idx])
        y_train_predict_en_array = np.append(y_train_predict_en_array,y_train_predict)

        y_test_predict_en_kfold[:,fold_] =  y_test_predict

        # RMSE is computed as sqrt(MSE): the `squared=False` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        fold_rmse = np.sqrt(mean_squared_error(y_train.iloc[test_idx], y_train_predict))
        print ("Lamda: {} - Fold: {} - Error Test: {:.3f}".format(lamda,fold_+1,fold_rmse))

        fold_ = fold_ + 1

    y_train_actual_en_lamda[:,lamda_count_] = y_train_actual_en_array
    y_train_predict_en_lamda[:,lamda_count_] = y_train_predict_en_array

    # Average the fold models' predictions on the external test set.
    y_test_predict_en_lamda[:,lamda_count_] = np.mean(y_test_predict_en_kfold,axis=1)

    cv_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict_en_lamda[:,lamda_count_]))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predict_en_lamda[:,lamda_count_]))
    print ("Lamda: {} - Error Train: {:.3f}".format(lamda,cv_rmse))
    print ("Lamda: {} - Error Test : {:.3f}".format(lamda,test_rmse))

    lamda_count_ = lamda_count_ + 1
    

Processing for lamda:0.0
Lamda: 0.0 - Fold: 1 - Error Test: 0.184
Lamda: 0.0 - Fold: 2 - Error Test: 0.136
Lamda: 0.0 - Fold: 3 - Error Test: 0.163
Lamda: 0.0 - Fold: 4 - Error Test: 0.114
Lamda: 0.0 - Fold: 5 - Error Test: 0.142
Lamda: 0.0 - Fold: 6 - Error Test: 0.158
Lamda: 0.0 - Fold: 7 - Error Test: 0.153
Lamda: 0.0 - Fold: 8 - Error Test: 0.151
Lamda: 0.0 - Fold: 9 - Error Test: 0.108
Lamda: 0.0 - Fold: 10 - Error Test: 0.162
Lamda: 0.0 - Error Train: 0.149
Lamda: 0.0 - Error Test : 0.146
Processing for lamda:1.0
Lamda: 1.0 - Fold: 1 - Error Test: 0.213
Lamda: 1.0 - Fold: 2 - Error Test: 0.160
Lamda: 1.0 - Fold: 3 - Error Test: 0.187
Lamda: 1.0 - Fold: 4 - Error Test: 0.133
Lamda: 1.0 - Fold: 5 - Error Test: 0.163
Lamda: 1.0 - Fold: 6 - Error Test: 0.168
Lamda: 1.0 - Fold: 7 - Error Test: 0.174
Lamda: 1.0 - Fold: 8 - Error Test: 0.179
Lamda: 1.0 - Fold: 9 - Error Test: 0.133
Lamda: 1.0 - Fold: 10 - Error Test: 0.192
Lamda: 1.0 - Error Train: 0.172
Lamda: 1.0 - Error Test : 0.176
