In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data(data_dir=None):
    """Read the Ames housing data and the table of test-row IDs.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains ``Ames_data.csv`` and
        ``project1_testIDs.dat``. When omitted, the original hard-coded
        project directory is used, so existing ``load_data()`` calls keep
        working unchanged.

    Returns
    -------
    (data, testIds) : tuple of pandas.DataFrame
        ``data`` is the CSV as read by ``pd.read_csv`` with default options.
        ``testIds`` is the space-separated ID file read without a header row
        and labelled with ten column names.
    """
    import os  # local import: keeps this cell self-contained

    if data_dir is None:
        data_dir = "/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data"

    # NOTE(review): "Col1" has no space while the other labels do; kept as-is
    # because callers may already reference these labels.
    columns = ["Col1","Col 2","Col 3","Col 4","Col 5","Col 6", "Col 7", "Col 8","Col 9","Col 10"]
    data = pd.read_csv(os.path.join(data_dir, "Ames_data.csv"))
    testIds = pd.read_csv(os.path.join(data_dir, "project1_testIDs.dat"), sep = " ", names = columns)

    return data, testIds

def created_train_test(data,testIds,j):
    """Split ``data`` into train/test using column ``j`` of the ID table.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset.
    testIds : pandas.DataFrame or 2-D array-like of int
        Each column holds the row positions of one test split.
    j : int
        1-based number of the split (column) to use.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``test`` holds the rows at the listed positions, in the listed order;
        ``train`` holds every remaining row in its original order.

    Notes
    -----
    Both halves are selected by *position*. The earlier version took the test
    rows with ``iloc`` (positional) but removed them with ``drop`` (label
    based), which only agrees when ``data`` has a default 0..n-1 index.
    NOTE(review): the IDs are treated as 0-based positions - confirm the ID
    file is not 1-based.
    """
    test_positions = np.asarray(testIds)[:, j - 1]

    # Boolean mask over row positions; True marks a held-out row.
    is_test = np.zeros(len(data), dtype=bool)
    is_test[test_positions] = True

    test = data.iloc[test_positions]
    train = data.iloc[~is_test]
    return train,test

def feature_engineering(x_train,x_test):
    """Label-encode object-typed columns and drop ``Garage_Yr_Blt``.

    Train and test features are stacked so that every categorical column is
    encoded with one shared mapping, then split back at the original
    train/test boundary.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames with the same columns; both must contain
        ``Garage_Yr_Blt``.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        Transformed frames with the same row counts and row order as the
        inputs and one column fewer.

    Raises
    ------
    KeyError
        If ``Garage_Yr_Blt`` is not a column of the inputs.
    """
    # Remember where the train rows end so the stacked frame can be re-split.
    train_num = x_train.shape[0]
    df_train_test = pd.concat([x_train, x_test])

    # Take the categorical columns from the data passed in. The earlier
    # version read them from a global `train` frame, so the function only
    # worked after a specific notebook cell had been run.
    categorical_cols = df_train_test.columns[df_train_test.dtypes == 'object']
    for col_name in categorical_cols:
        # A fresh encoder per column; each column gets its own integer codes.
        df_train_test[col_name] = LabelEncoder().fit_transform(df_train_test[col_name])

    # Garage_Yr_Blt is dropped for now because it contains NaN values.
    df_train_test = df_train_test.drop(['Garage_Yr_Blt'],axis=1)
    x_train = df_train_test.iloc[0:train_num]
    x_test = df_train_test.iloc[train_num:]

    return x_train,x_test

def lasso_model(x_train,y_train,x_test,y_test,alpha=0.5):
    """Fit a Lasso regression and predict on the test features.

    ``y_test`` is accepted but not used by this function.

    Returns
    -------
    (model, y_predict)
        The fitted ``Lasso`` estimator and its predictions for ``x_test``.
    """
    fitted = Lasso(alpha=alpha).fit(x_train, y_train)
    return fitted, fitted.predict(x_test)


In [3]:
# Load the raw inputs and materialise split number 1.
data, testIds = load_data()
train, test = created_train_test(data, testIds, j=1)

# Report the size of each raw split.
for message, frame in (
    ("Number of elements in the Training Set: {}", train),
    ("Number of elements in the Test Set: {}", test),
):
    print(message.format(frame.shape))

# Persist both splits as CSV files (row index omitted).
train.to_csv(
    "/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/train.csv",
    index=False,
)
test.to_csv(
    "/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/test.csv",
    index=False,
)

# Targets are the natural log of Sale_Price; features are everything else.
y_train, y_test = np.log(train['Sale_Price']), np.log(test['Sale_Price'])
x_train, x_test = train.drop(columns=['Sale_Price']), test.drop(columns=['Sale_Price'])

x_train_transformed, x_test_transformed = feature_engineering(x_train, x_test)

# Report the size of each transformed feature frame.
for message, frame in (
    ("Number of elements in the Train Transformed Set: {}", x_train_transformed),
    ("Number of elements in the Test Transformed Set: {}", x_test_transformed),
):
    print(message.format(frame.shape))


Number of elements in the Training Set: (2051, 83)
Number of elements in the Test Set: (879, 83)
Number of elements in the Train Transformed Set: (2051, 81)
Number of elements in the Test Transformed Set: (879, 81)


In [86]:
# Working notes: columns that were being reviewed while building the feature list.
# NOTE(review): presumably these were candidates for exclusion, but
# Total_Bsmt_SF, Street and Utilities still appear in selected_columns below -
# confirm the intent before relying on this list.
#Lot_Area
#BsmtFin_Type_1
#BsmtFin_SF_1
#BsmtFin_Type_2
#BsmtFin_SF_2
#Bsmt_Unf_SF
#Total_Bsmt_SF
#Year_Built
#Year_Remod_Add
#Low_Qual_Fin_SF
#Year_Sold
#Street - Only 2 values (0,1)
#Utilities - Only 2 values (0,1)
#
# Feature subset used to index x_train_transformed / x_test_transformed when
# fitting and predicting in the cross-validation loop.
selected_columns = ['MS_SubClass','MS_Zoning','Lot_Frontage','Street','Alley','Lot_Shape',
                    'Land_Contour','Utilities','Lot_Config','Land_Slope','Neighborhood',
                    'Condition_1','Condition_2','Bldg_Type','House_Style','Overall_Qual',
                    'Overall_Cond','Roof_Style','Roof_Matl','Exterior_1st','Exterior_2nd',
                    'Mas_Vnr_Type','Mas_Vnr_Area','Exter_Qual','Exter_Cond','Foundation',
                    'Bsmt_Qual','Bsmt_Cond','Bsmt_Exposure','Total_Bsmt_SF','Heating',
                    'Heating_QC','Central_Air','Electrical','First_Flr_SF','Second_Flr_SF',
                    'Gr_Liv_Area','Bsmt_Full_Bath','Bsmt_Half_Bath','Full_Bath','TotRms_AbvGrd',
                    'Functional','Fireplaces','Fireplace_Qu','Garage_Type','Garage_Finish',
                    'Garage_Cars','Garage_Area','Garage_Qual','Garage_Cond','Paved_Drive',
                    'Wood_Deck_SF','Open_Porch_SF','Enclosed_Porch','Three_season_porch',
                    'Pool_Area', 'Pool_QC', 'Fence', 'Misc_Feature', 'Misc_Val', 'Mo_Sold',
                    'Sale_Type', 'Sale_Condition'
                   ]
                    
# K-fold cross-validation of Lasso over a grid of regularisation strengths.
# For every alpha ("lamda") the loop records:
#   * out-of-fold actuals and predictions for every training row, and
#   * test-set predictions averaged over the fold models.
split_ = 10
alpha_ = np.array([0.00001,0.0001,0.0002,0.0003,0.001,0.002,0.003,0.01,0.02])

kf = KFold(n_splits=split_)

# Column selection does not depend on alpha or fold, so do it once.
x_train_selected = x_train_transformed[selected_columns]
x_test_selected = x_test_transformed[selected_columns]

# One column per alpha; rows line up with the training / test rows.
y_train_actual_lamda = np.zeros((x_train_transformed.shape[0],alpha_.shape[0]))
y_train_predict_lamda = np.zeros((x_train_transformed.shape[0],alpha_.shape[0]))
y_test_predict_lamda = np.zeros((x_test_transformed.shape[0],alpha_.shape[0]))

for lamda_count_, lamda in enumerate(alpha_):
    print ("Processing for lamda:{}".format(lamda))

    # Test-set predictions of each fold's model, one column per fold.
    y_test_predict_kfold = np.zeros((x_test_transformed.shape[0],split_))

    # Split on the same frame that is indexed below (previously the
    # untransformed x_train was passed; only its row count mattered).
    for fold_, (train_idx, test_idx) in enumerate(kf.split(x_train_selected, y_train)):
        # NOTE(review): this rebinds the global name `lasso_model`, which is
        # also the name of a helper function defined earlier in this notebook.
        # The name is kept because later cells may read the last fitted model.
        lasso_model = Lasso(alpha=lamda)
        lasso_model.fit(x_train_selected.iloc[train_idx], y_train.iloc[train_idx])
        y_train_predict = lasso_model.predict(x_train_selected.iloc[test_idx])
        y_test_predict = lasso_model.predict(x_test_selected)

        # Store out-of-fold values at their own row positions, so the result
        # stays aligned with y_train even if the splitter shuffles rows.
        y_train_actual_lamda[test_idx, lamda_count_] = y_train.iloc[test_idx]
        y_train_predict_lamda[test_idx, lamda_count_] = y_train_predict

        y_test_predict_kfold[:, fold_] = y_test_predict

        # RMSE on the held-out fold. np.sqrt(MSE) is used instead of
        # `squared=False`, which recent scikit-learn releases no longer accept.
        fold_rmse = np.sqrt(mean_squared_error(y_train.iloc[test_idx], y_train_predict))
        print ("Lamda: {} - Fold: {} - Error Test: {:.3f}".format(lamda,fold_+1,fold_rmse))

    # Average the fold models' predictions on the test set.
    y_test_predict_lamda[:,lamda_count_] = np.mean(y_test_predict_kfold,axis=1)

    # "Error Train" is the RMSE of the out-of-fold predictions over all
    # training rows; "Error Test" is the RMSE of the fold-averaged prediction.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict_lamda[:,lamda_count_]))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predict_lamda[:,lamda_count_]))
    print ("Lamda: {} - Error Train: {:.3f}".format(lamda,train_rmse))
    print ("Lamda: {} - Error Test : {:.3f}".format(lamda,test_rmse))
    

Processing for lamda:1e-05
Lamda: 1e-05 - Fold: 1 - Error Test: 0.198
Lamda: 1e-05 - Fold: 2 - Error Test: 0.147
Lamda: 1e-05 - Fold: 3 - Error Test: 0.182
Lamda: 1e-05 - Fold: 4 - Error Test: 0.124
Lamda: 1e-05 - Fold: 5 - Error Test: 0.162
Lamda: 1e-05 - Fold: 6 - Error Test: 0.236
Lamda: 1e-05 - Fold: 7 - Error Test: 0.168
Lamda: 1e-05 - Fold: 8 - Error Test: 0.241
Lamda: 1e-05 - Fold: 9 - Error Test: 0.132
Lamda: 1e-05 - Fold: 10 - Error Test: 0.176
Lamda: 1e-05 - Error Train: 0.181
Lamda: 1e-05 - Error Test : 0.167
Processing for lamda:0.0001
Lamda: 0.0001 - Fold: 1 - Error Test: 0.198
Lamda: 0.0001 - Fold: 2 - Error Test: 0.147
Lamda: 0.0001 - Fold: 3 - Error Test: 0.182
Lamda: 0.0001 - Fold: 4 - Error Test: 0.124
Lamda: 0.0001 - Fold: 5 - Error Test: 0.162
Lamda: 0.0001 - Fold: 6 - Error Test: 0.236
Lamda: 0.0001 - Fold: 7 - Error Test: 0.168
Lamda: 0.0001 - Fold: 8 - Error Test: 0.239
Lamda: 0.0001 - Fold: 9 - Error Test: 0.132
Lamda: 0.0001 - Fold: 10 - Error Test: 0.176
Lamda