In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data():
    columns = ["Col1","Col 2","Col 3","Col 4","Col 5","Col 6", "Col 7", "Col 8","Col 9","Col 10"]
    data = pd.read_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/Ames_data.csv")
    testIds = pd.read_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/project1_testIDs.dat",sep = " ", names = columns)
    
    return data, testIds

def created_train_test(data,testIds,j):
    j = j-1
    test = data.iloc[np.array(testIds)[:,j]]
    train = data.drop(np.array(testIds)[:,j], axis=0)
    return train,test

def onehot_encoding(x_train_input,x_test_input):
    #number of record
    train_num = x_train_input.shape[0]
    test_num = x_test_input.shape[0]
    #Merge the Train & Test
    df = [x_train_input,x_test_input]
    df_train_test = pd.concat(df)
    
    #Below columns needs to be dropped because of High Imbalance in data
    #"Garage_Yr_Blt" for now, as it has lots of NaN
    drop_columns = ['PID','Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 
                    'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude',
                    'Latitude','Land_Slope','Bsmt_Half_Bath','Three_season_porch','Misc_Val',
                    'Garage_Yr_Blt'
                    ]
    #Let's drop the column
    df_train_test = df_train_test.drop(drop_columns,axis=1)
    
    #Label Encoder to transform the Categoricsal Variable
    #lbe = LabelEncoder() -- This is to create the dummy value for all the Categorical value
    ohe = OneHotEncoder()
    
    for col_name in df_train_test.columns[df_train_test.dtypes == 'object']:
        #df_train_test[col_name] = lbe.fit_transform(df_train_test[col_name])
        df_get_dummies = pd.get_dummies(df_train_test[col_name],drop_first=True,prefix=col_name)
        df_train_test=pd.concat([df_train_test,df_get_dummies],axis=1)
        
    #Drop all the categorical columns, as we have created the dummies columns
    drop_cat_col_name= []
    for col_name in df_train_test.columns[df_train_test.dtypes == 'object']:
        drop_cat_col_name = np.append(drop_cat_col_name,col_name)
    
    df_train_test = df_train_test.drop(drop_cat_col_name,axis=1)
    
    #Split the Train & Test Data
    x_train_return = df_train_test.iloc[0:train_num]
    x_test_return = df_train_test.iloc[train_num:]
    
    return x_train_return,x_test_return


def winsorization(x_train_input,x_test_input):
    #Purposefully, removed the column = "Three_season_porch", "Misc_Val"
    winsorization_cols = ['Lot_Frontage', 'Lot_Area', 'Mas_Vnr_Area', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 
                          'Total_Bsmt_SF', 'Second_Flr_SF', 'First_Flr_SF', 'Gr_Liv_Area', 'Garage_Area',
                          'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch',  
                          'Screen_Porch']
    quan_val = 0.95
    for winso_columns in winsorization_cols:
        col_quant_value = np.quantile(x_train_input[winso_columns],quan_val)
        x_train_input[winso_columns][x_train_input[winso_columns] > col_quant_value] = col_quant_value
        x_test_input[winso_columns][x_test_input[winso_columns] > col_quant_value] = col_quant_value
        #print ("Column : {} 95% Quantile: {}".format(winso_columns,col_quant_value))
        
    return x_train_input,x_test_input
        
def lasso_model(x_train,y_train,x_test,y_test,alpha=0.5):
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(x_train,y_train)
    y_predict = lasso_model.predict(x_test)
    return lasso_model,y_predict

In [3]:
data, testIds = load_data()
train,test = created_train_test(data,testIds,j=1)

#Print the dataset size
print ("Number of elements in the Training Set: {}".format(train.shape))
print ("Number of elements in the Test Set: {}".format(test.shape))

#Write to the Train.csv file
train.to_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/train.csv",index=False)
test.to_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/test.csv",index=False)

y_train = np.log(train['Sale_Price'])
x_train = train.drop(['Sale_Price'],axis=1)
y_test = np.log(test['Sale_Price'])
x_test = test.drop(['Sale_Price'],axis=1)

x_train_onehot,x_test_onehot = onehot_encoding(x_train,x_test)
#Print the dataset size
print ("Number of elements in the Train Transformed Set: {}".format(x_train_onehot.shape))
print ("Number of elements in the Test Transformed Set: {}".format(x_test_onehot.shape))

x_train_final,x_test_final = winsorization(x_train_onehot,x_test_onehot)

#Print the dataset size
print ("Number of elements in the Train Winsor Set: {}".format(x_train_final.shape))
print ("Number of elements in the Test Winsor Set : {}".format(x_test_final.shape))


Number of elements in the Training Set: (2051, 83)
Number of elements in the Test Set: (879, 83)
Number of elements in the Train Transformed Set: (2051, 266)
Number of elements in the Test Transformed Set: (879, 266)
Number of elements in the Train Winsor Set: (2051, 266)
Number of elements in the Test Winsor Set : (879, 266)


In [4]:
#------------------------------------------------------------------------
#
# Shrinking Methods (Lasso)
#
#------------------------------------------------------------------------

split_ = 10
alpha_ = np.array([0.0001])
#alpha_ = np.array([1,0.00001,0.0001,0.0002,0.0003,0.001,0.002,0.003,0.01,0.02])

kf = KFold(n_splits=split_)

y_train_actual_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_train_predict_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_test_predict_lamda = np.zeros((x_test_final.shape[0],alpha_.shape[0]))

for lamda in alpha_:
    print ("Processing for lamda:{}".format(lamda))
    
    fold_ = 0
    y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

    y_train_predict_array = np.array(())
    y_train_actual_array = np.array(())
    for (train_idx,test_idx) in kf.split(x_train_final,y_train):
        #print (test_idx.shape)
        #print(fold_)
        lasso_model = Lasso(alpha=lamda)
        lasso_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = lasso_model.predict(x_train_final.iloc[test_idx])
        y_test_predict  = lasso_model.predict(x_test_final)
        
        y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
        y_train_predict_array = np.append(y_train_predict_array,y_train_predict)
        
        y_test_predict_kfold[:,fold_] =  y_test_predict
        
        
        print ("Lamda: {} - Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(lamda,fold_,mean_squared_error(y_train.iloc[test_idx], y_train_predict, squared=False),mean_squared_error(y_test, y_test_predict, squared=False)))
        
        fold_ = fold_ + 1
    
    y_train_actual_lamda[:,lamda_count_] = y_train_actual_array
    y_train_predict_lamda[:,lamda_count_] = y_train_predict_array
    
    y_test_predict_lamda[:,lamda_count_] = np.mean(y_test_predict_kfold,axis=1)
    
    
    print ("Lamda: {} - Overall Error Train: {:.3f} - Overall Error Test : {:.3f}".format(lamda,mean_squared_error(y_train, y_train_predict_lamda[:,1], squared=False)))
    print ("Lamda: {} - ".format(lamda,mean_squared_error(y_test, y_test_predict_lamda[:,1], squared=False)))


Processing for lamda:0.0001
Lamda: 0.0001 - Fold: 0 - Error Validation: 0.155, Error Test: 0.122
Lamda: 0.0001 - Fold: 1 - Error Validation: 0.118, Error Test: 0.125
Lamda: 0.0001 - Fold: 2 - Error Validation: 0.159, Error Test: 0.130
Lamda: 0.0001 - Fold: 3 - Error Validation: 0.089, Error Test: 0.124
Lamda: 0.0001 - Fold: 4 - Error Validation: 0.120, Error Test: 0.124
Lamda: 0.0001 - Fold: 5 - Error Validation: 0.135, Error Test: 0.126
Lamda: 0.0001 - Fold: 6 - Error Validation: 0.131, Error Test: 0.128
Lamda: 0.0001 - Fold: 7 - Error Validation: 0.133, Error Test: 0.126
Lamda: 0.0001 - Fold: 8 - Error Validation: 0.088, Error Test: 0.125
Lamda: 0.0001 - Fold: 9 - Error Validation: 0.115, Error Test: 0.126


IndexError: tuple index out of range

In [None]:
#------------------------------------------------------------------------
#
# Elastic Nets
#
#------------------------------------------------------------------------

split_ = 10
lamda_count_ = 0
#alpha_ = np.array([1,0.00001,0.0001,0.0002,0.0003,0.001,0.002,0.003,0.01,0.02])
alpha_ = np.array([0.0001])

kf = KFold(n_splits=split_)

y_train_actual_en_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_train_predict_en_lamda = np.zeros((x_train_final.shape[0],alpha_.shape[0]))
y_test_predict_en_lamda = np.zeros((x_test_final.shape[0],alpha_.shape[0]))

for lamda in alpha_:
    print ("Processing for lamda:{}".format(lamda))
    
    fold_ = 0
    y_test_predict_en_kfold = np.zeros((x_test_final.shape[0],split_))

    y_train_predict_en_array = np.array(())
    y_train_actual_en_array = np.array(())
    for (train_idx,test_idx) in kf.split(x_train_final,y_train):
        #print (test_idx.shape)
        #print(fold_)
        en_model = ElasticNet(alpha=lamda)
        en_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = en_model.predict(x_train_final.iloc[test_idx])
        y_test_predict  = en_model.predict(x_test_final)
        
        y_train_actual_en_array = np.append(y_train_actual_en_array,y_train.iloc[test_idx])
        y_train_predict_en_array = np.append(y_train_predict_en_array,y_train_predict)
        
        y_test_predict_en_kfold[:,fold_] =  y_test_predict
        
        
        print ("Lamda: {} - Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(lamda,fold_,mean_squared_error(y_train.iloc[test_idx], y_train_predict, squared=False),mean_squared_error(y_test, y_test_predict, squared=False)))
        
        fold_ = fold_ + 1
    
    y_train_actual_en_lamda[:,lamda_count_] = y_train_actual_en_array
    y_train_predict_en_lamda[:,lamda_count_] = y_train_predict_en_array
    
    y_test_predict_en_lamda[:,lamda_count_] = np.mean(y_test_predict_en_kfold,axis=1)
    
    
    print ("Lamda: {} - Error Train: {:.3f}".format(lamda,mean_squared_error(y_train, y_train_predict_en_lamda[:,lamda_count_], squared=False)))
    print ("Lamda: {} - Error Test : {:.3f}".format(lamda,mean_squared_error(y_test, y_test_predict_en_lamda[:,lamda_count_], squared=False)))
    
    lamda_count_ = lamda_count_ + 1
    

In [None]:
#------------------------------------------------------------------------
#
# Random Forest Regressor
#
#------------------------------------------------------------------------
split_ = 10
max_depth_count_ = 0
min_samples_split = np.array([2])
min_samples_leaf = np.array([1])
#max_depth_ = np.array([2,5,10,16,18,24,28,30,90,120])
max_depth_ = np.array([30])


kf = KFold(n_splits=split_)

y_train_actual_max_depth = np.zeros((x_train_final.shape[0],max_depth_.shape[0]))
y_train_predict_max_depth = np.zeros((x_train_final.shape[0],max_depth_.shape[0]))
y_test_predict_max_depth = np.zeros((x_test_final.shape[0],max_depth_.shape[0]))

for depth in max_depth_:
    for sample in min_samples_split:
        for leaf in min_samples_leaf:
            print ("max_depth:{} min_samples_split:{} min_samples_leaf:{} ".format(depth,sample,leaf))

            fold_ = 0
            #y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

            #y_train_predict_array = np.array(())
            #y_train_actual_array = np.array(())
            for (train_idx,test_idx) in kf.split(x_train_final,y_train):
                rf_model = RandomForestRegressor(max_depth=depth,
                                                 n_estimators=1000,
                                                 random_state=125247)
                rf_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
                y_train_predict = rf_model.predict(x_train_final.iloc[test_idx])
                y_test_predict  = rf_model.predict(x_test_final)

                #y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
                #y_train_predict_array = np.append(y_train_predict_array,y_train_predict)

                #y_test_predict_kfold[:,fold_] =  y_test_predict


                print ("Max_Depth: {} - min_samples_split:{} - min_samples_leaf:{} - Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(depth,sample,leaf,fold_,mean_squared_error(y_train.iloc[test_idx], y_train_predict, squared=False),mean_squared_error(y_test, y_test_predict, squared=False)))

                fold_ = fold_ + 1

            #y_train_actual_max_depth[:,max_depth_count_] = y_train_actual_array
            #y_train_predict_max_depth[:,max_depth_count_] = y_train_predict_array

            #y_test_predict_max_depth[:,max_depth_count_] = np.mean(y_test_predict_kfold,axis=1)


            #print ("Max Depth: {} - Error Train: {:.3f}".format(depth,mean_squared_error(y_train, y_train_predict_max_depth[:,max_depth_count_], squared=False)))
            #print ("Max Depth: {} - Error Test : {:.3f}".format(depth,mean_squared_error(y_test, y_test_predict_max_depth[:,max_depth_count_], squared=False)))

            #max_depth_count_ = max_depth_count_ + 1
    

In [None]:
#------------------------------------------------------------------------
#
# Xgboost Regressor
#
#------------------------------------------------------------------------
#max_depth: 25 - learning_rate:0.04 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.172, Test: 0.125
#max_depth: 50 - learning_rate:0.04 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.172, Test: 0.125
#max_depth: 70 - learning_rate:0.04 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.172, Test: 0.125
#max_depth: 25 - learning_rate:0.01 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.176, Test: 0.127

split_ = 10
max_depth_count_ = 0
#n_estimators = [100,500,1000,1500,2000]
colsample_bytree = [0.1]
#learning_rate = [0.0001,0.001,0.01,0.1]
learning_rate = [0.04]
#max_depth_ = np.array([2,5,10,16,18,24,28,30,90,120])
max_depth_ = np.array([25])
alpha = np.array([1])
#max_depth_ = np.array([24])

kf = KFold(n_splits=split_)

#y_train_actual_max_depth = np.zeros((x_train_final.shape[0],max_depth_.shape[0]))
#y_train_predict_max_depth = np.zeros((x_train_final.shape[0],max_depth_.shape[0]))
#y_test_predict_max_depth = np.zeros((x_test_final.shape[0],max_depth_.shape[0]))

for d in max_depth_:
    for l in learning_rate:
        for c in colsample_bytree:
            for a in alpha:
                
                print ("max_depth:{} learning_rate:{} colsample_bytree:{} alpha:{}".format(d,l,c,a))

                fold_ = 0
                #y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

                #y_train_predict_array = np.array(())
                #y_train_actual_array = np.array(())
                for (train_idx,test_idx) in kf.split(x_train_final,y_train):
                    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', 
                                              colsample_bytree = c, 
                                              learning_rate = l,
                                              max_depth = d, 
                                              alpha = a, 
                                              n_estimators = 1000)
                    xg_reg.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
                    y_train_predict = xg_reg.predict(x_train_final.iloc[test_idx])
                    y_test_predict  = xg_reg.predict(x_test_final)

                    #y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
                    #y_train_predict_array = np.append(y_train_predict_array,y_train_predict)

                    #y_test_predict_kfold[:,fold_] =  y_test_predict


                    print ("max_depth: {} - learning_rate:{} - colsample_bytree:{} - alpha:{} - Fold: {} - Validation: {:.3f}, Test: {:.3f}".format(d,l,c,a,fold_,mean_squared_error(y_train.iloc[test_idx], y_train_predict, squared=False),mean_squared_error(y_test, y_test_predict, squared=False)))

                    fold_ = fold_ + 1

                #y_train_actual_max_depth[:,max_depth_count_] = y_train_actual_array
                #y_train_predict_max_depth[:,max_depth_count_] = y_train_predict_array

                #y_test_predict_max_depth[:,max_depth_count_] = np.mean(y_test_predict_kfold,axis=1)


                #print ("Max Depth: {} - Error Train: {:.3f}".format(depth,mean_squared_error(y_train, y_train_predict_max_depth[:,max_depth_count_], squared=False)))
                #print ("Max Depth: {} - Error Test : {:.3f}".format(depth,mean_squared_error(y_test, y_test_predict_max_depth[:,max_depth_count_], squared=False)))

                #max_depth_count_ = max_depth_count_ + 1
    

In [None]:
max_depth: 25 - learning_rate:0.04 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.172, Test: 0.125
max_depth: 50 - learning_rate:0.04 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.172, Test: 0.125
max_depth: 70 - learning_rate:0.04 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.172, Test: 0.125
max_depth: 25 - learning_rate:0.01 - colsample_bytree:0.1 - alpha:1 - Fold: 0 - Validation: 0.176, Test: 0.127