In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data(folder_name):
    """Read the housing data set and the table of held-out row ids.

    Parameters
    ----------
    folder_name : str
        Directory holding ``Ames_data.csv`` and ``project1_testIDs.dat``.
        A trailing path separator is optional.

    Returns
    -------
    data : pandas.DataFrame
        Contents of ``Ames_data.csv``.
    testIds : pandas.DataFrame
        Contents of the space-separated ``project1_testIDs.dat``, labelled
        with ten placeholder column names (one column per split).
    """
    columns = ["Col1","Col 2","Col 3","Col 4","Col 5","Col 6", "Col 7", "Col 8","Col 9","Col 10"]
    # os.path.join only inserts a separator when one is missing, so both
    # "…/data" and "…/data/" resolve to the same files.
    data = pd.read_csv(os.path.join(folder_name, "Ames_data.csv"))
    testIds = pd.read_csv(os.path.join(folder_name, "project1_testIDs.dat"),
                          sep=" ", names=columns)

    return data, testIds

def created_train_test(data,testIds,j):
    """Split ``data`` into train and test rows for split number ``j``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set.
    testIds : pandas.DataFrame or array-like, 2-D
        Column ``j - 1`` lists the row positions that form the test set.
    j : int
        1-based split number.

    Returns
    -------
    train, test : pandas.DataFrame
        ``test`` holds the listed rows in the listed order; ``train`` holds
        every remaining row in its original order.

    Notes
    -----
    Rows are selected by position for both halves, so the split is
    complementary whatever index ``data`` carries. (Selecting the test rows
    by position but removing them by label only agrees on a default
    RangeIndex.)

    NOTE(review): the ids are used as 0-based positions. If the id file was
    produced by 1-based tooling the split is shifted by one row — confirm
    against the data source.
    """
    held_out = np.asarray(testIds)[:, j - 1]

    # Boolean mask over row positions; True marks a held-out row.
    is_test = np.zeros(len(data), dtype=bool)
    is_test[held_out] = True

    test = data.iloc[held_out]
    train = data.iloc[~is_test]
    return train,test

def create_folders_for_train_test_files(data,testIds,folder_name):
    """Write the train/test files for each of the ten splits to disk.

    For split ``k`` (1..10) a directory ``folder_<k>`` is created under
    ``folder_name`` (if it does not exist yet) and three CSV files are
    written into it:

    * ``train.csv``  - training rows, including ``Sale_Price``
    * ``test.csv``   - test rows with ``Sale_Price`` removed
    * ``y_test.csv`` - the ``Sale_Price`` column of the test rows

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set; must contain a ``Sale_Price`` column.
    testIds : pandas.DataFrame or array-like
        Test-row ids, one column per split (passed to ``created_train_test``).
    folder_name : str
        Directory under which the ``folder_<k>`` directories are created.

    Returns
    -------
    bool
        Always ``True``.
    """
    n_splits = 10
    for folder in range(1, n_splits + 1):
        train,test = created_train_test(data,testIds,j=folder)
        test_y = pd.DataFrame({'Sale_Price':test['Sale_Price']})
        test = test.drop(['Sale_Price'],axis=1)
        print ("Folder: {} Created - Training Set Size:{} - Test Set Size:{}".format(folder,train.shape,test.shape))

        # Built-in str() replaces the np.str alias, which NumPy 1.24 removed.
        folder_name_final_ = os.path.join(folder_name, "folder_" + str(folder))
        # exist_ok avoids the separate exists() check (and its race window).
        os.makedirs(folder_name_final_, exist_ok=True)

        train.to_csv(os.path.join(folder_name_final_, "train.csv"),index=False)
        test.to_csv(os.path.join(folder_name_final_, "test.csv"),index=False)
        test_y.to_csv(os.path.join(folder_name_final_, "y_test.csv"),index=False)
    return True
    

def onehot_encoding(x_train_input,x_test_input):
    """Dummy-encode the categorical columns of the train and test features.

    The two frames are stacked before encoding and split again afterwards,
    so both outputs carry exactly the same columns even when a category
    level appears in only one of them.

    Steps, in order:

    1. Missing ``Garage_Yr_Blt`` values are replaced by 0.
    2. A fixed list of columns is dropped.
    3. Every ``object``-dtype column is replaced by dummy columns named
       ``<column>_<level>``; the first level of each column is dropped.

    Parameters
    ----------
    x_train_input, x_test_input : pandas.DataFrame
        Feature frames with identical columns. Neither is modified.

    Returns
    -------
    x_train_return, x_test_return : pandas.DataFrame
        Encoded copies with the same number of rows as the respective input.
    """
    train_num = x_train_input.shape[0]
    df_train_test = pd.concat([x_train_input, x_test_input])

    #Fill the na values to "0" for the feature 'Garage_Yr_Blt'
    df_train_test['Garage_Yr_Blt'] = df_train_test['Garage_Yr_Blt'].fillna(0)

    #Below columns needs to be dropped because of High Imbalance in data
    drop_columns = ['Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 
                    'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude',
                    'Latitude','Land_Slope','Bsmt_Half_Bath','Three_season_porch','Misc_Val'
                    ]
    df_train_test = df_train_test.drop(drop_columns,axis=1)

    # One get_dummies call encodes all categorical columns: the untouched
    # columns keep their order and the dummy columns are appended after them,
    # column by column.
    categorical_cols = list(df_train_test.columns[df_train_test.dtypes == 'object'])
    if categorical_cols:
        df_train_test = pd.get_dummies(df_train_test,
                                       columns=categorical_cols,
                                       drop_first=True)

    #Split the Train & Test Data
    # Copies (not slices of the stacked frame) so callers can modify them.
    x_train_return = df_train_test.iloc[:train_num].copy()
    x_test_return = df_train_test.iloc[train_num:].copy()

    return x_train_return,x_test_return


def winsorization(x_train_input,x_test_input,quan_val=0.95):
    """Cap a fixed set of numeric columns at a training-set quantile.

    For every listed column the ``quan_val`` quantile is computed on the
    training frame only; values above it are replaced by that quantile in
    both the training and the test frame.

    Parameters
    ----------
    x_train_input, x_test_input : pandas.DataFrame
        Feature frames containing all listed columns. Neither is modified.
    quan_val : float, default 0.95
        Quantile (between 0 and 1) used as the upper cap.

    Returns
    -------
    x_train_capped, x_test_capped : pandas.DataFrame
        Copies of the inputs with the listed columns capped.

    Notes
    -----
    Whole columns are assigned on copies. Chained assignment of the form
    ``frame[col][mask] = value`` does not update ``frame`` under pandas
    Copy-on-Write, so it is deliberately avoided here.
    """
    #Purposefully, removed the column = "Three_season_porch", "Misc_Val"
    winsorization_cols = ['Lot_Frontage', 'Lot_Area', 'Mas_Vnr_Area', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 
                          'Total_Bsmt_SF', 'Second_Flr_SF', 'First_Flr_SF', 'Gr_Liv_Area', 'Garage_Area',
                          'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch',  
                          'Screen_Porch']

    x_train_capped = x_train_input.copy()
    x_test_capped = x_test_input.copy()

    for winso_columns in winsorization_cols:
        # The cap always comes from the training data, also for the test set.
        col_quant_value = np.quantile(x_train_input[winso_columns],quan_val)
        x_train_capped[winso_columns] = x_train_input[winso_columns].clip(upper=col_quant_value)
        x_test_capped[winso_columns] = x_test_input[winso_columns].clip(upper=col_quant_value)

    return x_train_capped,x_test_capped

#------------------------------------------------------------------------
#
# Shrinking Methods (Lasso)
#
#------------------------------------------------------------------------
def lasso_model(x_train_lasso,y_train_lasso,x_test_lasso,y_test_lasso,print_ind=False):
    """Fit Lasso models with 10-fold cross-validation and score the test set.

    One ``Lasso(alpha=0.0001)`` model is fitted per fold. Each model predicts
    its own validation fold (giving one out-of-fold prediction per training
    row) and the full test set; the test prediction is the mean over the
    ten fold models.

    Parameters
    ----------
    x_train_lasso, x_test_lasso : pandas.DataFrame
        Feature frames containing a ``PID`` column, which is removed before
        fitting and reused for the submission frame.
    y_train_lasso : pandas.Series
        Training target, on the same scale the models are fitted on.
    y_test_lasso : array-like
        Test target on the same scale as ``y_train_lasso``.
    print_ind : bool, default False
        Print per-fold and overall errors when True.

    Returns
    -------
    rmse_train : float
        RMSE of the out-of-fold predictions over the training rows.
    rmse_test : float
        RMSE of the fold-averaged predictions over the test rows.
    df_submission : pandas.DataFrame
        Columns ``PID`` and ``Sale_Price``; ``Sale_Price`` is the
        exponential of the averaged test prediction rounded to one decimal.
    """
    x_test_lasso_PID = x_test_lasso['PID']

    x_train_lasso = x_train_lasso.drop(['PID'],axis=1)
    x_test_lasso = x_test_lasso.drop(['PID'],axis=1)

    split_ = 10
    alpha_ = 0.0001
    kf = KFold(n_splits=split_)

    # Out-of-fold prediction per training row, and one column of test
    # predictions per fold model.
    y_train_predict_lasso = np.zeros(x_train_lasso.shape[0])
    y_test_predict_kfold = np.zeros((x_test_lasso.shape[0],split_))

    for fold_, (train_idx,test_idx) in enumerate(kf.split(x_train_lasso,y_train_lasso)):
        # Named fold_model so the local does not shadow this function's name.
        fold_model = Lasso(alpha=alpha_)
        fold_model.fit(x_train_lasso.iloc[train_idx],y_train_lasso.iloc[train_idx])
        y_train_predict = fold_model.predict(x_train_lasso.iloc[test_idx])
        y_test_predict  = fold_model.predict(x_test_lasso)

        # Store by row position so the result does not depend on fold order.
        y_train_predict_lasso[test_idx] = y_train_predict
        y_test_predict_kfold[:,fold_] = y_test_predict

        if (print_ind):
            # sqrt(MSE) replaces squared=False, which scikit-learn 1.6 removed.
            print ("Lasso - Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(
                fold_,
                np.sqrt(mean_squared_error(y_train_lasso.iloc[test_idx], y_train_predict)),
                np.sqrt(mean_squared_error(y_test_lasso, y_test_predict))))

    y_test_predict_lasso = np.mean(y_test_predict_kfold,axis=1)

    # Plain arrays pair PID and prediction by row position; building the
    # frame from Series would align on index labels instead.
    df_submission = pd.DataFrame({'PID':x_test_lasso_PID.to_numpy(),
                                  'Sale_Price':np.round(np.exp(y_test_predict_lasso),1)})

    rmse_train = np.sqrt(mean_squared_error(y_train_lasso, y_train_predict_lasso))
    rmse_test = np.sqrt(mean_squared_error(y_test_lasso, y_test_predict_lasso))

    if (print_ind):
        print ("Lasso - Lamda: {} - Overall Train Error: {:.3f} - Overall Test Error: {:.3f}".format(alpha_,rmse_train,rmse_test))

    return rmse_train, rmse_test, df_submission

    
#------------------------------------------------------------------------
#
# Boosting Model (Xgboost)
#
#------------------------------------------------------------------------
def xgboost_model(x_train_xgboost,y_train_xgboost,x_test_xgboost,y_test_xgboost,print_ind=False):
    """Fit XGBoost regressors with 10-fold cross-validation and score the test set.

    One ``XGBRegressor`` is fitted per fold. Each model predicts its own
    validation fold (giving one out-of-fold prediction per training row) and
    the full test set; the test prediction is the mean over the ten fold
    models.

    Parameters
    ----------
    x_train_xgboost, x_test_xgboost : pandas.DataFrame
        Feature frames containing a ``PID`` column, which is removed before
        fitting and reused for the submission frame.
    y_train_xgboost : pandas.Series
        Training target, on the same scale the models are fitted on.
    y_test_xgboost : array-like
        Test target on the same scale as ``y_train_xgboost``.
    print_ind : bool, default False
        Print per-fold and overall errors when True.

    Returns
    -------
    rmse_train : float
        RMSE of the out-of-fold predictions over the training rows.
    rmse_test : float
        RMSE of the fold-averaged predictions over the test rows.
    df_submission : pandas.DataFrame
        Columns ``PID`` and ``Sale_Price``; ``Sale_Price`` is the
        exponential of the averaged test prediction rounded to one decimal.
    """
    # Hyperparameters, declared once and passed to every fold model.
    split_ = 10
    colsample_bytree_ = 0.1
    learning_rate_ = 0.04
    max_depth_ = 25
    alpha_ = 1
    n_estimators_ = 1000
    kf = KFold(n_splits=split_)

    x_test_xgboost_PID = x_test_xgboost['PID']

    x_train_xgboost = x_train_xgboost.drop(['PID'],axis=1)
    x_test_xgboost = x_test_xgboost.drop(['PID'],axis=1)

    # Out-of-fold prediction per training row, and one column of test
    # predictions per fold model.
    y_train_predict_xgboost = np.zeros(x_train_xgboost.shape[0])
    y_test_predict_kfold = np.zeros((x_test_xgboost.shape[0],split_))

    for fold_, (train_idx,test_idx) in enumerate(kf.split(x_train_xgboost,y_train_xgboost)):
        xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', 
                                  colsample_bytree = colsample_bytree_, 
                                  learning_rate = learning_rate_,
                                  max_depth = max_depth_, 
                                  alpha = alpha_, 
                                  n_estimators = n_estimators_)

        xg_reg.fit(x_train_xgboost.iloc[train_idx],y_train_xgboost.iloc[train_idx])
        y_train_predict = xg_reg.predict(x_train_xgboost.iloc[test_idx])
        y_test_predict  = xg_reg.predict(x_test_xgboost)

        # Store by row position so the result does not depend on fold order.
        y_train_predict_xgboost[test_idx] = y_train_predict
        y_test_predict_kfold[:,fold_] = y_test_predict

        if (print_ind):
            # sqrt(MSE) replaces squared=False, which scikit-learn 1.6 removed.
            print ("Xgboost - Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(
                fold_,
                np.sqrt(mean_squared_error(y_train_xgboost.iloc[test_idx], y_train_predict)),
                np.sqrt(mean_squared_error(y_test_xgboost, y_test_predict))))

    y_test_predict_xgboost = np.mean(y_test_predict_kfold,axis=1)

    # Plain arrays pair PID and prediction by row position; building the
    # frame from Series would align on index labels instead.
    df_submission = pd.DataFrame({'PID':x_test_xgboost_PID.to_numpy(),
                                  'Sale_Price':np.round(np.exp(y_test_predict_xgboost),1)})

    rmse_train = np.sqrt(mean_squared_error(y_train_xgboost, y_train_predict_xgboost))
    rmse_test = np.sqrt(mean_squared_error(y_test_xgboost, y_test_predict_xgboost))

    if (print_ind):
        print ("Xgboost - Lamda: {} - Overall Error Train: {:.3f} - Overall Error Test : {:.3f}".format(alpha_,rmse_train,rmse_test))

    return rmse_train,rmse_test,df_submission


In [3]:
#Load the Data
# The data directory defaults to the original local path but can be
# overridden with the AMES_DATA_DIR environment variable, so the notebook
# runs on other machines without editing this cell.
# Joining with "" guarantees a trailing separator, which later cells rely on
# when they build paths as folder_name + "folder_<k>".
folder_name = os.path.join(
    os.environ.get(
        "AMES_DATA_DIR",
        "/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/",
    ),
    "",
)
#Load the dataset
data, testIds = load_data(folder_name)
#Create the 10 Folders and dump its associated train.csv and test.csv
return_output = create_folders_for_train_test_files(data,
                                                    testIds,
                                                    folder_name)

Folder: 1 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 2 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 3 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 4 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 5 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 6 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 7 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 8 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 9 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)
Folder: 10 Created - Training Set Size:(2051, 83) - Test Set Size:(879, 82)


In [None]:
# Evaluate both models on each of the ten pre-generated splits and write the
# two submission files into the split's folder.
print_ind = False
for i in np.arange(10):
    # Built-in str() replaces the np.str alias, which NumPy 1.24 removed.
    fold_dir = os.path.join(folder_name, "folder_" + str(i + 1))

    train = pd.read_csv(os.path.join(fold_dir, "train.csv"))
    x_test = pd.read_csv(os.path.join(fold_dir, "test.csv"))
    y_test = pd.read_csv(os.path.join(fold_dir, "y_test.csv"))

    # Models are fitted and scored on log(Sale_Price).
    y_train = np.log(train['Sale_Price'])
    x_train = train.drop(['Sale_Price'],axis=1)
    y_test = np.log(y_test)

    x_train_onehot,x_test_onehot = onehot_encoding(x_train,x_test)

    if (print_ind):
        print ("After One Hot - Train:{} Test:{}".format(x_train_onehot.shape,x_test_onehot.shape))

    x_train_final,x_test_final = winsorization(x_train_onehot,x_test_onehot)

    if (print_ind):
        print ("After Winsor  - Train:{} Test:{}".format(x_train_final.shape,x_test_final.shape))

    #Calling the Model - 1
    rmse_train_lasso,rmse_test_lasso,df_submission_lasso = lasso_model(x_train_final,y_train,x_test_final,y_test)
    #Calling the Model - 2
    rmse_train_xgboost,rmse_test_xgboost,df_submission_xgboost = xgboost_model(x_train_final,y_train,x_test_final,y_test)

    #Write the Submission File into the Folder
    df_submission_lasso.to_csv(os.path.join(fold_dir, "mysubmission1.txt"),index=False)
    df_submission_xgboost.to_csv(os.path.join(fold_dir, "mysubmission2.txt"),index=False)

    print ("Folder:{} |-| Lasso Train Error:{:.3f},Lasso Test Error:{:.3f} |-| Xgboost Train Error:{:.3f},Xgboost Test Error:{:.3f}".format(i+1,rmse_train_lasso,rmse_test_lasso,rmse_train_xgboost,rmse_test_xgboost))
    

Folder:1 |-| Lasso Train Error:0.126,Lasso Test Error:0.124 |-| Xgboost Train Error:0.129,Xgboost Test Error:0.127
Folder:2 |-| Lasso Train Error:0.129,Lasso Test Error:0.114 |-| Xgboost Train Error:0.136,Xgboost Test Error:0.116
Folder:3 |-| Lasso Train Error:0.127,Lasso Test Error:0.124 |-| Xgboost Train Error:0.131,Xgboost Test Error:0.127
Folder:4 |-| Lasso Train Error:0.120,Lasso Test Error:0.133 |-| Xgboost Train Error:0.126,Xgboost Test Error:0.135
Folder:5 |-| Lasso Train Error:0.122,Lasso Test Error:0.131 |-| Xgboost Train Error:0.125,Xgboost Test Error:0.136
Folder:6 |-| Lasso Train Error:0.126,Lasso Test Error:0.124 |-| Xgboost Train Error:0.129,Xgboost Test Error:0.127
Folder:7 |-| Lasso Train Error:0.129,Lasso Test Error:0.114 |-| Xgboost Train Error:0.135,Xgboost Test Error:0.116
Folder:8 |-| Lasso Train Error:0.127,Lasso Test Error:0.124 |-| Xgboost Train Error:0.131,Xgboost Test Error:0.126


In [None]:
#After One Hot - Train:(2051, 267) Test:(879, 267)
#After Winsor  - Train:(2051, 267) Test:(879, 267)

In [None]:
Folder:1 |-| Lasso Train Error:0.126,Lasso Test Error:0.124 |-| Xgboost Train Error:0.129,Xgboost Test Error:0.127
Folder:2 |-| Lasso Train Error:0.129,Lasso Test Error:0.114 |-| Xgboost Train Error:0.136,Xgboost Test Error:0.116
Folder:3 |-| Lasso Train Error:0.127,Lasso Test Error:0.124 |-| Xgboost Train Error:0.131,Xgboost Test Error:0.127
Folder:4 |-| Lasso Train Error:0.120,Lasso Test Error:0.133 |-| Xgboost Train Error:0.126,Xgboost Test Error:0.135
Folder:5 |-| Lasso Train Error:0.122,Lasso Test Error:0.131 |-| Xgboost Train Error:0.125,Xgboost Test Error:0.136
Folder:6 |-| Lasso Train Error:0.126,Lasso Test Error:0.124 |-| Xgboost Train Error:0.129,Xgboost Test Error:0.127
Folder:7 |-| Lasso Train Error:0.129,Lasso Test Error:0.114 |-| Xgboost Train Error:0.135,Xgboost Test Error:0.116
Folder:8 |-| Lasso Train Error:0.127,Lasso Test Error:0.124 |-| Xgboost Train Error:0.131,Xgboost Test Error:0.126
Folder:9 |-| Lasso Train Error:0.120,Lasso Test Error:0.133 |-| Xgboost Train Error:0.127,Xgboost Test Error:0.136
Folder:10 |-| Lasso Train Error:0.122,Lasso Test Error:0.131 |-| Xgboost Train Error:0.125,Xgboost Test Error:0.136

In [None]:
#Folder 1  - RMSE Lasso:0.124 - RMSE Xgboost:0.126
#Folder 2  - RMSE Lasso:0.114 - RMSE Xgboost:0.114
#Folder 3  - RMSE Lasso:0.126 - RMSE Xgboost:0.126
#Folder 4  - RMSE Lasso:0.132 - RMSE Xgboost:0.132
#Folder 5  - RMSE Lasso:0.131 - RMSE Xgboost:0.131
#Folder 6  - RMSE Lasso:0.124 - RMSE Xgboost:0.124
#Folder 7  - RMSE Lasso:0.114 - RMSE Xgboost:0.114
#Folder 8  - RMSE Lasso:0.126 - RMSE Xgboost:0.126
#Folder 9  - RMSE Lasso:0.132 - RMSE Xgboost:0.132
#Folder 10 - RMSE Lasso:0.131 - RMSE Xgboost:0.131