In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data():
    columns = ["Col1","Col 2","Col 3","Col 4","Col 5","Col 6", "Col 7", "Col 8","Col 9","Col 10"]
    data = pd.read_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/Ames_data.csv")
    testIds = pd.read_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/project1_testIDs.dat",sep = " ", names = columns)
    
    return data, testIds

def created_train_test(data,testIds,j):
    j = j-1
    test = data.iloc[np.array(testIds)[:,j]]
    train = data.drop(np.array(testIds)[:,j], axis=0)
    return train,test

def onehot_encoding(x_train_input,x_test_input):
    #number of record
    train_num = x_train_input.shape[0]
    test_num = x_test_input.shape[0]
    #Merge the Train & Test
    df = [x_train_input,x_test_input]
    df_train_test = pd.concat(df)
    
    #Below columns needs to be dropped because of High Imbalance in data
    #"Garage_Yr_Blt" for now, as it has lots of NaN
    drop_columns = ['PID','Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 
                    'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude',
                    'Latitude','Land_Slope','Bsmt_Half_Bath','Three_season_porch','Misc_Val',
                    'Garage_Yr_Blt'
                    ]
    #Let's drop the column
    df_train_test = df_train_test.drop(drop_columns,axis=1)
    
    #Label Encoder to transform the Categoricsal Variable
    #lbe = LabelEncoder() -- This is to create the dummy value for all the Categorical value
    ohe = OneHotEncoder()
    
    for col_name in df_train_test.columns[df_train_test.dtypes == 'object']:
        #df_train_test[col_name] = lbe.fit_transform(df_train_test[col_name])
        df_get_dummies = pd.get_dummies(df_train_test[col_name],drop_first=True,prefix=col_name)
        df_train_test=pd.concat([df_train_test,df_get_dummies],axis=1)
        
    #Drop all the categorical columns, as we have created the dummies columns
    drop_cat_col_name= []
    for col_name in df_train_test.columns[df_train_test.dtypes == 'object']:
        drop_cat_col_name = np.append(drop_cat_col_name,col_name)
    
    df_train_test = df_train_test.drop(drop_cat_col_name,axis=1)
    
    #Split the Train & Test Data
    x_train_return = df_train_test.iloc[0:train_num]
    x_test_return = df_train_test.iloc[train_num:]
    
    return x_train_return,x_test_return


def winsorization(x_train_input,x_test_input):
    #Purposefully, removed the column = "Three_season_porch", "Misc_Val"
    winsorization_cols = ['Lot_Frontage', 'Lot_Area', 'Mas_Vnr_Area', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 
                          'Total_Bsmt_SF', 'Second_Flr_SF', 'First_Flr_SF', 'Gr_Liv_Area', 'Garage_Area',
                          'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch',  
                          'Screen_Porch']
    quan_val = 0.95
    for winso_columns in winsorization_cols:
        col_quant_value = np.quantile(x_train_input[winso_columns],quan_val)
        x_train_input[winso_columns][x_train_input[winso_columns] > col_quant_value] = col_quant_value
        x_test_input[winso_columns][x_test_input[winso_columns] > col_quant_value] = col_quant_value
        #print ("Column : {} 95% Quantile: {}".format(winso_columns,col_quant_value))
        
    return x_train_input,x_test_input
        
def lasso_model(x_train,y_train,x_test,y_test,alpha=0.5):
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(x_train,y_train)
    y_predict = lasso_model.predict(x_test)
    return lasso_model,y_predict

In [7]:
for i in np.arange(10):
    print (i+1)

1
2
3
4
5
6
7
8
9
10


In [3]:
data, testIds = load_data()

for i in np.arange(10):
    train,test = created_train_test(data,testIds,j=i+1)
    #Print the dataset size
    print ("Number of elements in the Training Set: {}".format(train.shape))
    print ("Number of elements in the Test Set: {}".format(test.shape))
    #Write to the Train.csv file
    train.to_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/train.csv",index=False)
    test.to_csv("/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/test.csv",index=False)
    

y_train = np.log(train['Sale_Price'])
x_train = train.drop(['Sale_Price'],axis=1)
y_test = np.log(test['Sale_Price'])
x_test = test.drop(['Sale_Price'],axis=1)

x_train_onehot,x_test_onehot = onehot_encoding(x_train,x_test)
#Print the dataset size
print ("Number of elements in the Train Transformed Set: {}".format(x_train_onehot.shape))
print ("Number of elements in the Test Transformed Set: {}".format(x_test_onehot.shape))

x_train_final,x_test_final = winsorization(x_train_onehot,x_test_onehot)

#Print the dataset size
print ("Number of elements in the Train Winsor Set: {}".format(x_train_final.shape))
print ("Number of elements in the Test Winsor Set : {}".format(x_test_final.shape))


Number of elements in the Training Set: (2051, 83)
Number of elements in the Test Set: (879, 83)
Number of elements in the Train Transformed Set: (2051, 266)
Number of elements in the Test Transformed Set: (879, 266)
Number of elements in the Train Winsor Set: (2051, 266)
Number of elements in the Test Winsor Set : (879, 266)


In [4]:
#------------------------------------------------------------------------
#
# Shrinking Methods (Lasso)
#
#------------------------------------------------------------------------

print ("#------------------------------------------------------------------------")
print ("# Shrinking Methods (Lasso)")
print ("#------------------------------------------------------------------------")

split_ = 10
alpha_ = 0.0001
fold_ = 0
kf = KFold(n_splits=split_)

y_train_actual_lasso = np.zeros((x_train_final.shape[0],1))
y_train_predict_lasso = np.zeros((x_train_final.shape[0],1))
y_test_predict_lasso = np.zeros((x_test_final.shape[0],1))

y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

y_train_predict_array = np.array(())
y_train_actual_array = np.array(())
for (train_idx,test_idx) in kf.split(x_train_final,y_train):
    lasso_model = Lasso(alpha=alpha_) #alpha value set to 0.0001
    lasso_model.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
    y_train_predict = lasso_model.predict(x_train_final.iloc[test_idx])
    y_test_predict  = lasso_model.predict(x_test_final)

    y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
    y_train_predict_array = np.append(y_train_predict_array,y_train_predict)

    y_test_predict_kfold[:,fold_] =  y_test_predict

    print ("Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(fold_,mean_squared_error(y_train.iloc[test_idx], y_train_predict, squared=False),mean_squared_error(y_test, y_test_predict, squared=False)))
    fold_ = fold_ + 1

y_train_actual_lasso[:,0] = y_train_actual_array
y_train_predict_lasso[:,0] = y_train_predict_array

y_test_predict_lasso[:,0] = np.mean(y_test_predict_kfold,axis=1)

print ("#------------------------------------------------------------------------")
print ("Lamda: {} - Overall Train Error: {:.3f} - Overall Test Error: {:.3f}".format(alpha_,mean_squared_error(y_train, y_train_predict_lasso[:,0], squared=False),mean_squared_error(y_test, y_test_predict_lasso[:,0], squared=False)))
print ("#------------------------------------------------------------------------")


#------------------------------------------------------------------------
# Shrinking Methods (Lasso)
#------------------------------------------------------------------------
Fold: 0 - Error Validation: 0.155, Error Test: 0.122
Fold: 1 - Error Validation: 0.118, Error Test: 0.125
Fold: 2 - Error Validation: 0.159, Error Test: 0.130
Fold: 3 - Error Validation: 0.089, Error Test: 0.124
Fold: 4 - Error Validation: 0.120, Error Test: 0.124
Fold: 5 - Error Validation: 0.135, Error Test: 0.126
Fold: 6 - Error Validation: 0.131, Error Test: 0.128
Fold: 7 - Error Validation: 0.133, Error Test: 0.126
Fold: 8 - Error Validation: 0.088, Error Test: 0.125
Fold: 9 - Error Validation: 0.115, Error Test: 0.126
#------------------------------------------------------------------------
Lamda: 0.0001 - Overall Train Error: 0.126 - Overall Test Error: 0.124
#------------------------------------------------------------------------


In [5]:
#------------------------------------------------------------------------
#
# Boosting Model (Xgboost)
#
#------------------------------------------------------------------------

print ("#------------------------------------------------------------------------")
print ("# Boosting Model (Xgboost)")
print ("#------------------------------------------------------------------------")


split_ = 10
max_depth_count_ = 0
colsample_bytree_ = 0.1
learning_rate_ = 0.04
max_depth_ = 25
alpha_ = 1
fold_ = 0
kf = KFold(n_splits=split_)

y_train_actual_xgboost = np.zeros((x_train_final.shape[0],1))
y_train_predict_xgboost = np.zeros((x_train_final.shape[0],1))
y_test_predict_xgboost = np.zeros((x_test_final.shape[0],1))

y_test_predict_kfold = np.zeros((x_test_final.shape[0],split_))

y_train_predict_array = np.array(())
y_train_actual_array = np.array(())
for (train_idx,test_idx) in kf.split(x_train_final,y_train):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', 
                                              colsample_bytree = 0.1, 
                                              learning_rate = 0.04,
                                              max_depth = 25, 
                                              alpha = 1, 
                                              n_estimators = 1000)
    
    xg_reg.fit(x_train_final.iloc[train_idx],y_train.iloc[train_idx])
    y_train_predict = xg_reg.predict(x_train_final.iloc[test_idx])
    y_test_predict  = xg_reg.predict(x_test_final)

    y_train_actual_array = np.append(y_train_actual_array,y_train.iloc[test_idx])
    y_train_predict_array = np.append(y_train_predict_array,y_train_predict)

    y_test_predict_kfold[:,fold_] =  y_test_predict

    print ("Fold: {} - Error Validation: {:.3f}, Error Test: {:.3f}".format(fold_,mean_squared_error(y_train.iloc[test_idx], y_train_predict, squared=False),mean_squared_error(y_test, y_test_predict, squared=False)))
    fold_ = fold_ + 1

y_train_actual_xgboost[:,0] = y_train_actual_array
y_train_predict_xgboost[:,0] = y_train_predict_array

y_test_predict_xgboost[:,0] = np.mean(y_test_predict_kfold,axis=1)

print ("#------------------------------------------------------------------------")
print ("Lamda: {} - Overall Error Train: {:.3f} - Overall Error Test : {:.3f}".format(alpha_,mean_squared_error(y_train, y_train_predict_lasso[:,0], squared=False),mean_squared_error(y_test, y_test_predict_lasso[:,0], squared=False)))
print ("#------------------------------------------------------------------------")

#------------------------------------------------------------------------
# Boosting Model (Xgboost)
#------------------------------------------------------------------------
Fold: 0 - Error Validation: 0.172, Error Test: 0.125
Fold: 1 - Error Validation: 0.122, Error Test: 0.128
Fold: 2 - Error Validation: 0.142, Error Test: 0.125
Fold: 3 - Error Validation: 0.096, Error Test: 0.127
Fold: 4 - Error Validation: 0.121, Error Test: 0.128
Fold: 5 - Error Validation: 0.129, Error Test: 0.127
Fold: 6 - Error Validation: 0.127, Error Test: 0.128
Fold: 7 - Error Validation: 0.138, Error Test: 0.129
Fold: 8 - Error Validation: 0.088, Error Test: 0.127
Fold: 9 - Error Validation: 0.141, Error Test: 0.128
#------------------------------------------------------------------------
Lamda: 1 - Overall Error Train: 0.126 - Overall Error Test : 0.124
#------------------------------------------------------------------------
