In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data(data_dir="/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data"):
    """Read the Ames housing data and the table of test-row ids.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``Ames_data.csv`` and ``project1_testIDs.dat``.
        Defaults to the original author's local checkout so that existing
        ``load_data()`` calls keep working; pass another directory to run
        the notebook on a different machine.

    Returns
    -------
    data : pandas.DataFrame
        Contents of ``Ames_data.csv`` as parsed by ``pd.read_csv``.
    testIds : pandas.DataFrame
        Contents of ``project1_testIDs.dat`` read as a space-separated,
        header-less file with ten columns (one column per split).
    """
    import os

    # Column labels for the ten splits. They are only labels: downstream code
    # addresses the splits by position, so the inconsistent spacing is harmless.
    columns = ["Col1","Col 2","Col 3","Col 4","Col 5","Col 6", "Col 7", "Col 8","Col 9","Col 10"]
    data = pd.read_csv(os.path.join(data_dir, "Ames_data.csv"))
    testIds = pd.read_csv(os.path.join(data_dir, "project1_testIDs.dat"), sep=" ", names=columns)

    return data, testIds

def created_train_test(data,testIds,j):
    """Split ``data`` into train and test frames using column ``j`` of ``testIds``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set.
    testIds : pandas.DataFrame or array-like, 2-D
        Each column holds the row positions that form the test set of one split.
    j : int
        1-based number of the split (column of ``testIds``) to use.

    Returns
    -------
    train, test : pandas.DataFrame
        ``test`` holds the rows at the listed positions (in the listed order);
        ``train`` holds every remaining row in its original order.

    Raises
    ------
    ValueError
        If ``j`` is smaller than 1 (it would otherwise wrap around and
        silently select the last split).
    """
    if j < 1:
        raise ValueError("j is a 1-based split number and must be >= 1, got {}".format(j))

    # NOTE(review): the ids are used as 0-based row positions. If the id file
    # was produced by a 1-based tool they would need a -1 shift -- confirm.
    test_rows = np.asarray(testIds)[:, j - 1]
    test = data.iloc[test_rows]

    # Remove the test rows by *position* as well. Dropping by label only
    # matches the positional selection above when the index is the default
    # RangeIndex, and fails or drops the wrong rows otherwise.
    keep = np.ones(len(data), dtype=bool)
    keep[test_rows] = False
    train = data.iloc[keep]
    return train,test

def feature_engineering(x_train,x_test):
    """Label-encode the categorical features of the train and test frames together.

    The two frames are stacked so that every categorical column is encoded
    with a single mapping shared by both; the stacked frame is then cut back
    into its train and test parts.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames with identical columns, including ``Garage_Yr_Blt``.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Encoded frames with ``Garage_Yr_Blt`` removed; row counts and row
        order match the inputs.
    """
    # Remember where the train rows end so the stacked frame can be split back.
    train_num = x_train.shape[0]

    # Merge train and test
    df_train_test = pd.concat([x_train, x_test])

    # Label-encode the categorical variables. The columns are taken from the
    # frame actually being encoded rather than from a notebook-level global,
    # so the function depends only on its arguments.
    lbe = LabelEncoder()
    for col_name in df_train_test.columns[df_train_test.dtypes == 'object']:
        df_train_test[col_name] = lbe.fit_transform(df_train_test[col_name])

    # Drop the column "Garage_Yr_Blt" for now, as it has NaN values
    df_train_test = df_train_test.drop(['Garage_Yr_Blt'],axis=1)
    x_train = df_train_test.iloc[0:train_num]
    x_test = df_train_test.iloc[train_num:]

    return x_train,x_test

def lasso_model(x_train,y_train,x_test,y_test,alpha=0.5):
    """Fit a Lasso regression on the training data and predict for ``x_test``.

    ``y_test`` is accepted for signature compatibility but is not used.

    Returns the fitted estimator and its predictions for ``x_test``.
    """
    fitted = Lasso(alpha=alpha)
    fitted.fit(x_train, y_train)
    predictions = fitted.predict(x_test)
    return fitted, predictions


In [3]:
# Load everything and carve out the first train/test split.
data, testIds = load_data()
train, test = created_train_test(data, testIds, j=1)

# Report the size of both splits.
print(f"Number of elements in the Training Set: {train.shape}")
print(f"Number of elements in the Test Set: {test.shape}")

# Persist both splits next to the source data.
train.to_csv(
    "/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/train.csv",
    index=False,
)
test.to_csv(
    "/users/Sushanta/Documents/GitHub/Illinois/CS598 Practical Statistical Learning/Project/data/test.csv",
    index=False,
)

# The target is the natural log of the sale price; features are everything else.
x_train = train.drop(columns=['Sale_Price'])
y_train = np.log(train['Sale_Price'])
x_test = test.drop(columns=['Sale_Price'])
y_test = np.log(test['Sale_Price'])

x_train_transformed, x_test_transformed = feature_engineering(x_train, x_test)

# Report the size of the encoded feature frames.
print(f"Number of elements in the Train Transformed Set: {x_train_transformed.shape}")
print(f"Number of elements in the Test Transformed Set: {x_test_transformed.shape}")


Number of elements in the Training Set: (2051, 83)
Number of elements in the Test Set: (879, 83)
Number of elements in the Train Transformed Set: (2051, 81)
Number of elements in the Test Transformed Set: (879, 81)


In [6]:
# Show the column labels of x_train for reference.
x_train.columns

Index(['PID', 'MS_SubClass', 'MS_Zoning', 'Lot_Frontage', 'Lot_Area', 'Street',
       'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities', 'Lot_Config',
       'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type',
       'House_Style', 'Overall_Qual', 'Overall_Cond', 'Year_Built',
       'Year_Remod_Add', 'Roof_Style', 'Roof_Matl', 'Exterior_1st',
       'Exterior_2nd', 'Mas_Vnr_Type', 'Mas_Vnr_Area', 'Exter_Qual',
       'Exter_Cond', 'Foundation', 'Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure',
       'BsmtFin_Type_1', 'BsmtFin_SF_1', 'BsmtFin_Type_2', 'BsmtFin_SF_2',
       'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'Heating', 'Heating_QC', 'Central_Air',
       'Electrical', 'First_Flr_SF', 'Second_Flr_SF', 'Low_Qual_Fin_SF',
       'Gr_Liv_Area', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Full_Bath',
       'Half_Bath', 'Bedroom_AbvGr', 'Kitchen_AbvGr', 'Kitchen_Qual',
       'TotRms_AbvGrd', 'Functional', 'Fireplaces', 'Fireplace_Qu',
       'Garage_Type', 'Garage_Yr_Blt', 'Garage_Fi

In [94]:
# Percentage share of every level of one column in the training features.
column_name = 'Street'
level_counts = x_train[column_name].value_counts()
level_counts / level_counts.sum() * 100

Pave    99.56119
Grvl     0.43881
Name: Street, dtype: float64

In [None]:
#MS_Zoning
Residential_Low_Density         77.133106
Residential_Medium_Density      15.797172
Floating_Village_Residential     5.070697
C_all                            0.975134
Residential_High_Density         0.926377
A_agr                            0.048757
I_all                            0.048757
#Street <------------------------------------ Removed
Pave    99.56119
Grvl     0.43881
#Alley  
No_Alley_Access    93.369088
Gravel              3.851780
Paved               2.779132
#Lot_Shape
Regular                 63.578742
Slightly_Irregular      33.398342
Moderately_Irregular     2.486592
Irregular                0.536324
#Land_Contour
Lvl    90.443686
Bnk     3.754266
HLS     3.705510
Low     2.096538
#Utilities <------------------------------- Removed
AllPub    99.951243
NoSewr     0.048757
#Lot_Config
Inside     73.183813
Corner     17.308630
CulDSac     5.899561
FR2         3.022916
FR3         0.585080
#Land_Slope <------------------------------- Maybe remove? (Gtl is ~95%)
Gtl    95.27060
Mod     4.14432
Sev     0.58508
#Condition_1
Norm      86.104339
Feedr      5.655778
Artery     2.827889
RRAn       1.755241
PosN       1.365188
RRAe       1.072647
PosA       0.633837
RRNn       0.292540
RRNe       0.292540
#Condition_2 <----------------------------- Removed
Norm      99.073623
Feedr      0.390054
Artery     0.195027
PosA       0.097513
PosN       0.097513
RRAn       0.048757
RRNn       0.048757
RRAe       0.048757
#Bldg_Type
OneFam      82.057533
TwnhsE       8.142370
Duplex       3.754266
Twnhs        3.607996
TwoFmCon     2.437835
#Roof_Matl <------------------------------- Removed
CompShg    98.586056
Tar&Grv     0.682594
WdShake     0.341297
WdShngl     0.243784
Roll        0.048757
ClyTile     0.048757
Membran     0.048757
#Bsmt_Cond
Typical        89.907362
Good            3.949293
Fair            3.266699
No_Basement     2.535349
Poor            0.195027
Excellent       0.146270
#BsmtFin_Type_2
Unf            85.811799
Rec             3.266699
LwQ             3.120429
No_Basement     2.584105
ALQ             1.950268
BLQ             1.901511
GLQ             1.365188
#BsmtFin_SF_2
0      88.395904
180     0.243784
294     0.195027
147     0.146270
435     0.146270
         ...    
139     0.048757
125     0.048757
123     0.048757
121     0.048757
764     0.048757
#Heating <------------------------- Removed
GasA     98.391029
GasW      1.023891
Grav      0.292540
Wall      0.195027
OthW      0.048757
Floor     0.048757
#Central_Air
Y    93.515358
N     6.484642
#Electrical
SBrkr      91.565090
FuseA       6.289615
FuseF       1.755241
FuseP       0.341297
Unknown     0.048757
#Low_Qual_Fin_SF <------------------------- Removed
0       98.732326
80       0.195027
481      0.048757
108      0.048757
120      0.048757
140      0.048757
144      0.048757
232      0.048757
234      0.048757
312      0.048757
360      0.048757
362      0.048757
697      0.048757
390      0.048757
392      0.048757
420      0.048757
450      0.048757
512      0.048757
514      0.048757
528      0.048757
1064     0.048757
259      0.048757
397      0.048757
384      0.048757
#Bsmt_Half_Bath <------------------------- Maybe remove? (0 is ~94%)
0    94.051682
1     5.850804
2     0.097513
#Kitchen_AbvGr <------------------------- Maybe remove? (1 is ~95%)
1    95.270600
2     4.583130
3     0.097513
0     0.048757
#Functional
Typ     92.979035
Min2     2.389078
Min1     2.242808
Mod      1.365188
Maj1     0.780107
Maj2     0.195027
Sev      0.048757
#Garage_Qual
Typical      89.468552
No_Garage     4.973184
Fair          4.534373
Good          0.731351
Poor          0.195027
Excellent     0.097513
#Garage_Cond
Typical      91.077523
No_Garage     4.973184
Fair          2.730375
Good          0.585080
Poor          0.487567
Excellent     0.146270
#Paved_Drive <------------------------- Maybe remove? (Paved is ~91%)
Paved               91.077523
Dirt_Gravel          7.313506
Partial_Pavement     1.608971
#Three_season_porch <------------------------- Maybe remove? (0 is ~99%)
0      98.927353
180     0.048757
86      0.048757
120     0.048757
140     0.048757
144     0.048757
150     0.048757
162     0.048757
168     0.048757
174     0.048757
176     0.048757
255     0.048757
245     0.048757
238     0.048757
290     0.048757
304     0.048757
320     0.048757
508     0.048757
23      0.048757
153     0.048757
219     0.048757
225     0.048757
224     0.048757
#Screen_Porch
0      91.662604
144     0.390054
200     0.341297
192     0.341297
120     0.292540
         ...    
163     0.048757
184     0.048757
165     0.048757
171     0.048757
385     0.048757
#Pool_Area <----------------------------------- Removed
0      99.512433
561     0.048757
555     0.048757
519     0.048757
800     0.048757
648     0.048757
512     0.048757
480     0.048757
444     0.048757
228     0.048757
144     0.048757
#Pool_QC <----------------------------------- Removed
No_Pool      99.512433
Excellent     0.195027
Good          0.097513
Typical       0.097513
Fair          0.097513
#Misc_Feature <------------------------------ Removed
None    95.806923
Shed     3.656753
Gar2     0.243784
Othr     0.195027
TenC     0.048757
Elev     0.048757
#Misc_Val <------------------------------ Maybe remove? (0 is ~96%)
0        95.904437
400       0.487567
500       0.487567
450       0.390054
2000      0.341297
700       0.341297
600       0.292540
1500      0.146270
1200      0.146270
650       0.146270
2500      0.097513
3000      0.097513
4500      0.097513
480       0.097513
6500      0.048757
80        0.048757
8300      0.048757
12500     0.048757
350       0.048757
455       0.048757
460       0.048757
17000     0.048757
560       0.048757
1512      0.048757
620       0.048757
15500     0.048757
750       0.048757
900       0.048757
1000      0.048757
1150      0.048757
1300      0.048757
1400      0.048757
3500      0.048757


In [5]:
#------------------------------------------------------------------------
#
# Shrinking Methods (Lasso)
#
#------------------------------------------------------------------------
# For every candidate penalty the model is fitted on each K-fold training
# part, the held-out fold is predicted (out-of-fold predictions), and the
# test set is predicted by every fold model and averaged across folds.

# NOTE(review): the column names below were listed without explanation;
# presumably they are features flagged for a closer look -- confirm.
#Lot_Area
#BsmtFin_Type_1
#BsmtFin_SF_1
#BsmtFin_Type_2
#BsmtFin_SF_2
#Bsmt_Unf_SF
#Total_Bsmt_SF
#Year_Built
#Year_Remod_Add
#Low_Qual_Fin_SF
#Year_Sold
#Street - Only 2 values (0,1)
#Utilities - Only 2 values (0,1)
#
drop_columns = ['PID','Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 
                'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude',
                'Latitude'
                ]
x_train_transformed_final = x_train_transformed.drop(drop_columns,axis = 1)
x_test_transformed_final = x_test_transformed.drop(drop_columns,axis = 1)

split_ = 10
alpha_ = np.array([0.00001,0.0001,0.0002,0.0003,0.001,0.002,0.003,0.01,0.02])

kf = KFold(n_splits=split_)

# One column per candidate penalty; one row per observation.
y_train_actual_lamda = np.zeros((x_train_transformed_final.shape[0],alpha_.shape[0]))
y_train_predict_lamda = np.zeros((x_train_transformed_final.shape[0],alpha_.shape[0]))
y_test_predict_lamda = np.zeros((x_test_transformed_final.shape[0],alpha_.shape[0]))

for lamda_count_, lamda in enumerate(alpha_):
    print ("Processing for lamda:{}".format(lamda))

    # Test-set predictions of each fold model, averaged after the fold loop.
    y_test_predict_kfold = np.zeros((x_test_transformed_final.shape[0],split_))

    for fold_, (train_idx,test_idx) in enumerate(kf.split(x_train_transformed_final,y_train)):
        # Named lasso_fit so the lasso_model() helper defined earlier in the
        # notebook is not overwritten by an estimator instance.
        lasso_fit = Lasso(alpha=lamda)
        lasso_fit.fit(x_train_transformed_final.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = lasso_fit.predict(x_train_transformed_final.iloc[test_idx])
        y_test_predict_kfold[:,fold_] = lasso_fit.predict(x_test_transformed_final)

        # Store out-of-fold values at their own row positions so they stay
        # aligned with y_train regardless of the order folds are produced in.
        y_train_actual_lamda[test_idx,lamda_count_] = y_train.iloc[test_idx]
        y_train_predict_lamda[test_idx,lamda_count_] = y_train_predict

        # RMSE on the held-out fold. np.sqrt(MSE) replaces the `squared=False`
        # argument, which newer scikit-learn releases deprecate and remove.
        fold_rmse = np.sqrt(mean_squared_error(y_train.iloc[test_idx], y_train_predict))
        print ("Lamda: {} - Fold: {} - Error Test: {:.3f}".format(lamda,fold_+1,fold_rmse))

    y_test_predict_lamda[:,lamda_count_] = np.mean(y_test_predict_kfold,axis=1)

    # "Error Train" is the RMSE of the out-of-fold predictions over the whole
    # training set; "Error Test" is the RMSE of the fold-averaged predictions.
    # NOTE(review): choosing lamda from the test error would leak test
    # information into model selection -- prefer the out-of-fold error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict_lamda[:,lamda_count_]))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predict_lamda[:,lamda_count_]))
    print ("Lamda: {} - Error Train: {:.3f}".format(lamda,train_rmse))
    print ("Lamda: {} - Error Test : {:.3f}".format(lamda,test_rmse))
    

Processing for lamda:1e-05
Lamda: 1e-05 - Fold: 1 - Error Test: 0.196
Lamda: 1e-05 - Fold: 2 - Error Test: 0.142
Lamda: 1e-05 - Fold: 3 - Error Test: 0.168
Lamda: 1e-05 - Fold: 4 - Error Test: 0.122
Lamda: 1e-05 - Fold: 5 - Error Test: 0.158
Lamda: 1e-05 - Fold: 6 - Error Test: 0.245
Lamda: 1e-05 - Fold: 7 - Error Test: 0.152
Lamda: 1e-05 - Fold: 8 - Error Test: 0.209
Lamda: 1e-05 - Fold: 9 - Error Test: 0.117
Lamda: 1e-05 - Fold: 10 - Error Test: 0.173
Lamda: 1e-05 - Error Train: 0.172
Lamda: 1e-05 - Error Test : 0.153
Processing for lamda:0.0001
Lamda: 0.0001 - Fold: 1 - Error Test: 0.196
Lamda: 0.0001 - Fold: 2 - Error Test: 0.142
Lamda: 0.0001 - Fold: 3 - Error Test: 0.168
Lamda: 0.0001 - Fold: 4 - Error Test: 0.122
Lamda: 0.0001 - Fold: 5 - Error Test: 0.158
Lamda: 0.0001 - Fold: 6 - Error Test: 0.245
Lamda: 0.0001 - Fold: 7 - Error Test: 0.152
Lamda: 0.0001 - Fold: 8 - Error Test: 0.209
Lamda: 0.0001 - Fold: 9 - Error Test: 0.116
Lamda: 0.0001 - Fold: 10 - Error Test: 0.173
Lamda

In [None]:
#------------------------------------------------------------------------
#
# Random Forest Regressor
#
#------------------------------------------------------------------------
# For every candidate max_depth the forest is fitted on each K-fold training
# part, the held-out fold is predicted (out-of-fold predictions), and the
# test set is predicted by every fold model and averaged across folds.

# NOTE(review): the column names below were listed without explanation;
# presumably they are features flagged for a closer look -- confirm.
#Lot_Area
#BsmtFin_Type_1
#BsmtFin_SF_1
#BsmtFin_Type_2
#BsmtFin_SF_2
#Bsmt_Unf_SF
#Total_Bsmt_SF
#Year_Built
#Year_Remod_Add
#Low_Qual_Fin_SF
#Year_Sold
#Street - Only 2 values (0,1)
#Utilities - Only 2 values (0,1)
#
selected_columns = ['MS_SubClass','MS_Zoning','Lot_Frontage','Street','Alley','Lot_Shape',
                    'Land_Contour','Utilities','Lot_Config','Land_Slope','Neighborhood',
                    'Condition_1','Condition_2','Bldg_Type','House_Style','Overall_Qual',
                    'Overall_Cond','Roof_Style','Roof_Matl','Exterior_1st','Exterior_2nd',
                    'Mas_Vnr_Type','Mas_Vnr_Area','Exter_Qual','Exter_Cond','Foundation',
                    'Bsmt_Qual','Bsmt_Cond','Bsmt_Exposure','Total_Bsmt_SF','Heating',
                    'Heating_QC','Central_Air','Electrical','First_Flr_SF','Second_Flr_SF',
                    'Gr_Liv_Area','Bsmt_Full_Bath','Bsmt_Half_Bath','Full_Bath','TotRms_AbvGrd',
                    'Functional','Fireplaces','Fireplace_Qu','Garage_Type','Garage_Finish',
                    'Garage_Cars','Garage_Area','Garage_Qual','Garage_Cond','Paved_Drive',
                    'Wood_Deck_SF','Open_Porch_SF','Enclosed_Porch','Three_season_porch',
                    'Pool_Area', 'Pool_QC', 'Fence', 'Misc_Feature', 'Misc_Val', 'Mo_Sold',
                    'Sale_Type', 'Sale_Condition'
                   ]

# Select the model features once instead of re-slicing inside every fold.
x_train_selected = x_train_transformed[selected_columns]
x_test_selected = x_test_transformed[selected_columns]

split_ = 10
max_depth_ = np.array([16,18,24,28,30,38,50,60,70,90,120])

kf = KFold(n_splits=split_)

# One column per candidate depth; one row per observation.
y_train_actual_max_depth = np.zeros((x_train_selected.shape[0],max_depth_.shape[0]))
y_train_predict_max_depth = np.zeros((x_train_selected.shape[0],max_depth_.shape[0]))
y_test_predict_max_depth = np.zeros((x_test_selected.shape[0],max_depth_.shape[0]))

for max_depth_count_, depth in enumerate(max_depth_):
    print ("Processing for max_depth:{}".format(depth))

    # Test-set predictions of each fold model, averaged after the fold loop.
    y_test_predict_kfold = np.zeros((x_test_selected.shape[0],split_))

    for fold_, (train_idx,test_idx) in enumerate(kf.split(x_train_selected,y_train)):
        # Fixed random_state keeps the forest reproducible between runs.
        rf_model = RandomForestRegressor(max_depth=depth,random_state=125247)
        rf_model.fit(x_train_selected.iloc[train_idx],y_train.iloc[train_idx])
        y_train_predict = rf_model.predict(x_train_selected.iloc[test_idx])
        y_test_predict_kfold[:,fold_] = rf_model.predict(x_test_selected)

        # Store out-of-fold values at their own row positions so they stay
        # aligned with y_train regardless of the order folds are produced in.
        y_train_actual_max_depth[test_idx,max_depth_count_] = y_train.iloc[test_idx]
        y_train_predict_max_depth[test_idx,max_depth_count_] = y_train_predict

        # RMSE on the held-out fold. np.sqrt(MSE) replaces the `squared=False`
        # argument, which newer scikit-learn releases deprecate and remove.
        fold_rmse = np.sqrt(mean_squared_error(y_train.iloc[test_idx], y_train_predict))
        print ("Max_Depth: {} - Fold: {} - Error Test: {:.3f}".format(depth,fold_+1,fold_rmse))

    y_test_predict_max_depth[:,max_depth_count_] = np.mean(y_test_predict_kfold,axis=1)

    # "Error Train" is the RMSE of the out-of-fold predictions over the whole
    # training set; "Error Test" is the RMSE of the fold-averaged predictions.
    # NOTE(review): choosing max_depth from the test error would leak test
    # information into model selection -- prefer the out-of-fold error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict_max_depth[:,max_depth_count_]))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predict_max_depth[:,max_depth_count_]))
    print ("Max Depth: {} - Error Train: {:.3f}".format(depth,train_rmse))
    print ("Max Depth: {} - Error Test : {:.3f}".format(depth,test_rmse))
    

In [95]:
# NOTE(review): `arr` is not referenced by any other visible cell; it looks
# like leftover scratch work -- confirm before removing.
arr = [20, 2, 7, 1, 34] 

In [105]:
# 95th percentile of Screen_Porch over the full data set (train and test rows).
np.quantile(data['Screen_Porch'],0.95)

161.0

In [102]:
# 95th percentile of Screen_Porch over the transformed training features only.
np.quantile(x_train_transformed['Screen_Porch'],0.95)

160.0