In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
def onehot_encoding(x_train_input, x_test_input, drop_columns=None):
    """Drop uninformative columns and one-hot encode the categorical ones.

    Train and test are stacked before encoding so that both halves end up
    with exactly the same dummy columns, then split back by row position.
    The input frames are not modified.

    Parameters
    ----------
    x_train_input : pandas.DataFrame
        Training features.
    x_test_input : pandas.DataFrame
        Test features with the same columns as ``x_train_input``.
    drop_columns : list of str, optional
        Columns removed before encoding. Defaults to the list the original
        author flagged as highly imbalanced (plus ``Garage_Yr_Blt``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames. Rows keep their original index labels;
        dummy columns are named ``<column>_<level>`` and the first level of
        every categorical column is omitted (``drop_first=True``).

    Raises
    ------
    KeyError
        If any entry of ``drop_columns`` is missing from the stacked frame.
    """
    if drop_columns is None:
        # Columns judged too imbalanced to be useful. 'Garage_Yr_Blt' is
        # dropped as well, so no missing-value imputation is needed for it.
        drop_columns = ['Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating',
                        'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude',
                        'Latitude', 'Land_Slope', 'Bsmt_Half_Bath', 'Three_season_porch', 'Misc_Val',
                        'Garage_Yr_Blt']

    train_num = x_train_input.shape[0]

    # pd.concat returns a new frame, so the caller's inputs stay untouched.
    df_train_test = pd.concat([x_train_input, x_test_input])
    df_train_test = df_train_test.drop(columns=list(drop_columns))

    # One call encodes every object/string/category column: the remaining
    # columns come first, followed by the dummy columns in source order.
    df_encoded = pd.get_dummies(df_train_test, drop_first=True)

    # Split back by position; copies keep the two halves independent of each
    # other and of the stacked frame.
    x_train_return = df_encoded.iloc[:train_num].copy()
    x_test_return = df_encoded.iloc[train_num:].copy()

    return x_train_return, x_test_return


def winsorization(x_train_input, x_test_input, columns=None, quantile=0.95):
    """Cap the upper tail of selected numeric columns (one-sided winsorization).

    For every column the cap is the ``quantile`` of the *training* values;
    values above the cap are replaced by the cap in both train and test, so
    the test set is treated with thresholds learned from train only.

    The work is done on copies with ``Series.clip`` rather than chained
    assignment (``df[col][mask] = value``), which pandas does not guarantee
    to write through to the frame and which is a no-op under Copy-on-Write.

    Parameters
    ----------
    x_train_input : pandas.DataFrame
        Training features; must contain every column being capped.
    x_test_input : pandas.DataFrame
        Test features; must contain every column being capped.
    columns : list of str, optional
        Columns to cap. Defaults to the original hard-coded list.
    quantile : float, optional
        Quantile in [0, 1] used as the upper cap (default 0.95).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Capped copies of the train and test frames; the inputs are left
        unchanged.

    Raises
    ------
    KeyError
        If a requested column is missing from either frame.
    """
    if columns is None:
        # "Three_season_porch" and "Misc_Val" are purposefully left out.
        columns = ['Lot_Frontage', 'Lot_Area', 'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
                   'Total_Bsmt_SF', 'Second_Flr_SF', 'First_Flr_SF', 'Gr_Liv_Area', 'Garage_Area',
                   'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch',
                   'Screen_Porch', 'Mo_Sold']

    x_train_output = x_train_input.copy()
    x_test_output = x_test_input.copy()

    for col_name in columns:
        # The threshold always comes from the training data.
        cap_value = np.quantile(x_train_input[col_name], quantile)
        x_train_output[col_name] = x_train_output[col_name].clip(upper=cap_value)
        x_test_output[col_name] = x_test_output[col_name].clip(upper=cap_value)

    return x_train_output, x_test_output

#------------------------------------------------------------------------
#
# Shrinkage Methods (Lasso)
#
#------------------------------------------------------------------------
def lasso_model(x_train_lasso,y_train_lasso,x_test_lasso,alpha,print_ind=False):
    """Fit a Lasso regression and return price predictions for the test set.

    The 'PID' column is treated as an identifier: it is excluded from the
    features and carried through to the output. Predictions are passed
    through ``np.exp`` before being reported, so ``y_train_lasso`` is expected
    to be on the log scale.

    Parameters
    ----------
    x_train_lasso : pandas.DataFrame
        Training features, including a 'PID' column.
    y_train_lasso : array-like
        Training target (log of the sale price).
    x_test_lasso : pandas.DataFrame
        Test features, including a 'PID' column.
    alpha : float
        Regularisation strength passed to ``Lasso``.
    print_ind : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns 'PID' and 'Sale_Price' (rounded to one decimal), one row per
        test row, indexed like ``x_test_lasso``.
    """
    train_features = x_train_lasso.drop(columns=['PID'])
    test_features = x_test_lasso.drop(columns=['PID'])

    regressor = Lasso(alpha=alpha)
    regressor.fit(train_features, y_train_lasso)
    log_price_pred = regressor.predict(test_features)

    # Build the frame from plain arrays so PIDs and predictions are paired by
    # row position. Mixing the PID Series with a fresh RangeIndex Series would
    # make pandas align on index labels and scramble / NaN-fill rows whenever
    # the test frame is not indexed 0..n-1.
    df_submission = pd.DataFrame(
        {
            'PID': x_test_lasso['PID'].to_numpy(),
            'Sale_Price': np.round(np.exp(log_price_pred), 1),
        },
        index=x_test_lasso.index,
    )

    return df_submission

# ---- Load the raw train / test splits ----
train = pd.read_csv("train.csv")
x_test = pd.read_csv("test.csv")

# The model is fitted on the log of the sale price; the remaining columns
# of the training file are the features.
x_train = train.drop(columns=['Sale_Price'])
y_train = np.log(train['Sale_Price'])

# ---- Pre-processing: one-hot encode, then cap extreme values ----
x_train_onehot, x_test_onehot = onehot_encoding(x_train, x_test)
x_train_final, x_test_final = winsorization(x_train_onehot, x_test_onehot)

# ---- Model 1: Lasso, one fit per regularisation strength ----
alpha = [0.00001]
for a in alpha:
    df_submission_lasso = lasso_model(x_train_final, y_train, x_test_final, a, print_ind=True)

    # Persist the predictions as the submission file.
    df_submission_lasso.to_csv("mysubmission1.txt", index=False)

    # Score the submission exactly as written to disk: read it back and
    # compare against the reference prices on the log scale.
    y_test = pd.read_csv("y_test.csv")
    y_predict_lasso = pd.read_csv("mysubmission1.txt")

    log_error = np.log(y_predict_lasso['Sale_Price']) - np.log(y_test['Sale_Price'])
    rmse_lasso = np.sqrt(np.mean(log_error**2))
    print("RMSE Lasso:{:.3f}".format(rmse_lasso))

KeyError: 'Low_Qual_Fin_SF'

In [None]:
#x_train.dtypes[x_train.dtypes == 'int64']

In [None]:
#x_train['Bedroom_AbvGr'].max()

In [None]:
#Folder 1  - RMSE Lasso:0.124 - RMSE Xgboost:0.126
#Folder 2  - RMSE Lasso:0.114 - RMSE Xgboost:0.114
#Folder 3  - RMSE Lasso:0.126 - RMSE Xgboost:0.126
#Folder 4  - RMSE Lasso:0.132 - RMSE Xgboost:0.132
#Folder 5  - RMSE Lasso:0.131 - RMSE Xgboost:0.131
#Folder 6  - RMSE Lasso:0.124 - RMSE Xgboost:0.124
#Folder 7  - RMSE Lasso:0.114 - RMSE Xgboost:0.114
#Folder 8  - RMSE Lasso:0.126 - RMSE Xgboost:0.126
#Folder 9  - RMSE Lasso:0.132 - RMSE Xgboost:0.132
#Folder 10 - RMSE Lasso:0.131 - RMSE Xgboost:0.131