In [830]:
# import needed libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import math
from sklearn.ensemble import RandomForestRegressor
from collections import Counter
from scipy.stats import pearsonr

# 1. Pre-processing

**Method used**
- Filling missing values
- Cardinality check
- Feauture selection with mutual regression
- Removing ouliers
- Encoding

## 1.1 Data loading and preview

In [831]:
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")

In [832]:
# train data overview

train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [833]:
train_data.shape

(1460, 81)

In [834]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [835]:
test_data.shape

(1459, 80)

## 1.2 Handle Missing values

In [836]:
# filling missing value (Numerical and Categorical)

for col in train_data.columns:
    if train_data[col].dtype in [np.float64, np.int64]:
        sip = SimpleImputer(missing_values=np.nan, strategy='mean')
        train_data[col] = sip.fit_transform(train_data[col].values.reshape(-1, 1))[:, 0]
    else:
        sip = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Not_specified")
        train_data[col] = sip.fit_transform(train_data[col].values.reshape(-1, 1))[:, 0]

for col in test_data.columns:
    if test_data[col].dtype in [np.float64, np.int64]:
        sip = SimpleImputer(missing_values=np.nan, strategy='mean')
        test_data[col] = sip.fit_transform(test_data[col].values.reshape(-1, 1))[:, 0]
    else:
        sip = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Not_specified")
        test_data[col] = sip.fit_transform(test_data[col].values.reshape(-1, 1))[:, 0]

## 1.3 Feature and ouput seperation

In [837]:
# seperate features and outcome (SalePrice)

X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data

## 1.4 Cardinality and Dimension reduction

In [838]:
# get each column cardinality (except the last column "SalePrice")

# we romve Id column first
X_train.drop(['Id'], axis=1, inplace=True)
X_test.drop(['Id'], axis=1, inplace=True)

high_cardinality_columns = []
cardinality_result = pd.DataFrame(columns=X_test.columns)
row = {}
# cardinality divided by total number of records
cardinality_threshold = 100
for col in X_train.columns:
    card = len(set(X_train[col]))
    row[col] = card
    if card > cardinality_threshold and X_train[col].dtype not in [np.float64, np.int64]:
        high_cardinality_columns.append(col)

cardinality_result = cardinality_result.append(row, ignore_index=True)
cardinality_result.drop(high_cardinality_columns, axis=1, inplace=True)
cardinality_result

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,15,5,111,1073,2,3,4,4,2,5,...,76,8,4,5,5,21,12,5,9,6


In [839]:
print(high_cardinality_columns)

[]


In [840]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60.0,RL,65.0,8450.0,Pave,Not_specified,Reg,Lvl,AllPub,Inside,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,2.0,2008.0,WD,Normal
1,20.0,RL,80.0,9600.0,Pave,Not_specified,Reg,Lvl,AllPub,FR2,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,5.0,2007.0,WD,Normal
2,60.0,RL,68.0,11250.0,Pave,Not_specified,IR1,Lvl,AllPub,Inside,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,9.0,2008.0,WD,Normal
3,70.0,RL,60.0,9550.0,Pave,Not_specified,IR1,Lvl,AllPub,Corner,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,2.0,2006.0,WD,Abnorml
4,60.0,RL,84.0,14260.0,Pave,Not_specified,IR1,Lvl,AllPub,FR2,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,12.0,2008.0,WD,Normal


## 1.5 Feature selection with Mutual Information

In [841]:
# we calculate the mutual information score between features and outcome (salePrice)

output = y_train.values.reshape(-1, 1)
feature_to_drop = []
mi_res = {}
dropping_threshold = 0.05
for col in X_train.columns:
    if X_train[col].dtype in [np.float64, np.int64]:
        mi_res[col] = mutual_info_regression(output, X_train[col], random_state=0)[0]
    else:
        f_factorized,_ = X_train[col].factorize()
        mi_res[col] = mutual_info_regression(output, f_factorized, random_state=0)[0]
        
    if mi_res[col] < dropping_threshold:
            feature_to_drop.append(col)

In [842]:
X_train.drop(feature_to_drop, axis=1, inplace=True)
X_test.drop(feature_to_drop, axis=1, inplace=True)
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,Neighborhood,HouseStyle,OverallQual,OverallCond,YearBuilt,...,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition
0,60.0,RL,65.0,8450.0,Reg,CollgCr,2Story,7.0,5.0,2003.0,...,2003.0,RFn,2.0,548.0,TA,TA,0.0,61.0,WD,Normal
1,20.0,RL,80.0,9600.0,Reg,Veenker,1Story,6.0,8.0,1976.0,...,1976.0,RFn,2.0,460.0,TA,TA,298.0,0.0,WD,Normal
2,60.0,RL,68.0,11250.0,IR1,CollgCr,2Story,7.0,5.0,2001.0,...,2001.0,RFn,2.0,608.0,TA,TA,0.0,42.0,WD,Normal
3,70.0,RL,60.0,9550.0,IR1,Crawfor,2Story,7.0,5.0,1915.0,...,1998.0,Unf,3.0,642.0,TA,TA,0.0,35.0,WD,Abnorml
4,60.0,RL,84.0,14260.0,IR1,NoRidge,2Story,8.0,5.0,2000.0,...,2000.0,RFn,3.0,836.0,TA,TA,192.0,84.0,WD,Normal


In [843]:
X_train.shape

(1460, 46)

## 1.6 Feature Selection with Pearson Correlation

In [844]:
# pearson correlation feature select

y = list(y_train)
p_value_threhold = 0.05
min_corr_score = 0.25
column_to_drop = []
for col in X_train.columns:
    if X_train[col].dtype not in [np.float64, np.int64]:
        col_values,_ = X_train[col].factorize()
    else:
        col_values = list(X_train[col].values)
    corr = pearsonr(col_values, y)
    if corr[1] > p_value_threhold or abs(corr[0]) < min_corr_score:
        column_to_drop.append(col)

column_to_drop

['MSSubClass',
 'MSZoning',
 'Neighborhood',
 'HouseStyle',
 'OverallCond',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'BsmtUnfSF',
 'BedroomAbvGr',
 'KitchenQual',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'SaleType',
 'SaleCondition']

In [845]:
X_train.drop(column_to_drop, axis=1, inplace=True)
X_test.drop(column_to_drop, axis=1, inplace=True)
X_train.head()

Unnamed: 0,LotFrontage,LotArea,LotShape,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,Foundation,BsmtFinType1,...,HalfBath,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF
0,65.0,8450.0,Reg,7.0,2003.0,2003.0,196.0,Gd,PConc,GLQ,...,1.0,8.0,0.0,Not_specified,Attchd,2003.0,2.0,548.0,0.0,61.0
1,80.0,9600.0,Reg,6.0,1976.0,1976.0,0.0,TA,CBlock,ALQ,...,0.0,6.0,1.0,TA,Attchd,1976.0,2.0,460.0,298.0,0.0
2,68.0,11250.0,IR1,7.0,2001.0,2002.0,162.0,Gd,PConc,GLQ,...,1.0,6.0,1.0,TA,Attchd,2001.0,2.0,608.0,0.0,42.0
3,60.0,9550.0,IR1,7.0,1915.0,1970.0,0.0,TA,BrkTil,ALQ,...,0.0,7.0,1.0,Gd,Detchd,1998.0,3.0,642.0,0.0,35.0
4,84.0,14260.0,IR1,8.0,2000.0,2000.0,350.0,Gd,PConc,GLQ,...,1.0,9.0,1.0,TA,Attchd,2000.0,3.0,836.0,192.0,84.0


In [846]:
X_train.shape

(1460, 28)

## 1.7 Outlier detection

In [847]:
rows_to_drop = []
for col in X_train.columns:
    if X_train[col].dtype not in [np.float64, np.int64]:
        continue
    z_scores = stats.zscore(X_train[col])
    for i in range(len(z_scores)):
        if abs(z_scores[i]) > 3:
            rows_to_drop.append(i)

rows_to_drop = list(set(rows_to_drop))
X_train = X_train.drop(rows_to_drop)
y_train = y_train.drop(rows_to_drop)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [848]:
X_train.shape

(1320, 28)

## 1.8 Encoding

In [849]:
categorical_indexes = []
for col in X_train.columns:
    if X_train[col].dtype not in [np.float64, np.int64]:
        categorical_indexes.append(list(X_train.columns).index(col))

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_indexes)], remainder='passthrough')

In [850]:
X_train = X_train.iloc[:, :].values
X_test = X_test.iloc[:, :].values
X_train = ct.fit_transform(X_train)
X_test = ct.fit_transform(X_test)

# 2. Random Forest Regression

In [851]:
rf_regressor = RandomForestRegressor(n_estimators=4000, random_state=0)
rf_regressor.fit(X_train, y_train)
y_pred_rf = rf_regressor.predict(X_test)
submission = pd.DataFrame(columns=['id', 'SalePrice'])
test_data = pd.read_csv("test.csv")
for i in range(len(y_pred_mlr)):
    row = {}
    row['id'] = int(test_data.iloc[i]['Id'])
    row['SalePrice'] = y_pred_rf[i]    
    submission = submission.append(row, ignore_index=True)

submission['id'] = submission['id'].astype(int)
submission['SalePrice'] = submission['SalePrice'].astype(float)
submission.to_csv('submssion_random_forest.csv', index=False)