In [314]:
# import needed libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import math

In [315]:
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")

In [316]:
# train data overview

train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [317]:
len(train_data)

1460

In [318]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [319]:
len(test_data)

1459

In [320]:
# train_extra_col = []
# for col in train_data.columns:
#     if col not in test_data.columns:
#         train_extra_col.append(col)

# test_extra_col = []
# for col in test_data.columns:
#     if col not in train_data.columns:
#         test_extra_col.append(col)
        
# test_data = test_data.drop(columns=test_extra_col, axis=1)
# train_data = train_data.drop(columns=train_extra_col, axis=1)

## Missing values

In [321]:
# filling missing value (Numerical and Categorical)

for col in train_data.columns:
    if train_data[col].dtype in [np.float64, np.int64]:
        sip = SimpleImputer(missing_values=np.nan, strategy='mean')
        train_data[col] = sip.fit_transform(train_data[col].values.reshape(-1, 1))[:, 0]
    else:
        sip = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Not_specified")
        train_data[col] = sip.fit_transform(train_data[col].values.reshape(-1, 1))[:, 0]

for col in test_data.columns:
    if test_data[col].dtype in [np.float64, np.int64]:
        sip = SimpleImputer(missing_values=np.nan, strategy='mean')
        test_data[col] = sip.fit_transform(test_data[col].values.reshape(-1, 1))[:, 0]
    else:
        sip = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Not_specified")
        test_data[col] = sip.fit_transform(test_data[col].values.reshape(-1, 1))[:, 0]

## Feature and label seperation

In [322]:
# seperate features and outcome (SalePrice)

X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data

## Cardinality and Dimension reduction

In [323]:
# get each column cardinality (except the last column "SalePrice")

high_cardinality_columns = []
cardinality_result = pd.DataFrame(columns=X_test.columns)
row = {}
# cardinality divided by total number of records
cardinality_threshold = 0.95
for col in X_train.columns:
    card = len(set(X_train[col]))
    row[col] = card / len(X_train)
    if card / len(X_train) > cardinality_threshold:
        high_cardinality_columns.append(col)

cardinality_result = cardinality_result.append(row, ignore_index=True)
cardinality_result.drop(high_cardinality_columns, axis=1, inplace=True)
cardinality_result

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.010274,0.003425,0.076027,0.734932,0.00137,0.002055,0.00274,0.00274,0.00137,0.003425,...,0.052055,0.005479,0.00274,0.003425,0.003425,0.014384,0.008219,0.003425,0.006164,0.00411


In [324]:
print(high_cardinality_columns)

['Id']


In [325]:
# we romve the features with high cardinality
X_train.drop(high_cardinality_columns, axis=1, inplace=True)
X_test.drop(high_cardinality_columns, axis=1, inplace=True)

In [326]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60.0,RL,65.0,8450.0,Pave,Not_specified,Reg,Lvl,AllPub,Inside,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,2.0,2008.0,WD,Normal
1,20.0,RL,80.0,9600.0,Pave,Not_specified,Reg,Lvl,AllPub,FR2,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,5.0,2007.0,WD,Normal
2,60.0,RL,68.0,11250.0,Pave,Not_specified,IR1,Lvl,AllPub,Inside,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,9.0,2008.0,WD,Normal
3,70.0,RL,60.0,9550.0,Pave,Not_specified,IR1,Lvl,AllPub,Corner,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,2.0,2006.0,WD,Abnorml
4,60.0,RL,84.0,14260.0,Pave,Not_specified,IR1,Lvl,AllPub,FR2,...,0.0,0.0,Not_specified,Not_specified,Not_specified,0.0,12.0,2008.0,WD,Normal


## Mutual Information

In [327]:
# we calculate the mutual information score between features and outcome (salePrice)

output = y_train.values.reshape(-1, 1)
feature_to_drop = []
mi_res = {}
dropping_threshold = 0.05
for col in X_train.columns:
    if X_train[col].dtype in [np.float64, np.int64]:
        print(col)
        print(X_train[col].dtype)
        mi_res[col] = mutual_info_regression(output, X_train[col], random_state=0)[0]
    else:
        f_factorized,_ = X_train[col].factorize()
        mi_res[col] = mutual_info_regression(output, f_factorized, random_state=0)[0]
        
    if mi_res[col] < dropping_threshold:
            feature_to_drop.append(col)

feature_to_drop

MSSubClass
float64
LotFrontage
float64
LotArea
float64
OverallQual
float64
OverallCond
float64
YearBuilt
float64
YearRemodAdd
float64
MasVnrArea
float64
BsmtFinSF1
float64
BsmtFinSF2
float64
BsmtUnfSF
float64
TotalBsmtSF
float64
1stFlrSF
float64
2ndFlrSF
float64
LowQualFinSF
float64
GrLivArea
float64
BsmtFullBath
float64
BsmtHalfBath
float64
FullBath
float64
HalfBath
float64
BedroomAbvGr
float64
KitchenAbvGr
float64
TotRmsAbvGrd
float64
Fireplaces
float64
GarageYrBlt
float64
GarageCars
float64
GarageArea
float64
WoodDeckSF
float64
OpenPorchSF
float64
EnclosedPorch
float64
3SsnPorch
float64
ScreenPorch
float64
PoolArea
float64
MiscVal
float64
MoSold
float64
YrSold
float64


['Street',
 'Alley',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'RoofStyle',
 'RoofMatl',
 'ExterCond',
 'BsmtCond',
 'BsmtFinType2',
 'BsmtFinSF2',
 'Heating',
 'Electrical',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'KitchenAbvGr',
 'Functional',
 'PavedDrive',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold']

In [328]:
X_train.drop(feature_to_drop, axis=1, inplace=True)
X_test.drop(feature_to_drop, axis=1, inplace=True)
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,Neighborhood,HouseStyle,OverallQual,OverallCond,YearBuilt,...,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition
0,60.0,RL,65.0,8450.0,Reg,CollgCr,2Story,7.0,5.0,2003.0,...,2003.0,RFn,2.0,548.0,TA,TA,0.0,61.0,WD,Normal
1,20.0,RL,80.0,9600.0,Reg,Veenker,1Story,6.0,8.0,1976.0,...,1976.0,RFn,2.0,460.0,TA,TA,298.0,0.0,WD,Normal
2,60.0,RL,68.0,11250.0,IR1,CollgCr,2Story,7.0,5.0,2001.0,...,2001.0,RFn,2.0,608.0,TA,TA,0.0,42.0,WD,Normal
3,70.0,RL,60.0,9550.0,IR1,Crawfor,2Story,7.0,5.0,1915.0,...,1998.0,Unf,3.0,642.0,TA,TA,0.0,35.0,WD,Abnorml
4,60.0,RL,84.0,14260.0,IR1,NoRidge,2Story,8.0,5.0,2000.0,...,2000.0,RFn,3.0,836.0,TA,TA,192.0,84.0,WD,Normal


## Outliers

In [329]:
X_train.shape

(1460, 46)

In [330]:
rows_to_drop = []
for col in X_train.columns:
    if X_train[col].dtype not in [np.float64, np.int64]:
        continue
    z_scores = stats.zscore(X_train[col])
    for i in range(len(z_scores)):
        if abs(z_scores[i]) > 3:
            rows_to_drop.append(i)

rows_to_drop = list(set(rows_to_drop))
X_train = X_train.drop(rows_to_drop)
y_train = y_train.drop(rows_to_drop)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train.shape

(1260, 46)

## Encoding

In [331]:
categorical_indexes = []
for col in X_train.columns:
    if X_train[col].dtype not in [np.float64, np.int64]:
        categorical_indexes.append(list(X_train.columns).index(col))

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_indexes)], remainder='passthrough')

In [332]:
X_train = X_train.iloc[:, :].values
# y_train = y_train.iloc[:, -1].values
X_test = X_test.iloc[:, :].values
X_train = ct.fit_transform(X_train)
X_test = ct.fit_transform(X_test)

# Multi Linear Regression

In [337]:
mlr = LinearRegression()
mlr.fit(X_train, y_train)
y_pred_mlr = mlr.predict(X_test)
submission = pd.DataFrame(columns=['id', 'SalePrice'])
test_data = pd.read_csv("test.csv")
for i in range(len(y_pred_mlr)):
    row = {}
    row['id'] = (test_data.iloc[i]['Id'])
    row['SalePrice'] = y_pred_mlr[i]    
    submission = submission.append(row, ignore_index=True)

submission
submission.to_csv('submssion1.csv', index=False)