## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor

## Importing and Reading Data as DataFrame

In [2]:
df = pd.read_csv("../data/train.csv")
df.shape

(1460, 81)

### Removing ID column

In [3]:
print(df.columns)
df = df.drop(['Id'], axis=1)
print('*****************************')
print(df.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### EDA

#### After EDA, We found out the list of numerical and categorical features as some of the categorical features were labelled as Numerical.
#### List of columns categorised as numerical but have cardinal features
#### ["MSSubClass",  "OverallQual", "OverallCond", "BstFullBath", "BstHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "MoSold"]
#### TimeSeries columns : ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "YrSold"]
#### MasVnrArea has around 60 % of values as Zero.
#### BsmtFinSF1 has 32% of values as Zero.
#### BsmtFinSF2 has 88% of values as Zero.
#### 2ndFlrSF has 56% of values as Zero.
#### LowQualFinSF has 98% of values as Zero.
#### WoodDeckSF has 52% of values as Zero.
#### OpenPorchSF has 45% of values as Zero.
#### EnclosedPorch has 85% of values as Zero.
#### 3SsnPorch has 98% of values as Zero.
#### ScreenPorch has 92% of values as Zero.
#### PoolArea has 99% of values as Zero.
#### MiscVal has 96% of values as Zero.
#### Alley column has 93% of missing data.
#### FireplaceQu has 47% of missing data.
#### PoolQC has 99% of missing data.
#### Fence has 80% of Missing data.
#### MiscFeature has 96% of missing data.

## Splitting data into Numerical and Categorical datapoints

### Numerical Features

In [4]:
numerical_data = df[['LotFrontage', 'LotArea', 
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'YrSold', 'SalePrice']]

### Categorical Features

In [5]:
categorical_data = df[['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition', "MSSubClass", "OverallQual", "OverallCond", 
        "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
        "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "MoSold"]]

## Feature Selection

### Dropping Columns with more than 40% Missing data

In [6]:
def missingDataCalculator(df):
    dropped_columns = []
    for cols in df.columns:
        missing_data_points = df[cols].isnull().sum()
        data_percentage = (missing_data_points/len(df))*100
    
        if (data_percentage) > 40:
            dropped_columns.append(cols)
        else:
            pass
    try:
        df = df.drop([dropped_columns], axis=1)
    except:
        pass
    print('DONE !...')
    return dropped_columns

#### Numerical

In [7]:
print(missingDataCalculator(numerical_data))

DONE !...
[]


In [8]:
numerical_data.columns

Index(['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageYrBlt', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold', 'SalePrice'],
      dtype='object')

In [9]:
target = numerical_data['SalePrice']
# numerical_data = numerical_data.drop(['SalePrice'], axis = 1)

#### Categorical

In [10]:
print(missingDataCalculator(categorical_data))

DONE !...
['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [11]:
categorical_data = categorical_data.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

In [12]:
categorical_data.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition', 'MSSubClass', 'OverallQual',
       'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'MoSold'],
      dtype='object')

### Checking Correlation of Numerical data

In [13]:
corr_matrix = numerical_data.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.5)]
to_drop.append('YearBuilt')
to_drop.remove('SalePrice')
print(to_drop)

['YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'GarageYrBlt', 'GarageArea', 'YearBuilt']


In [14]:
numerical_data.drop(to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data.drop(to_drop, axis=1, inplace=True)


### Merging Numerical and Categorical Data

In [15]:
df = pd.concat([numerical_data, categorical_data], axis=1)

In [16]:
df.shape

(1460, 68)

In [17]:
numerical_data

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,2ndFlrSF,LowQualFinSF,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,SalePrice
0,65.0,8450,196.0,706,0,150,854,0,0,61,0,0,0,0,0,2008,208500
1,80.0,9600,0.0,978,0,284,0,0,298,0,0,0,0,0,0,2007,181500
2,68.0,11250,162.0,486,0,434,866,0,0,42,0,0,0,0,0,2008,223500
3,60.0,9550,0.0,216,0,540,756,0,0,35,272,0,0,0,0,2006,140000
4,84.0,14260,350.0,655,0,490,1053,0,192,84,0,0,0,0,0,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,0.0,0,0,953,694,0,0,40,0,0,0,0,0,2007,175000
1456,85.0,13175,119.0,790,163,589,0,0,349,0,0,0,0,0,0,2010,210000
1457,66.0,9042,0.0,275,0,877,1152,0,0,60,0,0,0,0,2500,2010,266500
1458,68.0,9717,0.0,49,1029,0,0,0,366,0,112,0,0,0,0,2010,142125


### Saving data to a CSV

In [18]:
df.to_csv('../data/train_.csv', index=False)