In [1]:
# import the required libraries

import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# load the test dataset from 'test.csv' (path is relative to the working directory)
test_data = pd.read_csv('test.csv')
# display (rows, columns) of the freshly loaded frame
test_data.shape

(1459, 80)

In [8]:
# Names of the numerical columns in the raw test frame, in their original order.
numerical_features_columns = [
    "Id", "MSSubClass", "LotFrontage", "LotArea",
    "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
    "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath",
    "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
    "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea",
    "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "PoolArea", "MiscVal", "MoSold",
    "YrSold",
]

In [9]:
# Names of the categorical columns in the raw test frame, in their original order.
categorical_features_columns = [
    "MSZoning", "Street", "Alley", "LotShape",
    "LandContour", "Utilities", "LotConfig", "LandSlope",
    "Neighborhood", "Condition1", "Condition2", "BldgType",
    "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
    "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond",
    "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC",
    "CentralAir", "Electrical", "KitchenQual", "Functional",
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
    "GarageCond", "PavedDrive", "PoolQC", "Fence",
    "MiscFeature", "SaleType", "SaleCondition",
]

In [10]:
# Columns to discard from the test frame. Per the original author, these three
# lists come from the analysis done in the notebook for the training data.

# features flagged there as highly collinear with other features
high_multi_coll = [
    'GarageArea', 'GarageYrBlt', 'TotRmsAbvGrd',
    '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr',
    'BsmtFullBath', 'FullBath', 'GarageCars',
]

# features flagged there as weakly correlated
low_corr_features = [
    'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'Id', 'LowQualFinSF',
    'YrSold', '3SsnPorch', 'MoSold', 'OverallCond', 'MSSubClass',
    'PoolArea', 'ScreenPorch', 'EnclosedPorch', 'KitchenAbvGr',
    'BedroomAbvGr', 'BsmtUnfSF', 'BsmtFullBath', 'LotArea', 'HalfBath',
]

# features flagged there as mostly missing
high_missing_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']

# A set removes the names that appear in more than one list
# ('BedroomAbvGr' and 'BsmtFullBath').
features_to_drop = set().union(high_multi_coll, low_corr_features, high_missing_cols)
test_data.drop(columns=list(features_to_drop), inplace=True)
test_data.shape


(1459, 50)

In [11]:
# KNN imputation of the missing values in the two numeric columns below.

from sklearn.impute import KNNImputer

# the only columns handed to the imputer
cols_to_impute = ['LotFrontage', 'MasVnrArea']

# each missing entry is estimated from its 5 nearest neighbours
imputer = KNNImputer(n_neighbors=5)

# sub-frame restricted to the columns being imputed
test_data_to_impute = test_data[cols_to_impute]

# fit on that sub-frame and get the filled-in values back
imputed_cols = imputer.fit_transform(test_data_to_impute)

# overwrite the original columns with the imputed values
test_data[cols_to_impute] = imputed_cols

In [14]:
# Fill the missing values in the garage descriptor columns with the constant
# label 'No Garage'.
# SimpleImputer was used here without ever being imported (only KNNImputer is
# imported earlier), so this cell raised NameError on a fresh kernel.
from sklearn.impute import SimpleImputer

garage_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

# strategy='constant' replaces every missing entry with fill_value
imputer = SimpleImputer(strategy='constant', fill_value='No Garage')
test_data[garage_cols] = imputer.fit_transform(test_data[garage_cols])
    

In [15]:
basement_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Fill missing values in the basement features with each column's mode.
# The frame in this notebook is `test_data`; the name `data` used here before is
# never defined in this notebook, so the cell raised NameError on a fresh kernel.
# The result is assigned back instead of calling fillna(..., inplace=True) on a
# selected column, which is chained assignment and does not reliably update the
# frame.
# NOTE(review): a later cell fills these same columns with 'No Basement'; once
# this mode fill has run, those calls have nothing left to fill. Confirm which
# strategy is intended (it should match what the training notebook does).
for feature in basement_features:
    test_data[feature] = test_data[feature].fillna(test_data[feature].mode()[0])

In [22]:
# Deal with the missing values in the remaining columns.
# Every fill is assigned back to the frame. The previous form,
# test_data[col].fillna(..., inplace=True), is chained assignment on a selected
# column: it emits a FutureWarning in pandas 2.x and no longer updates the frame
# once Copy-on-Write is enabled.

# categorical columns that get an explicit placeholder label
constant_fills = {
    'FireplaceQu': 'No Fireplace',
    'BsmtQual': 'No Basement',
    'BsmtCond': 'No Basement',
    'BsmtExposure': 'No Basement',
    'BsmtFinType1': 'No Basement',
    'BsmtFinType2': 'No Basement',
}
for col, placeholder in constant_fills.items():
    test_data[col] = test_data[col].fillna(placeholder)

# numeric basement areas: fill with the column mean (mean() skips NaN)
for col in ['BsmtFinSF1', 'TotalBsmtSF']:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

# remaining categorical columns: fill with the most frequent value
for col in ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType', 'MasVnrType']:
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

# per-column count of values still missing
test_data.isnull().sum()

MSZoning         0
LotFrontage      0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
GrLivArea        0
KitchenQual      0
Functional       0
Fireplaces       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
SaleType         0
SaleCondition    0
dtype: int64

In [23]:
# Split the remaining numeric columns into continuous and discrete groups.

continuous_features = [
    "LotFrontage",
    "MasVnrArea",
    "BsmtFinSF1",
    "TotalBsmtSF",
    "GrLivArea",
    "WoodDeckSF",
    "OpenPorchSF",
]

discrete_features = [
    "OverallQual",
    "YearBuilt",
    "YearRemodAdd",
    "Fireplaces",
]

In [25]:
# Apply the log(1 + x) transform to all continuous features in one vectorised
# call; log1p is used so that zero values map to 0.

test_data[continuous_features] = np.log1p(test_data[continuous_features])

In [26]:
# display (rows, columns) of the frame after the preprocessing steps above
test_data.shape

(1459, 50)

In [27]:
# preview the first five rows of the processed frame
test_data.head()

Unnamed: 0,MSZoning,LotFrontage,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,GrLivArea,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,SaleType,SaleCondition
0,RH,4.394449,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,6.150603,LwQ,6.783325,GasA,TA,Y,SBrkr,6.799056,TA,Typ,0,No Fireplace,Attchd,Unf,TA,TA,Y,4.94876,0.0,WD,Normal
1,RL,4.406719,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,4.691348,TA,TA,CBlock,TA,TA,No,ALQ,6.828712,Unf,7.192934,GasA,TA,Y,SBrkr,7.192934,Gd,Typ,0,No Fireplace,Attchd,Unf,TA,TA,Y,5.976351,3.610918,WD,Normal
2,RL,4.317488,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,6.674561,Unf,6.834109,GasA,Gd,Y,SBrkr,7.396335,TA,Typ,1,TA,Attchd,Fin,TA,TA,Y,5.361292,3.555348,WD,Normal
3,RL,4.369448,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,3.044522,TA,TA,PConc,TA,TA,No,GLQ,6.401917,Unf,6.831954,GasA,Ex,Y,SBrkr,7.380879,Gd,Typ,1,Gd,Attchd,Fin,TA,TA,Y,5.888878,3.610918,WD,Normal
4,RL,3.78419,Pave,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,5.575949,Unf,7.155396,GasA,Ex,Y,SBrkr,7.155396,Gd,Typ,0,No Fireplace,Attchd,RFn,TA,TA,Y,0.0,4.418841,WD,Normal


In [28]:
# write the processed dataframe to 'formulatedtest.csv' (index column omitted)
# so it can be used in the code for the training data
test_data.to_csv('formulatedtest.csv', index = False)