In [36]:
#from google.colab import drive
#drive.mount('/content/drive')

In [37]:
#cd /content/drive/MyDrive/Colab\ Notebooks/house-prices-advanced-regression-techniques

# Preprocessing

In [38]:
import pandas as pd
import numpy as np

In [39]:
# Read the train and test CSV files into DataFrames (train first, test second).
df_train, df_test = map(
    pd.read_csv,
    ('data/for_preprocessing_train.csv', 'data/for_preprocessing_test.csv'),
)

In [40]:
# Display the dtype pandas assigned to the HeatingQC column.
df_train.dtypes['HeatingQC']

dtype('int64')

In [41]:
def get_feature_description(feature, path="data/data_description.txt"):
    """Print the description section of one feature from the description file.

    The file is scanned line by line. A section starts at the line whose text
    before the first ":" is exactly ``feature`` (surrounding whitespace
    ignored). After that, every line containing a tab is printed as part of
    the section, and the scan stops at the next tab-free line that contains
    a ":".

    The header is matched exactly rather than by substring, so asking for
    "Heating" no longer also prints the "HeatingQC" section.

    Parameters
    ----------
    feature : str
        Feature name exactly as it appears before the ":" in the file.
    path : str, optional
        Location of the description file. Defaults to the path the notebook
        has always used.

    Returns
    -------
    None
        The matching lines are printed; nothing is printed when the feature
        is not found.
    """
    in_section = False
    with open(path, mode='r') as des:
        for line in des:
            # Text before the first ":" is the candidate header name.
            header_name = line.split(":", 1)[0].strip() if ":" in line else None
            if header_name == feature:
                print(line)
                in_section = True
            elif in_section and "\t" in line:
                # Tab-containing lines inside the section are printed as-is.
                print(line)
            elif in_section and ":" in line:
                # A tab-free line with ":" ends the section.
                break

## Select the relevant features and drop the rest

- following are the categorical features

In [42]:
# Labels of every column whose dtype is `object`.
col = df_train.columns[(df_train.dtypes == object).to_numpy()]
col

Index(['MSZoning', 'Street', 'Alley', 'LandContour', 'LotConfig',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'Foundation', 'Heating', 'GarageType', 'PavedDrive', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

- Selecting the relevant categorical features

In [43]:
# Subset of the categorical columns kept for modelling.
cat_col = [
    'MSZoning',
    'Street',
    'Alley',
    'Neighborhood',
    'BldgType',
    'MasVnrType',
    'Foundation',
    'SaleType',
    'SaleCondition',
]
len(cat_col)

9

- For the numerical features, we keep all those whose absolute correlation with the target variable (`SalePrice`) is greater than 0.1.

In [44]:
# Correlation of every numeric (or boolean) column with the target, computed once.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently in DataFrame.corr() and raises on object columns instead.
target_corr = df_train.select_dtypes(include=['number', 'bool']).corr()['SalePrice']

# Keep columns whose absolute correlation with SalePrice exceeds 0.1,
# ordered from the most positive to the most negative correlation.
num_col = target_corr[target_corr.abs() > 0.1].sort_values(ascending=False).index

In [45]:
# Turn the selected labels into a plain Python list and show how many there are.
num_col = [*num_col]
len(num_col)

43

- Dropping the features which are highly correlated with each other

In [46]:
# Compute the correlation matrix once instead of on every loop iteration.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 raises on object
# columns rather than dropping them silently.
corr_matrix = df_train.select_dtypes(include=['number', 'bool']).corr()

# `feature` is used as the loop name so the `col` Index built earlier is not overwritten.
for feature in num_col:
    # Correlations with every *other* column, strongest first. The feature's own
    # entry is dropped by label rather than assumed to be the first after sorting.
    partners = corr_matrix[feature].drop(labels=feature).sort_values(ascending=False)
    # NOTE(review): only positive correlations above 0.9 are reported; compare
    # on `.abs()` if strongly negative pairs should be flagged as well.
    for other, value in partners[partners > 0.9].items():
        print(feature, value, other)
        print("\n")

GarageQual 0.9587167630750162 GarageCond


GarageQual 0.9460304783986115 GarageYrBlt


GarageCond 0.9587167630750162 GarageQual


GarageCond 0.9488416248579604 GarageYrBlt


GarageYrBlt 0.9488416248579604 GarageCond


GarageYrBlt 0.9460304783986115 GarageQual




In [47]:
# GarageYrBlt and GarageQual are dropped from the selection; GarageCond, which
# the scan reported as > 0.9 correlated with both, stays in the list.
# Filtering (rather than list.remove) keeps this cell safe to re-run: a second
# run no longer raises ValueError because the names are already gone.
redundant_features = {'GarageYrBlt', 'GarageQual'}
num_col = [name for name in num_col if name not in redundant_features]

In [48]:
# Training frame: selected numeric columns first, then the selected categorical ones.
df_train = df_train[num_col + cat_col]

# SalePrice is removed from the numeric list before the same selection is
# applied to the test frame (presumably it has no target column; confirm).
num_col.remove('SalePrice')
df_test = df_test[num_col + cat_col]

- Converting the categorical features to dummy variables

In [49]:
# Encode train and test against the SAME category levels. Calling get_dummies
# on each frame independently yields different dummy columns whenever a level
# occurs in only one of them, and `drop_first` may then drop a different
# baseline level in each frame. Giving both frames a shared categorical dtype
# (levels taken from train and test together) makes the resulting dummy
# columns and the dropped baseline identical.
shared_dtypes = {
    name: pd.CategoricalDtype(
        pd.Categorical(
            pd.concat([df_train[name], df_test[name]], ignore_index=True)
        ).categories
    )
    for name in cat_col
}

df_train = pd.get_dummies(df_train.astype(shared_dtypes), columns=cat_col, drop_first=True)
df_test = pd.get_dummies(df_test.astype(shared_dtypes), columns=cat_col, drop_first=True)

- We want to compare models on the basis of the RMSE between the logarithm of the predicted value and the logarithm of the observed sale price.
- Hence we convert the target variable to the logarithm of the observed sale price.

In [50]:
# Add the natural log of SalePrice as a new column, SalePrice_log.
df_train = df_train.assign(SalePrice_log=np.log(df_train['SalePrice']))

## Save the data

In [51]:
# Write both processed frames to CSV without the index column.
for frame, out_path in (
    (df_train, 'data/train_final.csv'),
    (df_test, 'data/test_final.csv'),
):
    frame.to_csv(out_path, index=False)