In [85]:
import numpy as np
import pandas as pd

In [86]:
# Read the train and test CSV files into DataFrames.
ic_house_pred_train, ic_house_pred_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ic_house_pred_train.csv',
        'data/ic_house_pred_test.csv',
    )
)

In [87]:
# Split each frame into its feature matrix and its target.
# The features are every column except 'Id' and the target 'SalePrice';
# the source frames themselves are left untouched.

x_train = ic_house_pred_train.drop(columns=['Id', 'SalePrice'])
y_train = ic_house_pred_train['SalePrice']

x_test = ic_house_pred_test.drop(columns=['Id', 'SalePrice'])
y_test = ic_house_pred_test['SalePrice']

In [88]:
# Preview the first rows of the raw training frame (head() defaults to n=5).
ic_house_pred_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,136,20,RL,80.0,10400,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2008,WD,Normal,174000
1,1453,180,RM,35.0,3675,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,WD,Normal,145000
2,763,60,FV,72.0,8640,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,Con,Normal,215200
3,933,20,RL,84.0,11670,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2007,WD,Normal,320000
4,436,60,RL,43.0,10667,Pave,,IR2,Lvl,AllPub,...,0,,,,0,4,2009,ConLw,Normal,212000


In [89]:
# Partition the feature columns by dtype: numeric columns stay a pandas Index,
# every remaining (non-numeric) column goes into a plain list.
numeric_columns = x_train.select_dtypes(include=np.number).columns
categoric_columns = x_train.select_dtypes(exclude=np.number).columns.tolist()

In [90]:
# Treating missing values

# Report the total number of missing cells per column group (numeric /
# categoric) and per split, before any imputation is applied.
print("Numeric missing values from train database",
      x_train[numeric_columns].isna().sum().sum(), sep="\n")

print("Numeric missing values from test database",
      x_test[numeric_columns].isna().sum().sum(), sep="\n")

print("Categoric missing values from train database",
      x_train[categoric_columns].isna().sum().sum(), sep="\n")

print("Categoric missing values from test database",
      x_test[categoric_columns].isna().sum().sum(), sep="\n")

def replaceMissingValuesByMean(var_list, data):
    """Fill missing values in the given columns with each column's own mean.

    ``data`` is modified in place; the function returns ``None``.

    Parameters
    ----------
    var_list : iterable of column labels
        Columns of ``data`` to impute. Each column must support ``.mean()``.
    data : pandas.DataFrame
        Frame whose listed columns are overwritten with their filled version.
    """
    for var in var_list:
        avg = data[var].mean(axis=0)
        # Assign the filled column back instead of calling
        # ``data[var].fillna(avg, inplace=True)``: that form mutates a
        # temporary selection (chained assignment), which pandas warns about
        # and which leaves ``data`` unchanged when Copy-on-Write is active.
        data[var] = data[var].fillna(avg)


def replaceMissingValuesByMode(var_list, data):
    """Fill missing values in the given columns with each column's mode.

    When a column has several modes, the first one returned by
    ``Series.mode()`` is used. ``data`` is modified in place; the function
    returns ``None``.

    Parameters
    ----------
    var_list : iterable of column labels
        Columns of ``data`` to impute.
    data : pandas.DataFrame
        Frame whose listed columns are overwritten with their filled version.
    """
    for var in var_list:
        modes = data[var].mode()
        if modes.empty:
            # A column with no non-missing value has no mode; there is
            # nothing to impute with, so leave it as is instead of raising
            # IndexError on ``.iloc[0]``.
            continue
        # Assign back rather than ``fillna(..., inplace=True)`` on the
        # selection: the chained in-place form triggers a pandas warning and
        # does not update ``data`` when Copy-on-Write is active.
        data[var] = data[var].fillna(modes.iloc[0])


# Impute the training split with its own statistics.
# Replacing numeric columns
replaceMissingValuesByMean(numeric_columns, x_train)
# Replacing categoric columns
replaceMissingValuesByMode(categoric_columns, x_train)

# Impute the test split with the statistics of the TRAINING split, so both
# splits go through the same transformation. Filling test with its own
# mean/mode would preprocess the two splits differently.
# (Mean and mode ignore missing values, so these are the same statistics
# that were used to fill x_train above.)
train_fill_values = x_train[numeric_columns].mean().to_dict()
for column in categoric_columns:
    column_modes = x_train[column].mode()
    if not column_modes.empty:  # an all-missing train column has no mode
        train_fill_values[column] = column_modes.iloc[0]

# Columns absent from the mapping are left untouched by fillna.
x_test = x_test.fillna(value=train_fill_values)

Numeric missing values from train database
247
Numeric missing values from test database
101
Categoric missing values from train database
4612
Categoric missing values from test database
2005


In [91]:
# Creating dummy variables
x_train = pd.get_dummies(x_train, prefix_sep='_')
x_test = pd.get_dummies(x_test, prefix_sep='_')

# The two splits are encoded independently, so a category that occurs in only
# one of them produces a column the other lacks. Conform the test frame to
# the training columns (same set, same order): dummy columns missing from
# test are added as all-zero, and columns for categories never seen in
# training are dropped.
train_only_columns = [col for col in x_train.columns if col not in x_test.columns]
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)
# Give the added columns the same dtype as their training counterparts so the
# exported files encode dummies uniformly.
x_test = x_test.astype({col: x_train[col].dtype for col in train_only_columns})

In [92]:
# Re-count missing cells after imputation and encoding, one total per split.
print("Number of missing values from the training database",
      x_train.isna().sum().sum(), sep="\n")

print("Number of missing values from the test database",
      x_test.isna().sum().sum(), sep="\n")

Number of missing values from the training database
0
Number of missing values from the test database
0


In [93]:
# Natural-log transform of the target for both splits.
y_train_log, y_test_log = (np.log(target) for target in (y_train, y_test))

In [94]:
# Write the processed features and targets to CSV (row index omitted).
# The *_LOG files hold the log-transformed target.
csv_exports = {
    'X_TRAIN.csv': x_train,
    'Y_TRAIN.csv': y_train,
    'X_TEST.csv': x_test,
    'Y_TEST.csv': y_test,
    'Y_TRAIN_LOG.csv': y_train_log,
    'Y_TEST_LOG.csv': y_test_log,
}

for out_path, table in csv_exports.items():
    table.to_csv(out_path, index=False)


