In [1]:
import pandas as pd
import numpy as np

## Importing the training data and performing basic EDA

In [2]:
# import train.csv file as pandas Dataframe
houses = pd.read_csv('train.csv', encoding='latin-1')
houses.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# check for duplicate data
duplicate_rows_houses = houses[houses.duplicated()]
print("number of duplicate rows: ", duplicate_rows_houses.shape[0])

number of duplicate rows:  0


I have decided to remove all non-numeric columns from my analysis for the sake of convenience. As we can see, the dataset has enough important quantitative variables to work with, so including all qualitative variables into will likely result in overfitting and decrease the result accuracy. Moreover, I have tried to perform one-hot encoding for all non-numeric columns and put the new modified dataset (287 columns) to train the model. Everything goes well until I want to make predictions with my model because, for some reason, the same function *pd.get_dummies()* creates 267 columns when I input the test set as a parameter. This results in a ValueError because number of features of the model does not match the input (287 vs. 267). A potential improvement to this algorithm is to determine what variables in the dataset matter the most by looking at the tree visualization and finding out what variables are being used to make decisions. Most likely, we will see that there are only a few columns that really matter in this case. Thus, those values should be used by the new model. 

In [4]:
# drop all qualitative variables
houses = houses.select_dtypes(['number'])
print("Dataset dimensions (#rows, #columns):",houses.shape)

Dataset dimensions (#rows, #columns): (1460, 38)


In [5]:
# columns that have null values
print(houses.columns[houses.isnull().any()].tolist())

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


In [6]:
# check how many null values each columns contains
print("LotFrontage: ", houses['LotFrontage'].isnull().sum(), " null values")

print("MasVnrArea: ", houses['MasVnrArea'].isnull().sum(), " null values")

print("GarageYrBlt: ", houses['GarageYrBlt'].isnull().sum(), " null values")

LotFrontage:  259  null values
MasVnrArea:  8  null values
GarageYrBlt:  81  null values


In [7]:
# drop 'LotFrontage' and 'GarageYrBlt' because they have many null values
houses.drop(['LotFrontage', 'GarageYrBlt'], axis=1, inplace=True)

# check if everything is correct
print("Removed 2 columns with many null values. Rows:",\
      houses.shape[0],", columns:", houses.shape[1])

# delete the rows with null values in the 'MasVnrArea' column
# it does not make sense to drop the whole column because it has only 8 null values 

houses = houses.dropna(axis=0, subset=['MasVnrArea'])

# check if everything is correct
print("Removed 8 rows with null values in the 'MasVnrArea' column. Rows:",\
      houses.shape[0],", columns:", houses.shape[1])

Removed 2 columns with many null values. Rows: 1460 , columns: 36
Removed 8 rows with null values in the 'MasVnrArea' column. Rows: 1452 , columns: 36


## Training the model

In [8]:
# initialize the response variable (values we want to predict — price)
tr_response = np.array(houses['SalePrice'])

# initialize the predictors (all other variables from the dataset)
tr_predictors = houses.drop('SalePrice', axis=1)

tr_predictors = np.array(tr_predictors)

In [9]:
# import the model 
from sklearn.ensemble import RandomForestRegressor

# instantiate model with 1000 decision trees
tr_model = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
tr_model.fit(tr_predictors, tr_response)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

## Importing the test data and performing basic EDA

In [10]:
# import test.csv file as pandas Dataframe
houses_test = pd.read_csv('test.csv', encoding='latin-1')

# drop all qualitative variables
houses_test = houses_test.select_dtypes(['number'])

print("Dataset dimensions (#rows, #columns):",houses_test.shape)

# columns that have null values 
print(houses_test.columns[houses_test.isnull().any()].tolist())

Dataset dimensions (#rows, #columns): (1459, 37)
['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']


In [11]:
# check how many null values each columns contains
print("LotFrontage: ", houses_test['LotFrontage'].isnull().sum(), " null values")
print("MasVnrArea: ", houses_test['MasVnrArea'].isnull().sum(), " null values")
print("BsmtFinSF1: ", houses_test['BsmtFinSF1'].isnull().sum(), " null values")
print("BsmtFinSF2: ", houses_test['BsmtFinSF2'].isnull().sum(), " null values")
print("BsmtUnfSF: ", houses_test['BsmtUnfSF'].isnull().sum(), " null values")
print("TotalBsmtSF: ", houses_test['TotalBsmtSF'].isnull().sum(), " null values")
print("BsmtFullBath: ", houses_test['BsmtFullBath'].isnull().sum(), " null values")
print("BsmtHalfBath: ", houses_test['BsmtHalfBath'].isnull().sum(), " null values")
print("GarageYrBlt: ", houses_test['GarageYrBlt'].isnull().sum(), " null values")
print("GarageCars: ", houses_test['GarageCars'].isnull().sum(), " null values")
print("GarageArea: ", houses_test['GarageArea'].isnull().sum(), " null values")

LotFrontage:  227  null values
MasVnrArea:  15  null values
BsmtFinSF1:  1  null values
BsmtFinSF2:  1  null values
BsmtUnfSF:  1  null values
TotalBsmtSF:  1  null values
BsmtFullBath:  2  null values
BsmtHalfBath:  2  null values
GarageYrBlt:  78  null values
GarageCars:  1  null values
GarageArea:  1  null values


In [12]:
# drop 'LotFrontage' and 'GarageYrBlt' because they have many null values
houses_test.drop(['LotFrontage', 'GarageYrBlt'], axis=1, inplace=True)

# delete the rows with null values in the other columns
# it does not make sense to drop all columns because they have a few null values 

houses_test = houses_test.dropna(axis=0, subset=['MasVnrArea', 'BsmtFinSF1', \
                                                 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', \
                                                 'BsmtFullBath', 'BsmtHalfBath', \
                                                 'GarageCars', 'GarageArea'])

# check if everything is correct
print("Removed 2 columns and a few rows with null values. " \
"Rows:",houses_test.shape[0],", columns:", houses_test.shape[1])

Removed 2 columns and a few rows with null values. Rows: 1441 , columns: 35


## Making predictions with the trained model

In [13]:
# initialize the predictors (all variables from the test dataset)
test_predictors = np.array(houses_test)

# make predictions on the test data
predictions = tr_model.predict(test_predictors)

## Saving results 

In [14]:
# save the result to a prediction.csv file with two columns: Id and SalePrice
output = pd.DataFrame({'Id': np.array(houses_test['Id']), 'SalePrice': predictions}, columns=['Id', 'SalePrice'])
output.head()

Unnamed: 0,Id,SalePrice
0,1461,127235.103
1,1462,155846.155
2,1463,183794.458
3,1464,181342.093
4,1465,199393.416


In [15]:
output.to_csv("prediction.csv", index=False)