# 1. Question or problem definition.

This is project based on free datasets from Kaggle.com. 

The main goal is predict the future house price based on initial features. 

# Acquire data 

In [2]:
import pandas as pd
import numpy as np

Let's load initial data for training and test data. I used the shape function to understand how many values included in train and test data. 

In [3]:
df_train = pd.read_csv('C:/Users/Vlad/Documents/HousePrice Python/house-prices-advanced-regression-techniques/train.csv')
df_train.shape

(1460, 81)

In [4]:
df_test = pd.read_csv('C:/Users/Vlad/Documents/HousePrice Python/house-prices-advanced-regression-techniques/test.csv')
df_test.shape

(1459, 80)

Usually, the whole data are splitting using train_test_split. To simplify the clearing data I will combine the train and test data into one dataframe

In [5]:
sumple_submissive = pd.read_csv('C:/Users/Vlad/Documents/HousePrice Python/house-prices-advanced-regression-techniques/sample_submission.csv')


Let's combine df into one

In [6]:
df_test = pd.concat([df_test, sumple_submissive.drop(['Id'], axis = 'columns')], axis = 1)
df_test['SalePrice'] = df_test['SalePrice'].astype(int)
df = pd.concat([df_train, df_test], axis = 0)
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,167081
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,164788
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,219222
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,184924


Little data mining

In [7]:
#missing values

total = df.isnull().sum().sort_values(ascending = False)
percent = (df.isnull().sum() / df.isnull().count().sort_values(ascending = False))
missing_data = pd.concat ([total, percent], axis = 1, keys = ['Total', 'Percent'])
missing_data.head()


Unnamed: 0,Total,Percent
PoolQC,2909,0.996574
MiscFeature,2814,0.964029
Alley,2721,0.932169
Fence,2348,0.804385
FireplaceQu,1420,0.486468


In [8]:
#Now drop some missing data

df = df.drop((missing_data[missing_data['Total'] > 1]).index,1)
df = df.drop(df.loc[df['Electrical'].isnull()].index)
df = df.dropna()
df.shape


(2912, 58)

In [9]:
#encoding object type to int. I need to do it because of the linear regression method working with int values better than objects. 

from sklearn import preprocessing
def number_encode_features(init_df):
    result = init_df.copy() 
    encoders = {}
    for column in result.columns:
        if result.dtypes[column] == np.object:
            encoders[column] = preprocessing.LabelEncoder()
            result[column] = encoders[column].fit_transform(result[column])
    return result, encoders
encoded_data, encoders = number_encode_features(df) 
encoded_data.head() 

Unnamed: 0,Id,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,8450,1,3,3,4,0,5,2,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,9600,1,3,3,2,0,24,1,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,11250,1,0,3,4,0,5,2,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,9550,1,0,3,0,0,6,2,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,14260,1,0,3,2,0,15,2,...,0,0,0,0,0,12,2008,8,4,250000


In [10]:
#Because I combined all data to one df to simplify data mining. I need to sample all data to train and test data again

from sklearn.model_selection import train_test_split

X = encoded_data.iloc[: , encoded_data.columns!='SalePrice']
y =  encoded_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.501, random_state=42)
X_test.shape

(1459, 57)

In [11]:
#Add some metrics to undestand how the model was fiting and predicting

from sklearn import metrics

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)

# Models, predictions, results.

# GradientBoostingRegressor from scikit learn package

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

reg = GradientBoostingRegressor(random_state=42,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_samples_leaf = 6,
                                min_samples_split = 10)
                                
reg.fit(X_train, y_train)
pred_reg = reg.predict(X_test)

In [13]:
print_evaluate(pred_reg, y_test)

MAE: 11564.3082534328
MSE: 479350277.85327435
RMSE: 21894.069467626945
R2 Square 0.8267593238923725


# XGBRegressor

In [14]:
import sys
!{sys.executable} -m pip install xgboost



In [15]:
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder


xgb = XGBRegressor(learning_rate=0.01,
                       n_estimators=1000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)


# Add silent=True to avoid printing out updates with each cycle
xgb.fit(X_train, y_train, verbose=False)
pred_xgb = xgb.predict(X_test)



In [16]:
print_evaluate(pred_xgb, y_test)

MAE: 12426.827311086361
MSE: 475772973.0550427
RMSE: 21812.22072726761
R2 Square 0.8011060653586435


# CatBoostRegressor

In [17]:
!{sys.executable} -m pip install catboost



In [18]:
from catboost import CatBoostRegressor

cat_model = CatBoostRegressor(iterations = 1000,
                              random_state = 42)
cat_model.fit(X_train,y_train, verbose=False)
cat_pred = cat_model.predict(X_test)

In [19]:
print_evaluate(cat_pred, y_test)

MAE: 10617.044768252415
MSE: 424936015.8134463
RMSE: 20613.976225208135
R2 Square 0.8377234962319581


In [20]:
# as the result the best type of regression model for this datasets is CatBoostRegressor
X_test

Unnamed: 0,Id,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
178,179,20,17423,1,0,3,1,0,22,2,...,60,0,0,0,0,0,7,2009,6,5
678,679,20,11844,1,0,3,4,0,22,2,...,82,0,0,0,0,0,7,2009,6,5
1419,2880,50,6191,1,3,3,0,0,18,2,...,0,0,0,0,0,0,11,2006,8,4
1392,2853,60,11250,1,3,3,0,0,5,2,...,170,0,0,0,0,0,7,2006,8,4
440,1901,50,13680,1,3,1,4,1,6,2,...,21,150,0,0,0,0,2,2009,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,697,30,6000,1,3,3,4,0,3,2,...,0,129,0,0,0,0,6,2006,8,4
1433,2894,50,8520,0,3,0,4,0,9,2,...,0,0,0,0,0,0,4,2006,8,4
244,245,60,8880,1,0,3,4,0,20,2,...,130,0,0,0,0,0,5,2010,8,4
1279,2740,20,9723,1,0,3,0,0,12,2,...,0,0,0,0,0,0,10,2006,8,4


In [21]:
pred = pd.DataFrame(cat_pred)
sub_df = pd.read_csv('C:/Users/Vlad/Documents/HousePrice Python/house-prices-advanced-regression-techniques/sample_submission.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
datasets.to_csv('sample_submission_1', index = False)