# Import data and encoding library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

path_train = './dataset/train.csv'
path_test = './dataset/test.csv'

house_data = pd.read_csv(path_train)
house_test = pd.read_csv(path_test)

# Heads of both dataset

In [2]:
house_data.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score
0,0x21e3,Apartment,106,,1,Semi_Furnished,0.0,No,Once in a day - Morning,5.89,Slightly below average,Medium,90.0,3.86,71.98
1,0x68d4,Apartment,733,2.0,2,Unfurnished,1.0,No,Once in a day - Evening,4.37,Well below average,Medium,96.0,3.55,71.2
2,0x7d81,Apartment,737,4.0,2,Fully Furnished,0.0,No,Once in a day - Morning,7.45,Slightly below average,Medium,121.0,3.81,71.39
3,0x7a57,Apartment,900,3.0,2,Unfurnished,2.0,Yes,Once in a day - Morning,6.16,Well above average,Medium,100.0,1.34,31.46
4,0x9409,Bungalow,2238,14.0,6,Fully Furnished,0.0,No,All time,5.46,Well below average,Medium,116.0,4.77,93.7


In [3]:
house_test.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review
0,0x6e93,Apartment,293,3.0,1,Unfurnished,0.0,No,Once in a day - Morning,7.28,Well above average,Medium,152.0,2.52
1,0x8787,Apartment,586,4.0,1,Semi_Furnished,0.0,No,Once in a day - Evening,7.63,Well below average,Medium,92.0,4.16
2,0x6c17,Container Home,305,1.0,2,Semi_Furnished,1.0,No,All time,5.39,Slightly above average,Medium,90.0,2.92
3,0x9dbd,Apartment,258,2.0,1,Semi_Furnished,1.0,No,All time,7.53,Slightly below average,Medium,158.0,3.45
4,0xbfde,Bungalow,3031,12.0,4,Fully Furnished,0.0,No,All time,8.79,Well above average,High,186.0,2.72


# Count all null values in dataframe

In [4]:
house_data.isna().sum()

Property_ID                  0
Property_Type                0
Property_Area                0
Number_of_Windows         1654
Number_of_Doors              0
Furnishing                1042
Frequency_of_Powercuts    1383
Power_Backup                 0
Water_Supply                 0
Traffic_Density_Score        0
Crime_Rate                 787
Dust_and_Noise            1219
Air_Quality_Index            0
Neighborhood_Review          0
Habitability_score           0
dtype: int64

In [5]:
house_test.isna().sum()

Property_ID                 0
Property_Type               0
Property_Area               0
Number_of_Windows         445
Number_of_Doors             0
Furnishing                257
Frequency_of_Powercuts    366
Power_Backup                0
Water_Supply                0
Traffic_Density_Score       0
Crime_Rate                212
Dust_and_Noise            330
Air_Quality_Index           0
Neighborhood_Review         0
dtype: int64

# Describe both data

In [6]:
house_data.describe()

Unnamed: 0,Property_Area,Number_of_Windows,Number_of_Doors,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score
count,39499.0,37845.0,39499.0,38116.0,39499.0,39499.0,39499.0,39499.0
mean,1391.715638,3.923768,2.331375,0.484678,6.354772,121.352566,3.735945,73.482297
std,1953.287544,2.690992,1.179964,0.752207,1.219681,62.384669,0.796843,14.118753
min,100.0,0.0,1.0,0.0,0.0,0.0,0.0,7.28
25%,494.0,2.0,1.0,0.0,5.6,95.0,3.22,69.66
50%,932.0,4.0,2.0,0.0,6.47,113.0,3.85,75.76
75%,1770.0,5.0,3.0,1.0,7.22,140.0,4.26,82.26
max,28064.0,15.0,6.0,3.0,9.91,1044.0,5.0,100.0


In [7]:
house_test.describe()

Unnamed: 0,Property_Area,Number_of_Windows,Number_of_Doors,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review
count,10500.0,10055.0,10500.0,10134.0,10500.0,10500.0,10500.0
mean,1364.333333,3.897563,2.326571,0.484409,6.361525,120.532571,3.734125
std,1813.702032,2.690162,1.175324,0.749686,1.216237,55.63599,0.793325
min,100.0,0.0,1.0,0.0,0.02,0.0,0.02
25%,505.0,2.0,1.0,0.0,5.62,95.0,3.23
50%,935.5,3.0,2.0,0.0,6.47,113.0,3.85
75%,1758.0,5.0,3.0,1.0,7.22,140.0,4.22
max,27477.0,15.0,6.0,3.0,9.43,1031.0,5.0


In [8]:
house_data[house_data['Furnishing'] == 'Semi_Furnished'].describe()

Unnamed: 0,Property_Area,Number_of_Windows,Number_of_Doors,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score
count,20357.0,19514.0,20357.0,19669.0,20357.0,20357.0,20357.0,20357.0
mean,1372.653829,3.901968,2.319644,0.479231,6.350508,121.348234,3.740183,75.478654
std,1894.466318,2.658779,1.171925,0.747539,1.219374,61.681906,0.793678,11.050474
min,100.0,0.0,1.0,0.0,0.0,0.0,0.0,16.76
25%,494.0,2.0,1.0,0.0,5.59,95.0,3.23,71.21
50%,928.0,3.0,2.0,0.0,6.46,113.0,3.85,76.3
75%,1748.0,5.0,3.0,1.0,7.2,140.0,4.26,82.39
max,28057.0,15.0,6.0,3.0,9.91,1044.0,5.0,97.84


In [9]:
quartile_house = house_data[house_data['Furnishing'] == 'Unfurnished']
quartile_house['quartile'] = pd.qcut(quartile_house['Habitability_score'], 10, labels=False)

In [10]:
house_data[(house_data['Property_Type'] == '#R%$G&867')]

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score
27,0xb8da,#R%$G&867,990,7.0,3,Semi_Furnished,0.0,No,All time,3.60,Well below average,Low,27.0,4.96,78.07
74,0xc6f2,#R%$G&867,2185,4.0,1,Unfurnished,2.0,No,All time,6.24,Well below average,Medium,100.0,3.53,68.57
78,0x2910,#R%$G&867,985,2.0,3,Fully Furnished,0.0,No,Once in a day - Evening,7.60,Well below average,Medium,117.0,4.13,80.26
81,0x7cc9,#R%$G&867,313,1.0,3,Semi_Furnished,1.0,No,All time,6.02,Well below average,Medium,124.0,4.13,78.34
110,0x290a,#R%$G&867,783,2.0,2,Fully Furnished,0.0,No,Once in a day - Morning,5.05,Slightly below average,Medium,84.0,3.87,72.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38955,0xb272,#R%$G&867,2870,7.0,2,Semi_Furnished,0.0,No,Once in a day - Morning,7.15,Well below average,Medium,105.0,4.46,77.30
38963,0x3cb4,#R%$G&867,956,,3,Semi_Furnished,1.0,Yes,Once in a day - Morning,6.58,Slightly above average,Medium,96.0,2.60,85.35
39006,0xa335,#R%$G&867,3516,12.0,4,,0.0,No,Once in a day - Morning,5.29,Slightly below average,Medium,100.0,3.85,83.53
39210,0xbcb5,#R%$G&867,2186,4.0,3,Unfurnished,0.0,No,All time,6.02,Well below average,Medium,101.0,4.79,71.89


In [11]:
apartment = house_data[(house_data['Property_Type'] == 'Apartment') & (house_data['Property_Area'] < 5000)]
bunglow = house_data[house_data['Property_Type'] == 'Bungalow']

# Remove NA and #R% values

In [12]:
house_data = house_data.dropna()

In [13]:
house_data.drop(house_data.index[house_data['Property_Type'] == '#R%$G&867'], inplace=True)

In [14]:
house_data['Property_Type'].unique()

array(['Apartment', 'Bungalow', 'Single-family home', 'Duplex',
       'Container Home'], dtype=object)

# Pick all columns that are relevent

In [15]:
house_data.columns

Index(['Property_ID', 'Property_Type', 'Property_Area', 'Number_of_Windows',
       'Number_of_Doors', 'Furnishing', 'Frequency_of_Powercuts',
       'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Crime_Rate',
       'Dust_and_Noise', 'Air_Quality_Index', 'Neighborhood_Review',
       'Habitability_score'],
      dtype='object')

In [128]:
relevent_column_list = ['Property_Type', 'Property_Area', 'Number_of_Windows',
       'Number_of_Doors', 'Furnishing', 'Frequency_of_Powercuts',
       'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Crime_Rate',
       'Dust_and_Noise', 'Air_Quality_Index', 'Neighborhood_Review', 'Habitability_score']
relevent_column_list_X = list(filter(lambda x : x!='Habitability_score', relevent_column_list))
relevent_column_list_Y = ['Habitability_score']

# Make dataframe for storing training data

In [17]:
relevent_house_data = house_data[relevent_column_list]

In [18]:
relevent_house_data.shape

(33336, 14)

# Encoding all enum values

In [19]:
propertyType_Encoder = preprocessing.LabelEncoder()
furnishing_Encoder = preprocessing.LabelEncoder()
powerbackup_Encoder = preprocessing.LabelEncoder()
watersupply_Encoder = preprocessing.LabelEncoder()
crimerate_Encoder = preprocessing.LabelEncoder()
dustandnoise_Encoder = preprocessing.LabelEncoder()

In [20]:
propertyType_Encoder.fit(relevent_house_data['Property_Type'])

furnishing_Encoder.fit(relevent_house_data['Furnishing'])

powerbackup_Encoder.fit(relevent_house_data['Power_Backup'])

watersupply_Encoder.fit(relevent_house_data['Water_Supply'])

crimerate_Encoder.fit(relevent_house_data['Crime_Rate'])

dustandnoise_Encoder.fit(relevent_house_data['Dust_and_Noise'])

LabelEncoder()

In [21]:
relevent_house_data['Property_Type'] = propertyType_Encoder.transform(relevent_house_data['Property_Type'])

relevent_house_data['Furnishing'] =furnishing_Encoder.transform(relevent_house_data['Furnishing'])

relevent_house_data['Power_Backup'] = powerbackup_Encoder.transform(relevent_house_data['Power_Backup'])

relevent_house_data['Water_Supply'] = watersupply_Encoder.transform(relevent_house_data['Water_Supply'])

relevent_house_data['Crime_Rate'] = crimerate_Encoder.transform(relevent_house_data['Crime_Rate'])

relevent_house_data['Dust_and_Noise'] = dustandnoise_Encoder.transform(relevent_house_data['Dust_and_Noise'])

In [22]:
relevent_house_data.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score
1,0,733,2.0,2,2,1.0,1,2,4.37,3,2,96.0,3.55,71.2
2,0,737,4.0,2,0,0.0,1,3,7.45,1,2,121.0,3.81,71.39
3,0,900,3.0,2,2,2.0,2,3,6.16,2,2,100.0,1.34,31.46
4,1,2238,14.0,6,0,0.0,1,0,5.46,3,2,116.0,4.77,93.7
5,4,1185,3.0,3,2,0.0,1,3,5.69,3,2,91.0,4.49,82.94


# Splitting into trainging and test set

In [97]:
train_data, test_data = train_test_split(relevent_house_data, test_size=0.2, random_state=100)

In [98]:
train_data.columns

Index(['Property_Type', 'Property_Area', 'Number_of_Windows',
       'Number_of_Doors', 'Furnishing', 'Frequency_of_Powercuts',
       'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Crime_Rate',
       'Dust_and_Noise', 'Air_Quality_Index', 'Neighborhood_Review',
       'Habitability_score'],
      dtype='object')

In [130]:
X = train_data[relevent_column_list_X]
Y = train_data[relevent_column_list_Y]

In [131]:
x = test_data[relevent_column_list_X]
y = test_data[relevent_column_list_Y]

# Evaluation import using r2_score

In [132]:
from sklearn.metrics import r2_score

# Model Training

# True Result for test

In [133]:
e = y.loc[y.index].values.reshape(-1).tolist()

# XGB Model

In [134]:
import xgboost as xgbt

In [135]:
xgb_model = xgbt.XGBRegressor()

In [136]:
xgb_model.fit(X.loc[X.index].values.tolist(), Y.loc[Y.index].values.reshape(-1).tolist())

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [137]:
predict_xgb = xgb_model.predict(x.loc[x.index].values.tolist())

In [138]:
r2_score(e, predict_xgb)

0.8086483589840971

# Gradient Boost

In [139]:
from sklearn.ensemble import GradientBoostingRegressor

In [140]:
gd_model = GradientBoostingRegressor(n_estimators=200, max_depth=10, loss='ls')

In [141]:
gd_model.fit(X.loc[X.index].values.tolist(), Y.loc[Y.index].values.reshape(-1).tolist())

GradientBoostingRegressor(max_depth=10, n_estimators=200)

In [142]:
predict_gd = gd_model.predict(x.loc[x.index].values.tolist())

In [143]:
r2_score(e, predict_gd)

0.8182890177326866

# Random Forest

In [144]:
from sklearn.ensemble import RandomForestRegressor

In [164]:
forest_model = RandomForestRegressor(max_depth=12, n_estimators=185)

In [162]:
forest_model.fit(X.loc[X.index].values.tolist(), Y.loc[Y.index].values.reshape(-1).tolist())

RandomForestRegressor(max_depth=13, n_estimators=185)

In [163]:
predict_forest = forest_model.predict(x.loc[x.index].values.tolist())

In [165]:
r2_score(e, predict_forest)

0.8293808989969376

# Ada Boost Regressor

In [149]:
from sklearn.ensemble import AdaBoostRegressor

In [150]:
ada_model = AdaBoostRegressor()

In [151]:
ada_model.fit(X.loc[X.index].values.tolist(), Y.loc[Y.index].values.reshape(-1).tolist())

AdaBoostRegressor()

In [152]:
predict_ada = ada_model.predict(x.loc[x.index].values.tolist())

In [153]:
r2_score(e, predict_ada)

0.6883127887992737

# CSV output

In [None]:
result = house