In [26]:
import pandas as pd
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Load the input CSV.
# Provenance (per the original author's note): the file was produced by running
# "analysis\Team_6_Step_3b_outliers_cat.ipynb" from the immo-eliza-team6-analysis
# repo with its "Encoding - label encoding" blocks for kitchen type, state of
# building and EPC left out. Missing values in the categorical columns were
# therefore already treated there, while the categories are still unencoded.
data = r'raw.csv'
df = pd.read_csv(data)  # comma is read_csv's default separator

In [27]:
### HANDLING CATEGORICAL DATA ###

# Ordinal (label) encoding of the EPC rating: 'A' -> 1, 'B' -> 2, ..., 'F' -> 6.
# Series.map turns any label that is not a key of the mapping into NaN.
order = ['A', 'B', 'C', 'D', 'E', 'F']
category_mapping = dict(zip(order, range(1, len(order) + 1)))
df['EPC_encoded'] = df['epc'].map(category_mapping)
df = df.drop(columns=['epc'])

# One-hot encoding of the kitchen type. The 'Not installed' dummy is dropped,
# so that level is represented by all remaining Kitchen_* columns being False.
dummies = pd.get_dummies(df['Kitchen_type'], prefix='Kitchen')
df = pd.concat([df, dummies], axis='columns')
df = df.drop(columns=['Kitchen_type', 'Kitchen_Not installed'])

# One-hot encoding of the province (no prefix: columns carry the province name).
# The 'Brussels' dummy is dropped: a row with no province flag set is Brussels.
dummies = pd.get_dummies(df['Province'])
df = pd.concat([df, dummies], axis='columns')
df = df.drop(columns=['Province', 'Brussels'])

# One-hot encoding of the state of the building.
# The 'As new' dummy is dropped: a row with no State_* flag set is "As new".
dummies = pd.get_dummies(df['State_of_building'], prefix='State')
df = pd.concat([df, dummies], axis='columns')
df = df.drop(columns=['State_of_building', 'State_As new'])

display(df)

Unnamed: 0,id,locality_name,Postal_code,Price,Subtype,Number_of_bedrooms,Living_area,street,number,latitude,longitude,Open_fire,Swimming_Pool,hasTerrace,terraceSurface,gardenSurface,Number_of_facades,Furnished,landSurface,price_per_sqm,price_per_sqm_land,Assigned_City,Has_Assigned_City,Assigned_City_5,Has_Assigned_City_5,Assigned_City_10,Has_Assigned_City_10,Assigned_City_15,Has_Assigned_City_15,EPC_encoded,Kitchen_Hyper equipped,Kitchen_Installed,Kitchen_Semi equipped,Antwerp,Brabant_Wallon,East Flanders,Flemish Brabant,Hainaut,Limburg,Liège,Luxembourg,Namur,West Flanders,State_Good,State_Just renovated,State_To be done up,State_To renovate
0,20252354,Ronse,9600,319000,House,3,125,Rotterij,148,50.744176,3.625722,False,False,True,23,370,4,False,767,2552.000000,415.906128,,False,,False,,False,,False,6,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True
1,20251003,Geraardsbergen,9500,299999,House,3,167,Pirrestraat,17,50.752121,3.925495,False,False,True,23,895,2,False,1050,1796.401198,285.713333,,False,,False,,False,Aalst,True,4,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
2,20252002,Mechelen,2800,275000,House,3,154,Caputsteenstraat,150,51.033269,4.491795,False,False,True,10,20,2,False,120,1785.714286,2291.666667,,False,,False,,False,Brussels,True,5,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True
3,20252352,Gooik,1755,295000,House,3,172,Strijlandstraat,45,50.791877,4.084293,False,False,True,23,300,3,False,309,1715.116279,954.692557,,False,,False,,False,Aalst,True,6,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
4,20251715,Dilbeek,1700,715000,House,3,280,Herdebeekstraat 94-94A,0,50.840610,4.228425,False,True,True,36,374,3,False,374,2553.571429,1911.764706,,False,,False,Brussels,True,Brussels,True,3,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4175,20194622,Nivelles,1400,295000,House,4,135,Rue François Lebon,25,50.600371,4.332420,False,False,True,26,220,2,False,120,2185.185185,2458.333333,,False,,False,,False,Brussels,True,4,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
4176,20157543,Tielt,8700,295000,House,4,169,Blauwvoetstraat,5,50.982288,3.329582,False,False,True,23,220,3,False,412,1745.562130,716.019417,,False,,False,,False,Bruges,True,5,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False
4177,20140297,Vilvoorde,1800,299000,House,3,237,Marius Duchéstraat,169,50.934037,4.418561,False,False,True,18,220,3,False,197,1261.603376,1517.766497,,False,,False,Brussels,True,Brussels,True,6,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False
4178,20125321,"Saint-Hubert, Mirwart",6870,299000,House,4,215,Rue Du Staplisse,4,50.056322,5.266147,False,False,True,23,220,3,False,386,1390.697674,774.611399,,False,,False,,False,,False,3,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False


In [28]:
### Keep only the columns considered by the model -> df2 ###
# Dropped: address/location fields and coordinates, subtype, terrace/garden/
# furnished fields, the Assigned_City* groupings (only Has_Assigned_City stays),
# and the price columns other than price_per_sqm.
df2 = df.drop(
    columns=[
        'locality_name', 'Postal_code', 'street', 'number', 'Subtype',
        'latitude', 'longitude',
        'hasTerrace', 'terraceSurface', 'gardenSurface', 'Furnished',
        'Price', 'price_per_sqm_land',
        'Assigned_City',
        'Assigned_City_5', 'Has_Assigned_City_5',
        'Assigned_City_10', 'Has_Assigned_City_10',
        'Assigned_City_15', 'Has_Assigned_City_15',
    ]
)

# Summarise what is left: column dtypes / non-null counts, shape and type.
print("Remaining dataframe (df2) :")
df2.info()
print("shape of df2:", df2.shape)
print("type of df2:", type(df2))


Remaining dataframe (df2) :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4180 entries, 0 to 4179
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      4180 non-null   int64  
 1   Number_of_bedrooms      4180 non-null   int64  
 2   Living_area             4180 non-null   int64  
 3   Open_fire               4180 non-null   bool   
 4   Swimming_Pool           4180 non-null   bool   
 5   Number_of_facades       4180 non-null   int64  
 6   landSurface             4180 non-null   int64  
 7   price_per_sqm           4180 non-null   float64
 8   Has_Assigned_City       4180 non-null   bool   
 9   EPC_encoded             4180 non-null   int64  
 10  Kitchen_Hyper equipped  4180 non-null   bool   
 11  Kitchen_Installed       4180 non-null   bool   
 12  Kitchen_Semi equipped   4180 non-null   bool   
 13  Antwerp                 4180 non-null   bool   
 14  Brabant_Wall

In [29]:
### Separate df2 into the model inputs (X) and the target (y) ###
# Target: price per square metre.
y = df2["price_per_sqm"]
# Inputs: everything except the listing id and the target itself.
X = df2.drop(columns=["id", "price_per_sqm"])
print("data type of X and y:", type(X), type(y))

# NOTE(review): `all` shadows Python's builtin all(); the name is kept because
# other cells of this notebook may read it.
all = X.columns
print("columns in X", all)

data type of X and y: <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
columns in X Index(['Number_of_bedrooms', 'Living_area', 'Open_fire', 'Swimming_Pool',
       'Number_of_facades', 'landSurface', 'Has_Assigned_City', 'EPC_encoded',
       'Kitchen_Hyper equipped', 'Kitchen_Installed', 'Kitchen_Semi equipped',
       'Antwerp', 'Brabant_Wallon', 'East Flanders', 'Flemish Brabant',
       'Hainaut', 'Limburg', 'Liège', 'Luxembourg', 'Namur', 'West Flanders',
       'State_Good', 'State_Just renovated', 'State_To be done up',
       'State_To renovate'],
      dtype='object')


In [30]:
### Hold out a test set ###
# test_size=0.25 is what train_test_split uses when neither test_size nor
# train_size is passed; it is spelled out here so the split ratio is visible.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=41
)
print("shape of X_train and X_test:" , X_train.shape, X_test.shape)

shape of X_train and X_test: (3135, 25) (1045, 25)


In [31]:
### Standardize the integer-valued feature columns (the target is left as is) ###
# The scaler is fitted on the training rows only and then applied to both
# splits, so no statistics from the test set enter the transformation.
# NOTE(review): the fit and predict cells of this notebook use X_train / X_test,
# not the standardized copies created here - confirm whether that is intended.
columns = ['Number_of_bedrooms', 'Living_area','landSurface','EPC_encoded', 'Number_of_facades']
standard_scaler = StandardScaler().fit(X_train[columns])

# Work on copies so the unscaled frames stay available.
X_train_st = X_train.copy()
X_train_st[columns] = standard_scaler.transform(X_train[columns])

X_test_st = X_test.copy()
X_test_st[columns] = standard_scaler.transform(X_test[columns])

In [32]:
### Fit the linear regression and report its R^2 on the training data ###
# The model is fitted on the unscaled training frame (X_train).
reg = LinearRegression().fit(X_train, y_train)
print("r2 score on training is: ", reg.score(X_train, y_train))

r2 score on training is:  0.563254368354686


In [33]:
### Evaluate the fitted model on the held-out test set ###
# Predict once and reuse the result: the original cell called
# reg.predict(X_test) twice with identical arguments.
y_pred = reg.predict(X_test)

# R^2 on the test split, via the estimator's own score method.
print("r2 score on test is: ", reg.score(X_test,y_test))
# Root mean squared error, in the units of the target (price per sqm).
print("RMSE on test data is: ", root_mean_squared_error(y_test, y_pred))

r2 score on test is:  0.5552226680710062
RMSE on test data is:  449.85993414555304


In [34]:
### Interpretation ###
# The intercept is the model's prediction when every input feature equals 0.
print("'base'-price :", reg.intercept_)

# Column-wise mean of the training inputs; for the boolean dummy columns this
# is the share of training rows where the flag is True.
# Left as the last expression so the notebook displays it as the cell output.
print("Standard house in trainingset:")
X_train.mean(axis=0)

'base'-price : 4122.180283668006
Standard house in trainingset:


Number_of_bedrooms          3.225518
Living_area               170.078150
Open_fire                   0.040510
Swimming_Pool               0.010845
Number_of_facades           2.851994
landSurface               402.581499
Has_Assigned_City           0.264753
EPC_encoded                 3.730144
Kitchen_Hyper equipped      0.122807
Kitchen_Installed           0.724402
Kitchen_Semi equipped       0.114833
Antwerp                     0.140032
Brabant_Wallon              0.093461
East Flanders               0.310048
Flemish Brabant             0.048485
Hainaut                     0.017544
Limburg                     0.016906
Liège                       0.077512
Luxembourg                  0.058692
Namur                       0.018501
West Flanders               0.196172
State_Good                  0.484530
State_Just renovated        0.054864
State_To be done up         0.103987
State_To renovate           0.181180
dtype: float64

In [35]:
### Coefficients of the fitted model, paired with their feature names ###
print("Contributions to the price prediction :")
# Take the names from X_train, the frame `reg` was fitted on, so each
# coefficient is matched to its own column and the cell does not depend on a
# global called `all` (which shadows the Python builtin).
# The tuple is the last expression, so the notebook displays it.
"coeff :", list(zip(X_train.columns, reg.coef_))

Contributions to the price prediction :


('coeff :',
 [('Number_of_bedrooms', 48.723296481244894),
  ('Living_area', -7.191194503599181),
  ('Open_fire', 75.24469679659556),
  ('Swimming_Pool', 244.84608413015832),
  ('Number_of_facades', 168.78551054608744),
  ('landSurface', 0.6997681325270786),
  ('Has_Assigned_City', 202.9759472850997),
  ('EPC_encoded', -156.31942763467555),
  ('Kitchen_Hyper equipped', 164.4088489729209),
  ('Kitchen_Installed', 78.01928400189567),
  ('Kitchen_Semi equipped', 4.281938490016304),
  ('Antwerp', -920.0190450399457),
  ('Brabant_Wallon', -755.7608363935409),
  ('East Flanders', -1193.691460553108),
  ('Flemish Brabant', -911.6526453252621),
  ('Hainaut', -1658.2704904710722),
  ('Limburg', -1325.7526898744936),
  ('Liège', -1504.9268972939387),
  ('Luxembourg', -1442.4672509156487),
  ('Namur', -1670.5259575434948),
  ('West Flanders', -1319.374909891598),
  ('State_Good', -108.79261573861722),
  ('State_Just renovated', -72.79084822512098),
  ('State_To be done up', -191.38509724017493),
 