In [219]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

In [220]:
df = pd.read_csv('./data/dataset_analysed.csv', dtype={'StateOfBuilding' : object})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114504 entries, 0 to 114503
Data columns (total 34 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Url                114504 non-null  object 
 1   BathroomCount      114504 non-null  float64
 2   BedroomCount       114504 non-null  int64  
 3   ConstructionYear   66591 non-null   float64
 4   Country            114504 non-null  object 
 5   District           114504 non-null  object 
 6   Fireplace          114504 non-null  float64
 7   FloodingZone       56440 non-null   object 
 8   Furnished          114504 non-null  float64
 9   Garden             114504 non-null  float64
 10  GardenArea         114504 non-null  float64
 11  Kitchen            64464 non-null   object 
 12  LivingArea         101071 non-null  float64
 13  Locality           114501 non-null  object 
 14  MonthlyCharges     12710 non-null   float64
 15  NumberOfFacades    73777 non-null   float64
 16  PE

In [221]:
df.drop(df[['Url', 'Country', 'MonthlyCharges','PropertyId','MunicipalityName']], axis='columns',inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114504 entries, 0 to 114503
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   BathroomCount      114504 non-null  float64
 1   BedroomCount       114504 non-null  int64  
 2   ConstructionYear   66591 non-null   float64
 3   District           114504 non-null  object 
 4   Fireplace          114504 non-null  float64
 5   FloodingZone       56440 non-null   object 
 6   Furnished          114504 non-null  float64
 7   Garden             114504 non-null  float64
 8   GardenArea         114504 non-null  float64
 9   Kitchen            64464 non-null   object 
 10  LivingArea         101071 non-null  float64
 11  Locality           114501 non-null  object 
 12  NumberOfFacades    73777 non-null   float64
 13  PEB                81957 non-null   object 
 14  PostalCode         114504 non-null  int64  
 15  Price              114504 non-null  int64  
 16  Pr

In [222]:
df2 = df.where(df['FloodingZone'] == None)
df2

Unnamed: 0,BathroomCount,BedroomCount,ConstructionYear,District,Fireplace,FloodingZone,Furnished,Garden,GardenArea,Kitchen,...,ShowerCount,StateOfBuilding,SubtypeOfProperty,SurfaceOfPlot,SwimmingPool,Terrace,ToiletCount,TypeOfProperty,TypeOfSale,RefnisCode
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114499,,,,,,,,,,,...,,,,,,,,,,
114500,,,,,,,,,,,...,,,,,,,,,,
114501,,,,,,,,,,,...,,,,,,,,,,
114502,,,,,,,,,,,...,,,,,,,,,,


In [223]:
df.dropna(subset=['LivingArea'],inplace=True)
df.update(df['ConstructionYear'].fillna(value=med))

In [224]:
df[['Price', 'BedroomCount','Garden','GardenArea','LivingArea','Furnished','NumberOfFacades','ConstructionYear']].corr()

Unnamed: 0,Price,BedroomCount,Garden,GardenArea,LivingArea,Furnished,NumberOfFacades,ConstructionYear
Price,1.0,0.386786,0.051441,0.105722,0.463487,0.042954,0.202627,-0.051519
BedroomCount,0.386786,1.0,0.175669,0.108229,0.595088,-0.003375,0.218856,-0.184821
Garden,0.051441,0.175669,1.0,0.1914,0.132543,0.000437,0.105301,-0.116042
GardenArea,0.105722,0.108229,0.1914,1.0,0.130622,0.011214,0.110144,-0.052476
LivingArea,0.463487,0.595088,0.132543,0.130622,1.0,-0.009177,0.231511,-0.174914
Furnished,0.042954,-0.003375,0.000437,0.011214,-0.009177,1.0,-0.022494,-0.023155
NumberOfFacades,0.202627,0.218856,0.105301,0.110144,0.231511,-0.022494,1.0,0.014627
ConstructionYear,-0.051519,-0.184821,-0.116042,-0.052476,-0.174914,-0.023155,0.014627,1.0


In [225]:
y = df['Price'].values
X = df[['ConstructionYear','LivingArea','BedroomCount']].values

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)

In [227]:
from sklearn.ensemble import RandomForestRegressor as RFR

regressor = RFR(max_depth=2)

regressor.fit(X_train, y_train)

In [228]:
regressor.score(X_train, y_train)

0.24918281187073477

In [229]:
regressor.predict(X_train)

array([260843.04291487, 846410.80415042, 420712.5150098 , ...,
       260843.04291487, 260843.04291487, 426182.62136992])

In [230]:
mae(y_train, regressor.predict(X_train))

np.float64(184376.62902925524)