In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the cleaned listings CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will only
# run as-is on the machine it was written on.
file=r'C:\Users\frede_0021xgx\OneDrive\Documents\GitHub\challenge-data-analysis\data\dataframe_cleaned.csv'
# Columns thrown away immediately after loading. The two 'Unnamed' columns are
# presumably index columns left over from earlier CSV exports (confirm against the
# file). Note that it is zip_code that gets dropped here, not locality.
unused_columns = ['Unnamed: 0.1', 'Unnamed: 0', 'url', 'zip_code']
immo_info = pd.read_csv(file).drop(columns=unused_columns)

In [3]:
'''data cleaning'''
# Cleaning pipeline, applied in this order:
#   1. drop exact duplicate rows
#   2. replace every NaN with 0
#   3. replace the 'UNKNOWN' placeholder with 0
#   4. keep only the rows whose price is strictly positive
#   5. cast surface_land to float
immo_info = (
    immo_info
    .drop_duplicates()
    .fillna(0)
    .replace('UNKNOWN', 0)
    .loc[lambda frame: frame['price'] > 0.0]
    .assign(surface_land=lambda frame: frame['surface_land'].astype(float))
)

immo_info

Unnamed: 0,region,province,locality,property_type,property_subtype,price,number_rooms,living_area,kitchen,furnished,fireplace,terrace,terrace_area,garden,garden_area,surface_land,number_facades,swimming_pool,building_state
0,Flanders,Flemish Brabant,Wezembeek-Oppem,HOUSE,HOUSE,460000.0,3.0,120.000000,USA_HYPER_EQUIPPED,0,0,1,20.0,1,110.0,227.0,2,0,GOOD
1,Brussels,Brussels,Woluwe-Saint-Pierre,HOUSE,HOUSE,599000.0,4.0,203.000000,INSTALLED,0,0,1,0.0,1,50.0,258.0,3,0,GOOD
2,Flanders,Antwerp,Schilde,HOUSE,HOUSE,655000.0,2.0,235.000000,INSTALLED,0,0,0,0.0,0,0.0,930.0,4,0,AS_NEW
3,Brussels,Brussels,Brussels,HOUSE,MIXED_USE_BUILDING,1475000.0,6.0,518.000000,NOT_INSTALLED,0,0,1,100.0,1,200.0,330.0,2,0,TO_RENOVATE
4,Wallonie,Liège,Huy,HOUSE,HOUSE,149500.0,3.0,100.000000,SEMI_EQUIPPED,0,0,0,0.0,0,0.0,265.0,3,0,GOOD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19974,Flanders,East Flanders,Ronse,HOUSE,HOUSE,167950.0,2.0,209.515373,NOT_INSTALLED,0,0,0,0.0,0,0.0,80.0,2,0,GOOD
19975,Flanders,West Flanders,Kortrijk,HOUSE,HOUSE,162000.0,2.0,209.515373,NOT_INSTALLED,0,0,1,0.0,0,0.0,139.0,2,0,0
19976,Wallonie,Namur,GODINNE,HOUSE,HOUSE,269000.0,3.0,106.000000,SEMI_EQUIPPED,0,0,0,0.0,1,454.0,529.0,4,0,0
19977,Flanders,Limburg,Kozen,HOUSE,HOUSE,629000.0,5.0,313.000000,HYPER_EQUIPPED,0,0,0,0.0,0,0.0,9540.0,4,0,0


In [4]:
#6.Divide the column names into two groups
# text (categorical) columns
text_value = [
    'region',
    'province',
    'property_type',
    'property_subtype',
    'building_state',
    'kitchen',
    'locality',
]
# numeric columns
number_value = [
    'number_rooms',
    'living_area',
    'terrace_area',
    'garden_area',
    'surface_land',
    'number_facades',
]

In [5]:
#7.cap outliers: values further than 3 standard deviations from the column mean
#  are pulled back to that limit. No rows are removed by this step.
# NOTE(review): mean/std are computed on the whole frame, i.e. before the
# train/test split done later in the notebook.
for col in number_value:
    mean = immo_info[col].mean()
    std_dev = immo_info[col].std()
    upper_limit = mean+3*std_dev
    lower_limit = mean-3*std_dev
    # range before capping
    print(immo_info[col].max(),immo_info[col].min())
    # Cast to float first so the column ends up float whether or not any value
    # is actually capped (the limits themselves are floats).
    immo_info[col] = immo_info[col].astype(float).clip(lower=lower_limit, upper=upper_limit)
    # range after capping
    print(immo_info[col].max(),immo_info[col].min())


89.0 0.0
9.075500606143835 0.0
26218.0 12.0
1150.2878382355075 12.0
1620.0 0.0
109.95205753340706 0.0
120000.0 0.0
5920.674177505749 0.0
917440.0 0.0
29672.5603708909 0.0
26 1
6.000456887999611 1.0


In [6]:
''' Data formating'''
# One-hot encode the text columns; each indicator column is named
# "<source column>_<value>" and stored as float.
dummies_df = pd.get_dummies(immo_info[text_value], prefix=text_value, dtype=float)
# Replace the original text columns with their indicator columns.
non_text_part = immo_info.drop(columns=text_value)
immo_info_merge = pd.concat([non_text_part, dummies_df], axis=1)
immo_info_merge.head()

Unnamed: 0,price,number_rooms,living_area,furnished,fireplace,terrace,terrace_area,garden,garden_area,surface_land,...,locality_oudegem,locality_pilar de la horadada,locality_rojales,locality_san juan de los terreros,locality_san pedro del pinatar,locality_sant josep de sa talaia,locality_©rez©e,locality_Écaussinnes,locality_Éghezée,locality_Érezée
0,460000.0,3.0,120.0,0,0,1,20.0,1,110.0,227.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,599000.0,4.0,203.0,0,0,1,0.0,1,50.0,258.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,655000.0,2.0,235.0,0,0,0,0.0,0,0.0,930.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1475000.0,6.0,518.0,0,0,1,100.0,1,200.0,330.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,149500.0,3.0,100.0,0,0,0,0.0,0,0.0,265.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Turn the encoded frame into NumPy arrays for scikit-learn:
# every column except the target goes into X ...
feature_frame = immo_info_merge.drop(columns=['price'])
X = feature_frame.to_numpy()
# ... and the price becomes a single-column 2-D array of shape (n_rows, 1).
y = immo_info_merge['price'].to_numpy()[:, np.newaxis]

X.shape, y.shape

((16449, 2629), (16449, 1))

In [11]:
'''Data training'''
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=10,test_size=0.2)
# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so without a fixed random_state the fitted tree (and both scores below)
# can change from one run to the next.
d_reg=DecisionTreeRegressor(random_state=1)
# Fit on the training portion only.
d_reg.fit(X_train, y_train)
# score() on a regressor is the R^2 coefficient of determination.
# R^2 on the data the tree was fitted on:
print('Train_score:',d_reg.score(X_train, y_train))
# R^2 on the held-out rows; a large gap to the train score indicates overfitting.
print('Test_score:',d_reg.score(X_test, y_test))

Train_score: 0.9998931522781395
Test_score: 0.5702504035660645


In [12]:
# 5-fold cross-validation of the decision tree on the full data set.
# The metric is R^2 (coefficient of determination), not an accuracy, so it is
# reported as a plain number rather than a percentage.
# The variable keeps its existing name `d_clas`, but the estimator is a regressor.
d_clas=DecisionTreeRegressor(random_state=1)
scores = cross_val_score(d_clas, X, y, cv=5, scoring='r2') # cv is the number of folds (k)
print(scores)
# mean and standard deviation of the per-fold R^2 values
print("R^2: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std()))

[0.37951124 0.5182774  0.59968545 0.61708963 0.49098149]
Accuracy: 52.11% (+/- 8.53)
