In [144]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import cross_val_score

Improt the dataset

In [145]:
#setup display options
pd.set_option('display.max_columns', None)
pd.set_option('display.min_rows', 15)

df = pd.read_csv("..\data\data_cleaned.csv",index_col=[0])


In [146]:
df.head()

Unnamed: 0,id,type of property,subtype of property,locality,latitude,longitude,street,postalCode,price,type of sale,number of bedrooms,living area,livable area,fully equipped kitchen,furnished,open fire,terrace,terrace area,garden,garden area,total property area,total land area,number of facades,swimming pool,state of the building,Gemeente,Province,Population( per province),Area(per province),Density,kitchen,price per sqr
0,10689265,HOUSE,MIXED_USE_BUILDING,Koekelberg,50.863091,4.329928,Rue du Noyer 86,1081,575000,residential_sale,3,0,220.0,installed,False,False,True,0.0,False,0.0,220.0,71.0,2.0,False,good,Koekelberg,Brussel,1208542.0,162.4,7442.0,,2613.636364
1,10687297,HOUSE,EXCEPTIONAL_PROPERTY,Beveren,51.224437,4.246054,,9120,695000,residential_sale,4,28,269.0,hyper_equipped,False,False,True,41.0,True,300.0,269.0,583.0,3.0,False,as_new,Beveren,Oost-Vlaanderen,1515064.0,3007.0,504.0,Hyper equipped,2583.643123
2,10686031,HOUSE,HOUSE,Seraing,50.598224,5.518784,Place Merlot 6,4100,144000,residential_sale,2,14,105.0,semi_equipped,False,False,True,0.0,True,25.0,105.0,105.0,2.0,False,good,Seraing,Liège,1106992.0,3857.0,288.0,Semi equipped,1371.428571
3,10686030,HOUSE,HOUSE,Seraing,50.601548,5.510799,Place Merlot 6,4100,199000,residential_sale,4,25,125.0,installed,False,False,True,108.0,True,28.0,125.0,229.0,2.0,False,good,Seraing,Liège,1106992.0,3857.0,288.0,,1592.0
4,10686027,HOUSE,VILLA,Herzele,50.891145,3.857915,Hoogstraat 20,9550,539000,residential_sale,4,65,370.0,hyper_equipped,False,False,True,60.0,False,0.0,370.0,911.0,4.0,False,as_new,Herzele,Oost-Vlaanderen,1515064.0,3007.0,504.0,Hyper equipped,1456.756757


In [147]:
df["subtype of property"].value_counts()
df["latitude"].isna().value_counts()

latitude
False    12976
True      2694
Name: count, dtype: int64

Data cleaning!
1. fillna
2. remove castles
3. scaling certain data
4. get dummies


In [148]:
#drop Nan rows
df.drop(df["kitchen"].isna().index)

#remove the properties without coordinates
lon_la = (df["latitude"].isna()) | (df["longitude"].isna())
df.drop(df.loc[lon_la].index,inplace=True)

#remove castles
castle = df[df["subtype of property"]=="CASTLE"].index
df.drop(castle,inplace=True)


def convert(n):
    return int(n/100)
df["digit"]=df["postalCode"].agg(convert)

#get dummies for catagorical data:
get_dummies = ["Province","type of property","kitchen","subtype of property","state of the building","garden","terrace","digit"]
df =pd.get_dummies(data=df, columns=get_dummies)

df.drop(columns=["fully equipped kitchen"],inplace=True)

#replace True with 1, and False with 0, turn them into numerical data
df.replace([True,False],[1,0],inplace=True)


In [149]:
df.shape

(12970, 153)

Remove outliners and castles

In [150]:
def remove_outliners(df,columns,n_std):
    for col in columns:

        mean = df[col].mean()
        sd = df[col].std()
        df = df[(df[col] <= mean+(n_std*sd))]
        df = df[(df[col] >= mean-(n_std*sd))]

    return df

# def remove_outliner(df, col_names):
#     for col_name in col_names:
#         q1 = df[col_name].quantile(0.25)
#         q3 = df[col_name].quantile(0.75)
#         iqr = q3-q1 #Interquartile range
#         fence_low  = q1-1.5*iqr
#         fence_high = q3+1.5*iqr
#         df = df.loc[(df[col_name] > fence_low) & (df[col_name] < fence_high)]

#     return df



In [151]:
# print(df["price"].std())
# print(df["price"].mean())
df = remove_outliners(df, ["number of bedrooms","livable area","living area",
                           "total property area","garden area","total land area","terrace area"],3)
df.shape

(12299, 153)

Data formatting: 
1.normalize the data
2.create X,y, split them into training and testing

In [152]:
# # fix the distributions of number of bedrooms
# df["number of bedrooms"] = np.log(df["number of bedrooms"] + 1)

#normalize these columns
columns_to_normalize = ["latitude","longitude","living area","livable area",
                        "garden area","total property area","total land area","number of facades","number of bedrooms","terrace area"]

# print(df["price"].sort_values())

In [153]:
# price would be the target
columns_to_drop = ["id","locality","street","price","type of sale"
                   ,"Density","Area(per province)",
                   "Population( per province)","price per sqr","open fire","postalCode","Gemeente"
                   ]
#Now normalize the data
y = df.loc[:,"price"].to_numpy().reshape(-1,1)

# df[columns_to_normalize]=preprocessing.normalize(df[columns_to_normalize])
X=df.drop(columns=columns_to_drop,axis=1)
X = X.to_numpy()



# X=preprocessing.normalize(X_raw) 
# print(X.shape,y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2,random_state=21)


In [154]:
print(X.shape, X_train.shape, y_train.shape)


(12299, 141) (9839, 141) (9839, 1)


visualization of the raw_data

In [155]:
df.drop(columns=columns_to_drop,axis=1).head()

Unnamed: 0,latitude,longitude,number of bedrooms,living area,livable area,furnished,terrace area,garden area,total property area,total land area,number of facades,swimming pool,Province_Antwerpen,Province_Brabant wallon,Province_Brussel,Province_Hainaut,Province_Limburg,Province_Liège,Province_Luxembourg,Province_Namur,Province_Oost-Vlaanderen,Province_Vlaams-Brabant,Province_West-Vlaanderen,type of property_APARTMENT,type of property_HOUSE,kitchen_Equipped,kitchen_Hyper equipped,kitchen_Not equipped,kitchen_Semi equipped,subtype of property_APARTMENT,subtype of property_APARTMENT_BLOCK,subtype of property_BUNGALOW,subtype of property_CHALET,subtype of property_COUNTRY_COTTAGE,subtype of property_DUPLEX,subtype of property_EXCEPTIONAL_PROPERTY,subtype of property_FARMHOUSE,subtype of property_FLAT_STUDIO,subtype of property_GROUND_FLOOR,subtype of property_HOUSE,subtype of property_KOT,subtype of property_LOFT,subtype of property_MANOR_HOUSE,subtype of property_MANSION,subtype of property_MIXED_USE_BUILDING,subtype of property_PENTHOUSE,subtype of property_SERVICE_FLAT,subtype of property_TOWN_HOUSE,subtype of property_TRIPLEX,subtype of property_VILLA,state of the building_0,state of the building_as_new,state of the building_good,state of the building_just_renovated,state of the building_to_be_done_up,state of the building_to_renovate,state of the building_to_restore,garden_False,garden_True,terrace_False,terrace_True,digit_10,digit_11,digit_12,digit_13,digit_14,digit_15,digit_16,digit_17,digit_18,digit_19,digit_20,digit_21,digit_22,digit_23,digit_24,digit_25,digit_26,digit_28,digit_29,digit_30,digit_31,digit_32,digit_33,digit_34,digit_35,digit_36,digit_37,digit_38,digit_39,digit_40,digit_41,digit_42,digit_43,digit_44,digit_45,digit_46,digit_47,digit_48,digit_49,digit_50,digit_51,digit_53,digit_55,digit_56,digit_60,digit_61,digit_62,digit_64,digit_65,digit_66,digit_67,digit_68,digit_69,digit_70,digit_71,digit_73,digit_75,digit_76,digit_77,digit_78,digit_79,digit_80,digit_82,digit_83,digit_84,digit_85,digit_86,digit_87,digit_88,digit_89,digit_90,digit_91,digit_92,digit_93,digit_94,digit_95,digit_96,digit_97,digit_98,digit_99
0,50.863091,4.329928,3,0,220.0,0,0.0,0.0,220.0,71.0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,51.224437,4.246054,4,28,269.0,0,41.0,300.0,269.0,583.0,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,50.598224,5.518784,2,14,105.0,0,0.0,25.0,105.0,105.0,2.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,50.891145,3.857915,4,65,370.0,0,60.0,0.0,370.0,911.0,4.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,50.821444,4.323519,4,16,215.0,0,10.0,90.0,215.0,170.0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Model selection :Linear Regression

In [156]:
reg = LinearRegression()
reg.fit(X_train, y_train)

# y_pred = reg.predict(X_test)
print(reg.score(X_train,y_train))
print(reg.score(X_test,y_test))



0.4662342125772574
0.4945542644447517


Decision Tree?


In [162]:
from sklearn.tree import DecisionTreeRegressor


clf = DecisionTreeRegressor()
#min_samples_split=50
clf.fit(X_train,y_train)
clf.score(X_train,y_train)
clf.score(X_test,y_test)




0.7061781717779012

Try XGboost

In [164]:
#use XGBregressor! instead of XGBclassifier!!
from xgboost import XGBRegressor
model= XGBRegressor()
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))


0.9737001799935574
0.8446516128459934
