In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import urllib.parse, re
import matplotlib

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.impute import KNNImputer

In [21]:
df = pd.read_csv("utils/raw_data.csv", low_memory=False)
df = df.drop(df[df["Type"]=="house group"].index)
df = df.drop(df[df["Type"]=="apartment group"].index)

In [22]:
df = df[df["Type"]=="apartment"]

print("Shape: ",df.shape)
for i in df.columns:
    print(i, (40-len(i))*"-", df[i].isna().sum())

Shape:  (8849, 129)
id -------------------------------------- 0
Available as of ------------------------- 4182
Construction year ----------------------- 2120
Building condition ---------------------- 1822
Street frontage width ------------------- 7722
Number of frontages --------------------- 3181
Covered parking spaces ------------------ 6093
Outdoor parking spaces ------------------ 6711
Surroundings type ----------------------- 5040
Living area ----------------------------- 331
Living room surface --------------------- 4476
Dining room ----------------------------- 7853
How many fireplaces? -------------------- 8713
Kitchen type ---------------------------- 1986
Kitchen surface ------------------------- 5656
Bedrooms -------------------------------- 271
Bedroom 1 surface ----------------------- 4189
Bedroom 2 surface ----------------------- 5076
Bedroom 3 surface ----------------------- 7472
Bedroom 4 surface ----------------------- 8657
Bedroom 5 surface ----------------------- 881

In [23]:
for i in df.columns:
    print(i)

id
Available as of
Construction year
Building condition
Street frontage width
Number of frontages
Covered parking spaces
Outdoor parking spaces
Surroundings type
Living area
Living room surface
Dining room
How many fireplaces?
Kitchen type
Kitchen surface
Bedrooms
Bedroom 1 surface
Bedroom 2 surface
Bedroom 3 surface
Bedroom 4 surface
Bedroom 5 surface
Dressing room
Bathrooms
Toilets
Office surface
Office
Professional space
Attic surface
Isolated
Armored door
Surface of the plot
Land is facing street
Wooded land
Plot at rear
Flat land
Width of the lot on the street
Connection to sewer network
Gas, water & electricity
Garden surface
Garden orientation
Caretaker
Elevator
Accessible for disabled people
Intercom
Secure access / alarm
Air conditioning
TV cable
Visio phone
Jacuzzi
Sauna
Swimming pool
Internet
Primary energy consumption
Energy class
Unnamed: 54
Reference number of the EPC report
CO₂ emission
Yearly theoretical total energy consumption
Conformity certification for fuel tanks
H

In [24]:
df.groupby("Surroundings type")["id"].count()

Surroundings type
Concrete                                        1
Countryside                                   170
Fitted out                                    106
Isolated                                      490
Landscape                                     107
Living area (residential, urban or rural)     891
Mall                                           37
Shop street                                    49
Urban                                        1958
Name: id, dtype: int64

In [25]:
df = df[['id','Price','Zip','Type','Subtype','location',
       'Surroundings type',
       'Living area','Surface of the plot',
       'Bedrooms','Kitchen type','Bathrooms',
       'Building condition',
       'Construction year', 
       'Number of frontages',
       'Covered parking spaces', 'Outdoor parking spaces', 
       'Swimming pool',
       'Furnished',
       'How many fireplaces?',
       'Terrace','Terrace surface',
       'Garden','Garden surface',
       'Primary energy consumption','Energy class','Heating type'
       ]]


In [26]:
df = df.rename(columns={
    'location' :'Locality',
    'Transaction Type' : 'Type of sale',
    'Type' :'Type of property',
    'Subtype' : 'Subtype of property',
    'Number of frontages': 'Number of facades',
    'Bedrooms':'Number of rooms',
    'Surface of the plot' :'Surface of the land',
    'Kitchen type' : 'Fully equipped kitchen',
    'How many fireplaces?' : 'Open fire',
})

In [27]:
df['Locality'] = df['Locality'].apply(urllib.parse.unquote)

def clean_and_convert(column):
    column = column.apply(lambda x: re.sub('\D+', '', str(x)))
    column = column.replace('', np.nan)
    return column

df['Living area'] = clean_and_convert(df['Living area'])
df['Terrace surface'] = clean_and_convert(df['Terrace surface'])
df['Garden surface'] = clean_and_convert(df['Garden surface'])
df['Surface of the land'] = clean_and_convert(df['Surface of the land'])
df['Primary energy consumption'] = clean_and_convert(df['Primary energy consumption'])

In [28]:
# GARDEN AND TERRACE
conditions = [
    (df['Garden']== "Yes"),
    (df["Garden"].isna()) & (df["Garden surface"].isna()),
    (df["Garden surface"].notna())
    ]
values = [1, 0, 1]
df['Garden'] = np.select(conditions, values)

df.loc[(df["Garden"] == 0 ) & (df["Garden surface"].isna()), 'Garden surface'] = 0

conditions = [
    (df['Terrace']== "Yes"),
    (df["Terrace"].isna()) & (df["Terrace surface"].isna()),
    (df["Terrace surface"].notna())
    ]
values = [1, 0, 1]
df['Terrace'] = np.select(conditions, values)

df.loc[(df["Terrace"] == 0 ) & (df["Terrace surface"].isna()), 'Terrace surface'] = 0

In [29]:
def nan_replacement(column):
    column = column.replace("Yes",1)
    column = column.replace("No",0)
    column = column.replace('', np.nan).fillna(0)
    return column

df['Furnished'] = nan_replacement(df['Furnished'])
df['Swimming pool'] = nan_replacement(df['Swimming pool'])
df['Open fire'] = nan_replacement(df['Open fire'])

In [30]:
# Mapping dictionary for replacing values in the "kitchen" column
kitchen_mapping = {
    # np.nan: -1,
    'Not installed': 0,
    'Installed': 1,
    'Semi equipped': 2,
    'Hyper equipped': 3,
    'USA uninstalled' :0,
    'USA installed': 1,
    'USA semi equipped': 2,
    'USA hyper equipped' :3
}
# Replace values in the "Kitchen type" column with corresponding numbers and create a new column called "Kitchen values"
df['Kitchen values'] = df['Fully equipped kitchen'].map(kitchen_mapping).fillna(df['Fully equipped kitchen'])

building_cond_mapping = {
    # np.nan: -1,
    'To restore': 0,
    'To be done up': 2,
    'Just renovated': 3,
    'To renovate': 1,
    'Good': 3,
    'As new' :4
}

df['Building Cond. values'] = df['Building condition'].map(building_cond_mapping).fillna(df['Building condition'])

df = df.loc[:, ~df.columns.isin(['Fully equipped kitchen','Building condition'])]

In [31]:
df = df.drop(df[df["Living area"].isna()].index)
df = df.drop(df[df["Surface of the land"].isna()].index)

In [32]:
conditions = [
    (df["Covered parking spaces"].notna()) & (df["Outdoor parking spaces"].notna()),
    (df["Covered parking spaces"].isna()) & (df["Outdoor parking spaces"].isna()),
    (df["Covered parking spaces"].isna()) & (df["Outdoor parking spaces"].notna()),
    (df["Covered parking spaces"].notna()) & (df["Outdoor parking spaces"].isna())
    ]
values = [(df["Covered parking spaces"]+df["Outdoor parking spaces"]), 0, df["Outdoor parking spaces"],df["Covered parking spaces"]]
df['Parking'] = np.select(conditions, values)

df = df.loc[:, ~df.columns.isin(["Covered parking spaces","Outdoor parking spaces"])]

In [33]:
def get_province(zip_code):
    if 1000 <= zip_code <= 1299:
        return 'Brussels Capital Region'
    elif 1300 <= zip_code <= 1499:
        return 'Walloon Brabant'
    elif 1500 <= zip_code <= 1999 or 3000 <= zip_code <= 3499:
        return 'Flemish Brabant'
    elif 2000 <= zip_code <= 2999:
        return 'Antwerp'
    elif 3500 <= zip_code <= 3999:
        return 'Limburg'
    elif 4000 <= zip_code <= 4999:
        return 'Liège'
    elif 5000 <= zip_code <= 5999:
        return 'Namur'
    elif 6000 <= zip_code <= 6599 or 7000 <= zip_code <= 7999:
        return 'Hainaut'
    elif 6600 <= zip_code <= 6999:
        return 'Luxembourg'
    elif 8000 <= zip_code <= 8999:
        return 'West Flanders'
    elif 9000 <= zip_code <= 9999:
        return 'East Flanders'
    else:
        return 'Unknown'
        
df['Province'] = df['Zip'].apply(get_province)

In [16]:
df = df.astype({"Price":"float",
                "Number of rooms":"float",
                "Living area":"float",
                "Terrace surface":"float",
                "Garden surface":"float",
                "Surface of the land":"float",
                "Number of facades":"float",
                "Primary energy consumption":"float"})

In [34]:
aptdf = df.copy()

print("Apartment DataFrame shape (before): ",aptdf.shape)
print("Apartment data min (with outliers): ",aptdf['Price'].min())
print("Apartment data max (with outliers): ",aptdf['Price'].max())

# Remove outliers
def remove_outliers(df, columns, n_std) -> pd.DataFrame:
    for col in columns:
        print('Working on column: {}'.format(col))
        mean = df[col].mean()
        sd = df[col].std()
        df = df[(df[col] <= mean+(n_std*sd))]
    return df

new_housedf = remove_outliers(aptdf, ['Price'], 4)

print("_"*30)
print("Apartment DataFrame shape (after removing): ",aptdf.shape)
print("Apartment data min (without outliers): ", aptdf['Price'].min())
print("Apartment data max (without outliers): ", aptdf['Price'].max())

Apartment DataFrame shape (before):  (0, 27)
Apartment data min (with outliers):  nan
Apartment data max (with outliers):  nan
Working on column: Price
______________________________
Apartment DataFrame shape (after removing):  (0, 27)
Apartment data min (without outliers):  nan
Apartment data max (without outliers):  nan


In [None]:
df.groupby("Surroundings type")["Price"].max()

In [18]:
print(new_housedf.shape)
house_df = remove_outliers(new_housedf, ['Living area','Surface of the land'], 3)
print(house_df.shape)

(0, 27)
Working on column: Living area
Working on column: Surface of the land
(0, 27)


In [20]:
house_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,8869.0,10465470.0,163342.637732,8373923.0,10423860.0,10519106.0,10566849.0,10597410.0
Price,8869.0,491786.1,412795.546609,2500.0,255000.0,375000.0,569000.0,3100000.0
Zip,8869.0,5069.579,2989.496081,1000.0,2018.0,4520.0,8210.0,9991.0
Living area,8869.0,224.3935,135.041606,23.0,145.0,189.0,260.0,2272.0
Surface of the land,8869.0,933.536,1560.875779,1.0,204.0,450.0,983.0,16254.0
Number of rooms,8818.0,3.756181,1.586556,1.0,3.0,3.0,4.0,25.0
Bathrooms,7774.0,1.605223,1.214085,1.0,1.0,1.0,2.0,25.0
Construction year,5925.0,1959.536,42.779232,1755.0,1932.0,1965.0,1991.0,2025.0
Number of facades,7801.0,2.999744,0.876766,1.0,2.0,3.0,4.0,10.0
Swimming pool,8869.0,0.04498816,0.20729,0.0,0.0,0.0,0.0,1.0


In [21]:
def one_convert_to_nan(column):
    column = column.replace(1.0, np.nan)
    return column

house_df['Surface of the land'] = one_convert_to_nan(house_df['Surface of the land'])

In [22]:
knn_df = house_df.loc[:, ~house_df.columns.isin(["Price","Type of property","Subtype of property","Locality","Surroundings type","Energy class","Heating type","Province"])]
other = ['id', 'Zip', 'Living area', 'Surface of the land', 'Number of rooms',
       'Bathrooms', 'Construction year', 'Number of facades', 'Swimming pool',
       'Furnished', 'Open fire', 'Terrace', 'Terrace surface', 'Garden',
       'Garden surface', "Primary energy consumption",'Kitchen values', 'Building Cond. values', 'Parking']

impute_knn = KNNImputer(n_neighbors=5)

knn_df = impute_knn.fit_transform(knn_df).astype(float)

#Creating dfs with missing values filled in 
imputed_houses = pd.DataFrame(knn_df, columns= other)

#Creating dfs with prices
new_houses = house_df[['id',"Price","Type of property","Subtype of property","Locality","Surroundings type","Energy class","Heating type","Province"]]

#Merging dfs (with prices and without prices (with other values filled in))
complete_houses = pd.merge(new_houses, imputed_houses,on='id')


In [23]:
complete_houses.isna().sum()


id                               0
Price                            0
Type of property                 0
Subtype of property              0
Locality                         0
Surroundings type             5160
Energy class                     0
Heating type                  1329
Province                         0
Zip                              0
Living area                      0
Surface of the land              0
Number of rooms                  0
Bathrooms                        0
Construction year                0
Number of facades                0
Swimming pool                    0
Furnished                        0
Open fire                        0
Terrace                          0
Terrace surface                  0
Garden                           0
Garden surface                   0
Primary energy consumption       0
Kitchen values                   0
Building Cond. values            0
Parking                          0
dtype: int64

In [24]:
df_urbain = pd.read_csv('Urbain.csv')
postcode_set = set(df_urbain['Postcode'])
complete_houses['Urban_value'] = complete_houses['Zip'].apply(lambda x: 1 if x in postcode_set else 0)

In [25]:
mansion = ["manor house","mansion","castle","exceptional property"]
house = ["house","villa","bungalow","chalet","country cottage","farmhouse","mixed use building","town house"]
other = ["apartment block","other property"]

complete_houses["Mansion"] = complete_houses["Subtype of property"].apply(lambda x: 1 if x in mansion else 0)
complete_houses["House_villa"] = complete_houses["Subtype of property"].apply(lambda x: 1 if x in house else 0)
complete_houses["Other_house"] = complete_houses["Subtype of property"].apply(lambda x: 1 if x in other else 0)

In [26]:
complete_houses.to_csv("final_house.csv")