# 1. Feature Engineering
---

In [120]:
import typing
import numpy as np
import pandas as pd

---

In [71]:
def ordinal(entry: str) -> int:
    """
    Map a qualitative/ordinal label to its integer rank.

    Ranks shared by the kitchen-equipment and property-state labels:
        1 -> "Not equipped", "To be renovated"
        2 -> "Partially equipped", "Normal"
        3 -> "Fully equipped", "Excellent"
        4 -> "Super equipped", "Fully renovated"
        5 -> "New"

    Any value that is not one of these labels (including a missing value or
    an already-encoded integer) matches nothing and yields None.
    """
    rank_table = (
        (1, ("Not equipped", "To be renovated")),
        (2, ("Partially equipped", "Normal")),
        (3, ("Fully equipped", "Excellent")),
        (4, ("Super equipped", "Fully renovated")),
        (5, ("New",)),
    )
    for rank, labels in rank_table:
        if entry in labels:
            return rank
    # No label matched: same fall-through result as an implicit return.
    return None

In [90]:
def round_up_to_int(entry: float) -> int:
    """
    Round a float to the nearest whole number and return it as an integer.

    Despite the name, this does not take the ceiling: it relies on the
    built-in round(), so 2.4 -> 2, 2.6 -> 3, and exact halves go to the
    nearest even integer (2.5 -> 2, 3.5 -> 4).
    """
    nearest = round(entry, 0)
    return int(nearest)

In [133]:
def empty_as_one(entry: typing.Optional[float]) -> int:
    """
    Convert a value to an integer, using the default value 1 when it is empty.

    Args:
        entry: a number, or a missing value (None / NaN / anything
            pd.isnull() reports as null).

    Returns:
        1 if the entry is missing, otherwise int(entry). Note that int()
        truncates towards zero, it does not round (3.7 -> 3).
    """
    if pd.isnull(entry):
        return 1
    return int(entry)

In [178]:
def empty_as_zero(entry: typing.Optional[float]) -> int:
    """
    Convert a value to an integer, using the default value 0 when it is empty.

    Args:
        entry: a number, or a missing value (None / NaN / anything
            pd.isnull() reports as null).

    Returns:
        0 if the entry is missing, otherwise int(entry). Note that int()
        truncates towards zero, it does not round (3.7 -> 3).
    """
    if pd.isnull(entry):
        return 0
    return int(entry)

In [None]:
def empty_to_average(
    entry: typing.Optional[float],
    column_average: typing.Optional[float] = None,
) -> int:
    """
    Convert a value to an integer, using the rounded column average when it is empty.

    A single entry cannot see the rest of its column, so the average has to be
    supplied by the caller, for example:
        column.apply(empty_to_average, column_average=column.mean())

    Args:
        entry: a number, or a missing value (None / NaN / anything
            pd.isnull() reports as null).
        column_average: average of the column the entry comes from; only
            needed when the entry is missing.

    Returns:
        int(entry) (truncated towards zero) when the entry is present,
        otherwise the column average rounded to the nearest integer.

    Raises:
        ValueError: if the entry is missing and no usable column_average
            was provided.
    """
    if not pd.isnull(entry):
        return int(entry)
    if pd.isnull(column_average):
        raise ValueError(
            "empty_to_average needs a non-null column_average to fill an empty entry"
        )
    return int(round(column_average))

---

In [134]:
# NOTE(review): absolute, machine-specific path; the notebook only runs on the
# author's machine until this is made relative to the project directory.
csv_path = "/Users/tonyanciaux/Documents/AI Bootcamp - BeCode/project3_immovlan_analysis/utils/data_cleaned.csv"

# The cleaned data set is semicolon-separated.
df = pd.read_csv(csv_path, sep=";")
df.head()

Unnamed: 0.1,Unnamed: 0,Locality,Type of property,Subtype of property,Price,Number of bedrooms,Livable surface,Kitchen equipment,Number of bathrooms,Number of toilets,...,Terrace,Surface terrace,Surface bedroom 2,Security door,Access for disabled,Sewer Connection,Garden,Surface garden,Surface bedroom 3,Garage
0,0,1000 Brussels,flat,apartment,333500,2.0,100.0,Super equipped,1,1,...,0,0.0,1098298429,0,0,1,0,,8916937566,0
1,1,1000 Brussels,flat,apartment,379000,1.0,80.0,Super equipped,1,1,...,0,0.0,8786387434,0,0,1,0,,7133550053,0
2,2,1000 Brussels,flat,apartment,295000,2.0,80.0,Partially equipped,1,1,...,1,4.0,8786387434,0,0,1,0,,7133550053,0
3,3,1000 Brussels,flat,loft,635000,2.0,217.0,Super equipped,1,3,...,1,40.0,21,1,1,1,0,,1934975452,0
4,4,1000 Brussels,flat,loft,595000,2.0,207.0,Partially equipped,2,2,...,1,10.0,16,0,1,1,0,,1845806076,0


---
---

### Dropping columns

In [135]:
# Combine "Type of property" and "Subtype of property" into a single
# "<type> - <subtype>" column.
main_type = df["Type of property"]
sub_type = df["Subtype of property"]
df["Property type"] = main_type + " - " + sub_type

In [136]:
# Remove columns judged low-value or too strongly correlated with others.
# The two type columns go too: they are now merged into "Property type".
columns_to_drop = [
    "Unnamed: 0",
    "Security door",
    "Sewer Connection",
    "Access for disabled",
    "Orientation of the front facade",
    "Furnished",
    "Type of property",
    "Subtype of property",
    "Surface bedroom 3",
    "Surface bedroom 2",
    "Surface kitchen",
    "Number of toilets",
    "Floor of appartment",
    "Number of floors",
    "Build Year",
    "Entry phone",
    "Elevator",
    "Number of showers",
    "Terrace",
    "Garden",
]
df = df.drop(columns=columns_to_drop)

In [137]:
# Show which columns remain and how many there are.
remaining_columns = df.columns
print(remaining_columns)
print("Number of columns:", df.shape[1])

Index(['Locality', 'Price', 'Number of bedrooms', 'Livable surface',
       'Kitchen equipment', 'Number of bathrooms', 'Balcony',
       'State of the property', 'Surface bedroom 1', 'Surface of living-room',
       'Cellar', 'Number of facades', 'Surface terrace', 'Surface garden',
       'Garage', 'Property type'],
      dtype='object')
Number of columns: 16


### Categorical 
#### Qualitative / Nominal Data
categories that cannot be put in any order

In [138]:
# Nominal (unordered) categorical columns.
nominal_columns = ["Locality", "Property type"]
nominal_data = df[nominal_columns]

In [139]:
# Number of rows per locality, most frequent first.
df["Locality"].value_counts()

1000 Brussels           319
1050 Elsene             314
1030 Schaarbeek         293
8370 Blankenberge       258
9300 Aalst              257
                       ... 
4650 Chaineux             1
7973 Grandglise           1
4632 Cérexhe-Heuseux      1
4630 Ayeneux              1
5360 Hamois               1
Name: Locality, Length: 899, dtype: int64

In [140]:
# Number of rows per merged property type, most frequent first.
df["Property type"].value_counts()

house - residence         7605
flat - apartment          6574
house - villa              858
flat - ground-floor        486
flat - duplex              447
house - mixed-building     356
flat - penthouse           334
flat - studio              236
house - master-house       129
flat - loft                 84
house - cottage             84
house - bungalow            79
flat - triplex              36
house - chalet              25
house - mansion             14
Name: Property type, dtype: int64

#### Qualitative / Ordinal 
categories that can be ordered

In [141]:
# Ordinal (ordered) categorical columns, copied so they can be encoded
# without touching df until the result is written back.
ordinal_columns = ["Kitchen equipment", "State of the property"]
ordinal_data = df[ordinal_columns].copy()

In [142]:
# Distribution of the kitchen-equipment values in the working copy.
ordinal_data["Kitchen equipment"].value_counts()

Partially equipped    10171
Super equipped         3905
Fully equipped         2745
Not equipped            526
Name: Kitchen equipment, dtype: int64

#### Interpretation: 

- Super equipped = 4
- Fully equipped = 3
- Partially equipped = 2
- Not equipped = 1

In [143]:
# Encode the kitchen-equipment labels as integer ranks with ordinal().
# Run once: ordinal() only recognises the text labels, so applying it again
# to the already-encoded integers would turn every value into None.
kitchen_labels = ordinal_data["Kitchen equipment"]
ordinal_data["Kitchen equipment"] = kitchen_labels.apply(ordinal)

In [144]:
# Copy the "Kitchen equipment" column from the working copy back into df.
df["Kitchen equipment"] = ordinal_data["Kitchen equipment"]

In [145]:
# Check the distribution of "Kitchen equipment" values now stored in df.
df["Kitchen equipment"].value_counts()

2    10171
4     3905
3     2745
1      526
Name: Kitchen equipment, dtype: int64

In [146]:
# Distribution of the property-state values in the working copy.
ordinal_data["State of the property"].value_counts()

Normal             7884
New                4550
Excellent          2561
To be renovated    1975
Fully renovated     377
Name: State of the property, dtype: int64

#### Interpretation: 
 - New = 5
 - Fully renovated = 4
 - Excellent = 3
 - Normal = 2
 - To be renovated = 1 

In [147]:
# Encode the property-state labels as integer ranks with ordinal().
# Run once: ordinal() only recognises the text labels, so applying it again
# to the already-encoded integers would turn every value into None.
state_labels = ordinal_data["State of the property"]
ordinal_data["State of the property"] = state_labels.apply(ordinal)

In [148]:
# Copy the "State of the property" column from the working copy back into df.
df["State of the property"] = ordinal_data["State of the property"]

In [149]:
# Check the distribution of "State of the property" values now stored in df.
df["State of the property"].value_counts()

2    7884
5    4550
3    2561
1    1975
4     377
Name: State of the property, dtype: int64

### Numerical 
#### Discrete & Continuous

In [150]:
# "Surface bedroom 2" and "Surface bedroom 3" were dropped, so give the
# remaining bedroom-surface column a more descriptive name.
renamed_columns = {"Surface bedroom 1": "Surface master bedroom"}
df = df.rename(columns=renamed_columns)

In [151]:
# Numerical (discrete and continuous) columns.
numerical_columns = [
    "Price",
    "Surface master bedroom",
    "Number of bedrooms",
    "Livable surface",
    "Number of bathrooms",
    "Balcony",
    "Surface of living-room",
    "Number of facades",
    "Surface terrace",
    "Surface garden",
    "Garage",
]
numerical_data = df[numerical_columns]

---
---

### Null values

In [188]:
# Dtypes and non-null counts per column, to see which columns still have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17345 entries, 0 to 17346
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Locality                17345 non-null  object 
 1   Price                   17345 non-null  int64  
 2   Number of bedrooms      17345 non-null  int64  
 3   Livable surface         15418 non-null  float64
 4   Kitchen equipment       17345 non-null  int64  
 5   Number of bathrooms     17345 non-null  int64  
 6   Balcony                 17345 non-null  int64  
 7   State of the property   17345 non-null  int64  
 8   Surface master bedroom  15590 non-null  object 
 9   Surface of living-room  15623 non-null  object 
 10  Cellar                  17345 non-null  int64  
 11  Number of facades       17345 non-null  int64  
 12  Surface terrace         17345 non-null  int64  
 13  Surface garden          17345 non-null  int64  
 14  Garage                  17345 non-null

In [153]:
# Work on a copy of the columns that need null handling, as a safety net:
# df is only overwritten once a column has been filled.
columns_to_fill = [
    "Number of bedrooms",
    "Livable surface",
    "Surface master bedroom",
    "Surface of living-room",
    "Surface terrace",
    "Surface garden",
]
null_value = df[columns_to_fill].copy()

---

In [176]:
# Distribution of the number of bedrooms in df.
df["Number of bedrooms"].value_counts()

2     5794
3     5671
1     2358
4     2181
5      780
6      324
7      103
8       54
9       22
10      17
12      12
11      10
14       8
16       4
20       3
13       1
23       1
22       1
15       1
Name: Number of bedrooms, dtype: int64

In [168]:
# Missing bedroom counts default to 1; present values are cast to int.
bedrooms = null_value["Number of bedrooms"]
null_value["Number of bedrooms"] = bedrooms.apply(empty_as_one)

In [169]:
# Copy the "Number of bedrooms" column from the working copy back into df.
df["Number of bedrooms"] = null_value["Number of bedrooms"]

In [175]:
# Sanity check: list any rows reporting 40 or more bedrooms.
has_40_or_more_bedrooms = df["Number of bedrooms"] >= 40
df_filtered = df[has_40_or_more_bedrooms]
df_filtered.head()

Unnamed: 0,Locality,Price,Number of bedrooms,Livable surface,Kitchen equipment,Number of bathrooms,Balcony,State of the property,Surface master bedroom,Surface of living-room,Cellar,Number of facades,Surface terrace,Surface garden,Garage,Property type


---

In [181]:
# Distribution of terrace surfaces in the working copy.
null_value["Surface terrace"].value_counts()

0       10719
10        378
20        365
8         339
15        317
        ...  
112         1
340         1
1050        1
135         1
537         1
Name: Surface terrace, Length: 146, dtype: int64

In [180]:
# Missing terrace surfaces default to 0; present values are cast to int.
terrace_surface = null_value["Surface terrace"]
null_value["Surface terrace"] = terrace_surface.apply(empty_as_zero)

In [182]:
# Copy the "Surface terrace" column from the working copy back into df.
df["Surface terrace"] = null_value["Surface terrace"]

In [186]:
# Distribution of garden surfaces in the working copy.
null_value["Surface garden"].value_counts()

0       14647
100        71
50         67
200        62
40         49
        ...  
372         1
453         1
381         1
368         1
2940        1
Name: Surface garden, Length: 678, dtype: int64

In [185]:
# Missing garden surfaces default to 0; present values are cast to int.
garden_surface = null_value["Surface garden"]
null_value["Surface garden"] = garden_surface.apply(empty_as_zero)

In [187]:
# Copy the "Surface garden" column from the working copy back into df.
df["Surface garden"] = null_value["Surface garden"]

--- 