# Model Development 

## Libraries & Data Import

In [243]:
import pandas as pd 
import numpy as np
import json
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
# Load the cleaned dataset: semicolon-separated, with the first CSV column used as the row index.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs on the author's machine.
cleaned_csv_path = "/Users/tonyanciaux/Documents/AI Bootcamp - BeCode/project3_immovlan_analysis/utils/data_cleaned.csv"
df = pd.read_csv(cleaned_csv_path, sep=";", index_col=0)

"""
The expected input should be in the following JSON format: 
        {
        "data": {
        "area": "MANDATORY [int]",
        "property-type": "MANDATORY ['Appartment', 'House'] [str]",
        "bedrooms-number": "MANDATORY [int]",
        "province": "MANDATORY [str]",
        "furnished": "OPTIONAL [bool]", 
        "garden": "OPTIONAL [bool]",
        "garden-area": "OPTIONAL [int]",
        "kitchen": "OPTIONAL ['Not equipped', 'Partially equipped', 'Fully equipped', 'Super equipped']",
        "terrace": "OPTIONAL [bool]",
        "terrace-area": "OPTIONAL [int]",
        "facades-number": "OPTIONAL [int]",
        "building-state": "OPTIONAL ['To be renovated', 'Normal', 'Excellent', 'Fully renovated', 'New']"
        }
        }
"""

## Data Cleaning

### Sorting out Columns

In [3]:
df.columns

Index(['Locality', 'Type of property', 'Subtype of property', 'Price',
       'Number of bedrooms', 'Livable surface', 'Kitchen equipment',
       'Number of bathrooms', 'Number of toilets', 'Floor of appartment',
       'Number of floors', 'Balcony', 'State of the property', 'Build Year',
       'Surface bedroom 1', 'Furnished', 'Surface of living-room', 'Cellar',
       'Surface kitchen', 'Entry phone', 'Elevator', 'Number of showers',
       'Orientation of the front facade', 'Number of facades', 'Terrace',
       'Surface terrace', 'Surface bedroom 2', 'Security door',
       'Access for disabled', 'Sewer Connection', 'Garden', 'Surface garden',
       'Surface bedroom 3', 'Garage'],
      dtype='object')

In [5]:
# Restrict the listings to the twelve columns of interest, in the order given here.
selected_columns = [
    "Livable surface",
    "Type of property",
    "Number of bedrooms",
    "Furnished",
    "Locality",
    "Garden",
    "Surface garden",
    "Kitchen equipment",
    "Terrace",
    "Surface terrace",
    "Number of facades",
    "State of the property",
]
sorted_df = df[selected_columns]

In [None]:
# Drop listings that lack a livable surface or a bedroom count.
# At this point the frame still carries the original CSV headers (the rename to
# "area" / "bedrooms-number" only happens in the "Renaming Columns" section),
# so the subset has to use those headers to avoid a KeyError.
sorted_df = sorted_df.dropna(subset=["Livable surface", "Number of bedrooms"])

### Renaming Columns

In [6]:
# Map the CSV column headers onto the hyphenated field names of the JSON input format.
# Note that "Locality" becomes "zip-code".
input_field_names = {
    "Livable surface": "area",
    "Type of property": "property-type",
    "Number of bedrooms": "bedrooms-number",
    "Furnished": "furnished",
    "Locality": "zip-code",
    "Garden": "garden",
    "Surface garden": "garden-area",
    "Kitchen equipment": "kitchen",
    "Terrace": "terrace",
    "Surface terrace": "terrace-area",
    "Number of facades": "facades-number",
    "State of the property": "building-state",
}
sorted_df = sorted_df.rename(columns=input_field_names)

In [10]:
sorted_df

Unnamed: 0,area,property-type,bedrooms-number,furnished,zip-code,garden,garden-area,kitchen,terrace,terrace-area,facades-number,building-state
0,100.0,flat,2.0,0,1000 Brussels,0,,Super equipped,0,0.0,2,Normal
1,80.0,flat,1.0,0,1000 Brussels,0,,Super equipped,0,0.0,2,Normal
2,80.0,flat,2.0,0,1000 Brussels,0,,Partially equipped,1,4.0,2,Normal
3,217.0,flat,2.0,0,1000 Brussels,0,,Super equipped,1,40.0,2,Fully renovated
4,207.0,flat,2.0,0,1000 Brussels,0,,Partially equipped,1,10.0,2,Fully renovated
...,...,...,...,...,...,...,...,...,...,...,...,...
18475,103.0,flat,2.0,0,9991 Adegem,1,127.0,Fully equipped,1,21.0,3,New
18476,53.0,flat,1.0,0,9991 Adegem,0,,Fully equipped,0,0.0,2,New
18500,99.0,flat,3.0,0,9991 Adegem,0,,Fully equipped,0,0.0,3,New
18503,630.0,house,6.0,0,9992 Middelburg,0,,Partially equipped,0,0.0,4,Excellent


### Total Cleaning Function

#### Function Definition

In [424]:
default_columns = ['price', 'bedroom-numbers', 'area', 'kitchen', 'bathroom-number', 'balcony', 'building-state', 
                   'master-bedroom-area', 'living-room-area', 'cellar', 'facades-number', 'terrace-area', 
                   'garden-area', 'garage', 'apartment', 'bungalow', 'chalet', 'cottage', 'duplex', 
                   'ground-floor', 'loft', 'mansion', 'master-house', 'mixed-building', 'penthouse', 'residence', 
                   'studio', 'triplex', 'villa', 'Antwerp Province', 'Brussels-Capital Region', 
                   'East Flanders Province', 'Flemish Brabant Province', 'Hainaut Province', 'Limburg Province', 
                   'Liège Province', 'Luxembourg Province', 'Namur Province', 'Walloon Brabant Province', 
                   'West Flanders Province']


# Inclusive (first, last) zip-code bounds of each region, stored as 4-digit strings.
zipcodes = {
    ("1000", "1299"): "Brussels-Capital Region",
    ("1300", "1499"): "Walloon Brabant Province",
    ("1500", "1999"): "Flemish Brabant Province",
    ("2000", "2999"): "Antwerp Province",
    ("3000", "3499"): "Flemish Brabant Province",
    ("3500", "3999"): "Limburg Province",
    ("4000", "4999"): "Liège Province",
    ("5000", "5999"): "Namur Province",
    ("6000", "6599"): "Hainaut Province",
    ("6600", "6999"): "Luxembourg Province",
    ("7000", "7999"): "Hainaut Province",
    ("8000", "8999"): "West Flanders Province",
    ("9000", "9999"): "East Flanders Province",
}

# Ordinal encodings applied by preprocess() to the categorical inputs.
# NOTE(review): the first rows of ML_ready_data.csv appear to encode the same labels
# one step higher (e.g. "Partially equipped" -> 2, "Normal" -> 2) — confirm which
# scale the trained model expects before feeding it preprocess() output.
_KITCHEN_LEVELS = {
    "Not equipped": 0,
    "Partially equipped": 1,
    "Fully equipped": 2,
    "Super equipped": 3,
}
_BUILDING_STATE_LEVELS = {
    "To be renovated": 0,
    "Normal": 1,
    "Excellent": 2,
    "Fully renovated": 3,
    "New": 4,
}


def _zip_to_province(zip_code):
    """Return the region name for a zip code, or the input unchanged if none matches.

    Accepts an int (5000) or a string starting with the 4-digit code
    ("5000 Somewhere"). Range bounds are inclusive.
    """
    leading = str(zip_code).strip()[:4]
    if len(leading) != 4 or not leading.isdecimal():
        return zip_code
    code = int(leading)
    for (low, high), province in zipcodes.items():
        if int(low) <= code <= int(high):
            return province
    return zip_code


def preprocess(data):
    """
    Clean one property description for the prediction model.

    :param data: dict of the form {"data": {field: value, ...}} (the parsed JSON
        payload, not a file). The input dict is not modified.
    :return: DataFrame with a single "data" column indexed by field name, where
        - "kitchen" and "building-state" labels are replaced by their ordinal code
          (unknown labels are left as they are),
        - "zip-code" is replaced by the name of its region when it falls in a known range,
        - missing values (None / NaN) are replaced by 0.
        Boolean fields are returned unchanged; absent optional fields stay absent.
    :raises KeyError: if the top-level "data" key is missing.
    """
    # Work on a copy so the caller's payload keeps its original values.
    record = dict(data["data"])

    # Ordinal-encode the categorical fields that are present and recognised.
    for field, levels in (
        ("kitchen", _KITCHEN_LEVELS),
        ("building-state", _BUILDING_STATE_LEVELS),
    ):
        label = record.get(field)
        if isinstance(label, str) and label in levels:
            record[field] = levels[label]

    # Zip-code -> region name.
    if "zip-code" in record:
        record["zip-code"] = _zip_to_province(record["zip-code"])

    # Fill the empty values in the input with 0.
    record = {
        field: 0 if value is None or (isinstance(value, float) and np.isnan(value)) else value
        for field, value in record.items()
    }

    return pd.DataFrame({"data": record})

In [412]:
# Sample request payload used to exercise preprocess().
# "garden-area" is NaN so that the fill-with-0 step has something to replace.
# Note: this payload carries a "zip-code" field, whereas the format description
# near the top of the notebook lists "province".
dummy = {
    "data": {
        "area": 134,
        "property-type": "Appartment",
        "bedrooms-number": 3,
        "furnished": False,
        "zip-code": "5000 Somewhere",
        "garden": True,
        "garden-area": np.nan,
        "kitchen": "Partially equipped",
        "terrace": True,
        "terrace-area": 100,
        "facades-number": 2,
        "building-state": "Excellent",
    }
}

In [422]:
# Wrap the payload in a DataFrame: it has a single "data" column indexed by field name.
dummy_pd = pd.DataFrame(dummy)
dummy_pd["data"]

area                              134
bedrooms-number                     3
building-state              Excellent
facades-number                      2
furnished                       False
garden                           True
garden-area                       NaN
kitchen            Partially equipped
property-type              Appartment
terrace                          True
terrace-area                      100
zip-code               5000 Somewhere
Name: data, dtype: object

In [425]:
preprocess(dummy)

Unnamed: 0,data
area,134
bedrooms-number,3
building-state,2
facades-number,2
furnished,False
garden,True
garden-area,0
kitchen,1
property-type,Appartment
terrace,True


#### Function Calling

## Regression Model

### DF Preparation

In [143]:
# Load the model-ready table (numeric features plus one-hot property-subtype and region columns).
# NOTE(review): this is an absolute, machine-specific path; the cell only runs on the author's machine.
ml_ready_csv_path = "/Users/tonyanciaux/Documents/AI Bootcamp - BeCode/Immo_Eliza_Regression/ML_ready_data.csv"
ml_ready_data = pd.read_csv(ml_ready_csv_path)

In [144]:
ml_ready_data.head()

Unnamed: 0,Price,Number of bedrooms,Livable surface,Kitchen equipment,Number of bathrooms,Balcony,State of the property,Surface master bedroom,Surface of living-room,Cellar,...,Brussels-Capital Region,East Flanders Province,Flemish Brabant Province,Hainaut Province,Limburg Province,Liège Province,Luxembourg Province,Namur Province,Walloon Brabant Province,West Flanders Province
0,333500.0,2.0,100.0,4.0,1.0,0.0,2.0,13.0,29.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,379000.0,1.0,80.0,4.0,1.0,0.0,2.0,21.0,25.0,1.0,...,1,0,0,0,0,0,0,0,0,0
2,295000.0,2.0,80.0,2.0,1.0,0.0,2.0,10.0,28.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,635000.0,2.0,217.0,4.0,1.0,0.0,4.0,25.0,63.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,595000.0,2.0,207.0,2.0,2.0,0.0,4.0,16.0,60.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [145]:
ml_ready_data.columns

Index(['Price', 'Number of bedrooms', 'Livable surface', 'Kitchen equipment',
       'Number of bathrooms', 'Balcony', 'State of the property',
       'Surface master bedroom', 'Surface of living-room', 'Cellar',
       'Number of facades', 'Surface terrace', 'Surface garden', 'Garage',
       'apartment', 'bungalow', 'chalet', 'cottage', 'duplex', 'ground-floor',
       'loft', 'mansion', 'master-house', 'mixed-building', 'penthouse',
       'residence', 'studio', 'triplex', 'villa', 'Antwerp Province',
       'Brussels-Capital Region', 'East Flanders Province',
       'Flemish Brabant Province', 'Hainaut Province', 'Limburg Province',
       'Liège Province', 'Luxembourg Province', 'Namur Province',
       'Walloon Brabant Province', 'West Flanders Province'],
      dtype='object')

### Standardisation

In [175]:
# Standardise every column of the model-ready table (this includes "Price" and the
# one-hot columns) and put the result back into a DataFrame with the same headers.
# NOTE(review): the training cells further down read from ml_ready_data, not from
# `scalled`, so this standardised frame does not feed the fitted model.
scaler = StandardScaler()
scalled = pd.DataFrame(
    scaler.fit_transform(ml_ready_data.values),
    columns=ml_ready_data.columns,
)
scalled.head()

Unnamed: 0,Price,Number of bedrooms,Livable surface,Kitchen equipment,Number of bathrooms,Balcony,State of the property,Surface master bedroom,Surface of living-room,Cellar,...,Brussels-Capital Region,East Flanders Province,Flemish Brabant Province,Hainaut Province,Limburg Province,Liège Province,Luxembourg Province,Namur Province,Walloon Brabant Province,West Flanders Province
0,-0.052862,-0.547885,-0.338221,1.636507,-0.352604,-0.144758,-0.616825,-0.238201,-0.277172,-0.562946,...,2.366979,-0.328992,-0.33573,-0.394024,-0.248657,-0.270794,-0.137316,-0.119945,-0.182656,-0.436323
1,0.103939,-1.283359,-0.454416,1.636507,-0.352604,-0.144758,-0.616825,0.036201,-0.360172,1.776369,...,2.366979,-0.328992,-0.33573,-0.394024,-0.248657,-0.270794,-0.137316,-0.119945,-0.182656,-0.436323
2,-0.185539,-0.547885,-0.454416,-0.665526,-0.352604,-0.144758,-0.616825,-0.341101,-0.297922,-0.562946,...,2.366979,-0.328992,-0.33573,-0.394024,-0.248657,-0.270794,-0.137316,-0.119945,-0.182656,-0.436323
3,0.986157,-0.547885,0.341521,1.636507,-0.352604,-0.144758,0.810923,0.173401,0.428334,-0.562946,...,2.366979,-0.328992,-0.33573,-0.394024,-0.248657,-0.270794,-0.137316,-0.119945,-0.182656,-0.436323
4,0.84831,-0.547885,0.283423,-0.665526,1.215581,-0.144758,0.810923,-0.1353,0.366083,-0.562946,...,2.366979,-0.328992,-0.33573,-0.394024,-0.248657,-0.270794,-0.137316,-0.119945,-0.182656,-0.436323


In [273]:
scalled.columns

Index(['Price', 'Number of bedrooms', 'Livable surface', 'Kitchen equipment',
       'Number of bathrooms', 'Balcony', 'State of the property',
       'Surface master bedroom', 'Surface of living-room', 'Cellar',
       'Number of facades', 'Surface terrace', 'Surface garden', 'Garage',
       'apartment', 'bungalow', 'chalet', 'cottage', 'duplex', 'ground-floor',
       'loft', 'mansion', 'master-house', 'mixed-building', 'penthouse',
       'residence', 'studio', 'triplex', 'villa', 'Antwerp Province',
       'Brussels-Capital Region', 'East Flanders Province',
       'Flemish Brabant Province', 'Hainaut Province', 'Limburg Province',
       'Liège Province', 'Luxembourg Province', 'Namur Province',
       'Walloon Brabant Province', 'West Flanders Province'],
      dtype='object')

In [309]:
# Keep the target ("Price") followed by the five features the regression is trained on.
# The column order matters: later cells slice features by position relative to "Price".
training_columns = [
    "Price",
    "Number of bedrooms",
    "Livable surface",
    "Number of facades",
    "Surface garden",
    "Kitchen equipment",
]
shorter_df = ml_ready_data[training_columns]

### Model Training

In [310]:
# Features: every column from "Number of bedrooms" to the last one (label slice).
# Target: the "Price" column. Both are materialised as independent NumPy arrays.
feature_frame = shorter_df.loc[:, "Number of bedrooms":]
price_series = shorter_df["Price"]
X = feature_frame.to_numpy(copy=True)
y = price_series.to_numpy(copy=True)

In [311]:
X.shape

(17345, 5)

In [312]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [313]:
# Fit a linear regression on the training split only.
reg = LinearRegression().fit(X_train, y_train)

In [314]:
reg.score(X_train, y_train)

0.37449841973725906

In [315]:
# Predict prices for the held-out rows.
y_pred = reg.predict(X_test)

In [316]:
# R² on the held-out split.
# NOTE(review): the recorded value is negative (-0.56) while the training R² is 0.37.
# A negative R² means the predictions fit the test rows worse than a constant mean
# would, so the model does not generalise; investigate (outliers, features) before
# relying on the saved model.
print(r2_score(y_test, y_pred))

-0.5639616388688529


In [317]:
reg.intercept_

-26304.623105209437

In [318]:
reg.coef_

array([3.95844127e+04, 1.01972754e+03, 1.38604559e+04, 1.40430119e+01,
       2.71088296e+04])

In [319]:
# Persist the fitted regressor to disk.
# The context manager guarantees the file handle is flushed and closed, even if pickling raises.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(reg, model_file)

### Predictions

In [320]:
# Hand-made listing for a smoke-test prediction. The values are passed in the same
# order as the training feature columns: bedrooms, livable surface, facades,
# garden surface, kitchen equipment.
bedroom, surface, facades, garden, kitchen = 3, 100, 2, 200, 2
sample_listing = [bedroom, surface, facades, garden, kitchen]

reg.predict([sample_listing])

array([279168.5422565])