# Ensemble Learning - Exercise

## 1. Import some libraries

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

## 2. Load dataset

In [2]:
dataset_path = "Housing.csv"
df = pd.read_csv(dataset_path)

In [3]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## 3. Process categorical data

In [4]:
categorical_cols = df.select_dtypes(include = ["object"]).columns.to_list()
print(categorical_cols)

['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


In [14]:
encoded_df = df.copy()
ordinal_encoder = OrdinalEncoder()
encoded_df[categorical_cols] = ordinal_encoder.fit_transform(df[categorical_cols])
encoded_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1.0,0.0,0.0,0.0,1.0,2,1.0,0.0
1,12250000,8960,4,4,4,1.0,0.0,0.0,0.0,1.0,3,0.0,0.0
2,12250000,9960,3,2,2,1.0,0.0,1.0,0.0,0.0,2,1.0,1.0
3,12215000,7500,4,2,2,1.0,0.0,1.0,0.0,1.0,3,1.0,0.0
4,11410000,7420,4,1,2,1.0,1.0,1.0,0.0,1.0,2,0.0,0.0


## 4. Nomralize the dataset

In [31]:
normalizer = StandardScaler()
dataset_arr = normalizer.fit_transform(encoded_df)
dataset_arr[:100]

array([[ 4.56636513,  1.04672629,  1.40341936, ...,  1.51769249,
         1.80494113, -1.40628573],
       [ 4.00448405,  1.75700953,  1.40341936, ...,  2.67940935,
        -0.55403469, -1.40628573],
       [ 4.00448405,  2.21823241,  0.04727831, ...,  1.51769249,
         1.80494113, -0.09166185],
       ...,
       [ 0.82049126,  0.57627895,  0.04727831, ...,  0.35597563,
         1.80494113, -0.09166185],
       [ 0.81674539,  0.66852353,  0.04727831, ..., -0.80574124,
         1.80494113,  1.22296203],
       [ 0.80176189,  0.3917898 ,  1.40341936, ..., -0.80574124,
         1.80494113,  1.22296203]])

## 5. Split the dataset into train, valid

In [32]:
X, y = dataset_arr[:, 1:] , dataset_arr[:, 0]
test_size = 0.3
random_state = 1
is_shuffle = True
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = test_size, random_state = random_state, shuffle = is_shuffle)

## 5. Train model

In [33]:
# Build the model with Random Forest method
rf_regressor = RandomForestRegressor(random_state = random_state)
rf_regressor.fit(X_train, y_train)

In [34]:
# Build the model with AdaBoost method
ada_regressor = AdaBoostRegressor(random_state = random_state)
ada_regressor.fit(X_train, y_train)

In [35]:
# Build the model with Gradient Boosting method
gd_regressor = GradientBoostingRegressor(random_state = random_state)
gd_regressor.fit(X_train, y_train)

## 6. Evaluate the model

In [36]:
mae_lst = []
mse_lst = []

# Random Forest
rf_y_pred = rf_regressor.predict(X_val)
mae_lst.append(mean_absolute_error(y_val, rf_y_pred))
mse_lst.append(mean_squared_error(y_val, rf_y_pred))

# AdaBoost
ada_y_pred = ada_regressor.predict(X_val)
mae_lst.append(mean_absolute_error(y_val, ada_y_pred))
mse_lst.append(mean_squared_error(y_val, ada_y_pred))

# Gradient Boosting
gd_y_pred = gd_regressor.predict(X_val)
mae_lst.append(mean_absolute_error(y_val, gd_y_pred))
mse_lst.append(mean_squared_error(y_val, gd_y_pred))

evals = {
    'Mean Absolute Error' : mae_lst,
    'Mean Squared Error' : mse_lst
}

# creating a Dataframe object
eval_df = pd.DataFrame(evals)
eval_df.index = ["Random Forest", "AdaBoost", "Gradient Boosting"]
eval_df

Unnamed: 0,Mean Absolute Error,Mean Squared Error
Random Forest,0.459962,0.378689
AdaBoost,0.580135,0.582243
Gradient Boosting,0.453211,0.3963
