# California Housing Challenge

This notebook predicts the median house value (`MedHouseVal`) from the provided house features.

In [None]:
# Import Standard Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

In [None]:
# Matplotlib rc overrides applied to every figure in the notebook
theme_parameters = {
    'figure.figsize': (16, 6),
    'figure.facecolor': '#E5E8E8',
    'axes.facecolor': '#E5E8E8',
    'axes.titlesize': 24,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'grid.alpha': 0.3,
    'font.family': 'Andale Mono',
}

# Activate the 'whitegrid' style and the 'deep' palette together with the overrides
sns.set_theme(rc=theme_parameters,
              palette=sns.color_palette('deep'),
              style='whitegrid')

# Read Data

In [None]:
# Load the training CSV into a DataFrame (the path is relative to the working directory)
california_housing_train = pd.read_csv(
    './../../data/season_3_episode_1/california_housing_train.csv'
)

In [None]:
# Preview the first rows of the training data
california_housing_train.head()

In [None]:
# Summarise the columns: dtypes and non-null counts
california_housing_train.info()

# Exploratory Data Analysis (EDA)

## Median Income Distribution

In [None]:
# Histogram of the 'MedInc' column
ax = sns.histplot(x='MedInc', data=california_housing_train)
ax.set_title('Median Income Distribution')

plt.show()

The data have a binomial distribution.

## Median House Value over Median Income

Explore the relationship of the `MedHouseVal` with respect to the `MedInc`

In [None]:
# Scatter plot of `MedHouseVal` against `MedInc`
ax = sns.scatterplot(x='MedInc',
                     y='MedHouseVal',
                     data=california_housing_train)

# Title and bold axis labels
ax.set_title('Median House Value over Median Income', fontsize=14)
ax.set_xlabel('Median Income', fontweight='bold')
ax.set_ylabel('Median House Value', fontweight='bold')

# Tilt the x tick labels
plt.xticks(rotation=45)

plt.show()

There is a positive correlation between the Median House Value and the Median Income. However, the Median House Value data seem to be capped at 5.

## House Age

In [None]:
# Histogram of the 'HouseAge' column, using as many bins as there are distinct ages
ax = sns.histplot(x='HouseAge',
                  bins=california_housing_train['HouseAge'].nunique(),
                  data=california_housing_train)
ax.set_title('House Age Distribution')

plt.show()

It seems that there are no houses older than 52 years. Since there are 3 major peaks (17, 25 and 52 years old), it could be reasonable to define a categorical variable called `HouseAgeClass` with the following values:
- young
- middle
- old

### House Age Class Definition

In [None]:
# Compute precisely the 3 big peaks: the three most frequent `HouseAge` values
# together with the number of rows for each
california_housing_train['HouseAge'].value_counts().head(3)

In [None]:
# Boolean row masks for the `HouseAgeClass` categories, keyed by category label:
#   young  -> age <= 17
#   middle -> 17 < age < 52
#   old    -> age == 52
house_age_class_conditions = {
    'young': california_housing_train['HouseAge'].le(17),
    'middle': (
        california_housing_train['HouseAge'].gt(17)
        & california_housing_train['HouseAge'].lt(52)
    ),
    'old': california_housing_train['HouseAge'].eq(52),
}

In [None]:
# Define a categorical variable called `HouseAgeClass`.
# The masks and their labels are passed as explicit lists, and the fallback label is
# a string: np.select's implicit default is the integer 0, which has no common dtype
# with the string labels (recent NumPy versions raise a TypeError, older ones silently
# emit the label '0'). Rows matching none of the masks (e.g. a missing or > 52
# `HouseAge`) are labelled 'unknown'.
california_housing_train['HouseAgeClass'] = np.select(
    list(house_age_class_conditions.values()),
    list(house_age_class_conditions.keys()),
    default='unknown',
)

## Median House Value over Median Income per House Age Class

Explore the relationship of the `MedHouseVal` with respect to the `MedInc`, grouped by `HouseAgeClass`

In [None]:
# Scatter plot of `MedHouseVal` over `MedInc`, coloured by `HouseAgeClass`
ax = sns.scatterplot(data=california_housing_train,
                     x='MedInc',
                     y='MedHouseVal',
                     hue='HouseAgeClass')

ax.set_ylabel('Median House Value',
              fontweight='bold')

ax.set_xlabel('Median Income',
              fontweight='bold')

# The title names the grouping so this figure is distinguishable from the
# ungrouped scatter plot, which uses the shorter title
ax.set_title('Median House Value over Median Income per House Age Class',
             fontsize=14)

plt.xticks(rotation=45)

plt.show()

# Data Preparation

## Data Normalisation

No data needs to be normalised.

## Feature Engineering

The following features have already been engineered:
- HouseAgeClass

## Features and Label Definition

In [None]:
# Model inputs, grouped by the preprocessing they receive, and the prediction target
numerical_features = ['MedInc', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
categorical_features = ['HouseAgeClass']

# Kept as a one-element list, so selecting it from a DataFrame yields a DataFrame
label = ['MedHouseVal']

## Data Pipeline

In [None]:
# Numerical features: fill missing values with the column mean, then standardise
numerical_features_pipeline = Pipeline(steps=[
    ('numerical_imputer', SimpleImputer(strategy='mean')),
    ('numerical_scaler', StandardScaler()),
])

In [None]:
# Define a pipeline for the categorical features:
# fill missing values with the most frequent label, then encode labels as integers.
categorical_features_pipeline = Pipeline(steps=[
    ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
    ('categorical_ordinal_encoder', OrdinalEncoder(
        # Encode in the natural age order (young=0 < middle=1 < old=2), taken from the
        # insertion order of `house_age_class_conditions`. The default 'auto' setting
        # sorts labels alphabetically (middle < old < young), which gives the model a
        # meaningless ordinal scale.
        # NOTE: `categories` holds one list per categorical column; extend it if
        # `categorical_features` grows.
        categories=[list(house_age_class_conditions)],
        # Labels outside the three classes are encoded as -1 instead of raising
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])

In [None]:
# Route each feature group through its own preprocessing pipeline;
# columns not listed in either group are dropped
data_preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('numerical_preprocessing', numerical_features_pipeline, numerical_features),
        ('categorical_preprocessing', categorical_features_pipeline, categorical_features),
    ],
)

# Train & Test Split

In [None]:
# Feature matrix (numerical columns first, then categorical) and target frame
X = california_housing_train.loc[:, numerical_features + categorical_features]
y = california_housing_train.loc[:, label]

In [None]:
# Hold out 33% of the rows as a validation set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Model Training

In [None]:
# Define the used metrics (these names become the columns of the performance table)
models_metrics = ['RMSE']

In [None]:
# Initialize an empty DataFrame of model performance: one column per metric,
# one row to be added per trained model
models_performance = pd.DataFrame(columns=models_metrics)

## Linear Regression

In [None]:
%%time

# Define the model
model_lr = LinearRegression()

# Define the pipeline
pipe_lr = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('linear_regression', model_lr)
])

# Train the model
pipe_lr.fit(X_train, 
            y_train)

# Get Compute predictions on the validation set
predictions_lr = pipe_lr.predict(X_test)

# Model evaluation
rmse_lr = round(mean_squared_error(y_test, predictions_lr), 2)

print('RMSE: {}'.format(rmse_lr))
print('\n')

In [None]:
# Update 'models_performance' DataFrame: add (or overwrite) the 'Linear Regression' row
models_performance.loc['Linear Regression'] = [rmse_lr]

## XGBoost

In [None]:
%%time

# Define the model
model_xgb = XGBRegressor(n_estimators=500)

# Define the pipeline
pipe_xgb = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('xgboost', model_xgb)
])

# Train the model
pipe_xgb.fit(X_train,
             y_train,
             xgboost__verbose=False)

# Compute the predictions on the validation set
predictions_xgb = pipe_xgb.predict(X_test)

# Model evaluation
rmse_xgb = round(mean_squared_error(y_test, predictions_xgb), 2)

print('RMSE: {}'.format(rmse_xgb))
print('\n')

In [None]:
# Update 'models_performance' DataFrame: add (or overwrite) the 'XGBoost' row
models_performance.loc['XGBoost'] = [rmse_xgb]

In [None]:
#TODO XGBoost - K-Fold Cross-Validation

In [None]:
#TODO PyTorch Neural Network