# California Housing Challenge

This notebook predicts the median house value (`MedHouseVal`) from the provided house features.

In [None]:
# Import Standard Libraries
import pandas as pd
import numpy as np

from scipy.stats import zscore

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

In [None]:
# Matplotlib rc overrides shared by every Seaborn figure in this notebook
theme_parameters = {
    # Spines: hide the right and top ones
    'axes.spines.right': False,
    'axes.spines.top': False,
    # Grid transparency and default figure size
    'grid.alpha': 0.3,
    'figure.figsize': (16, 6),
    # Typography
    'font.family': 'Andale Mono',
    'axes.titlesize': 24,
    # Same light-grey background for the figure and for the axes
    'figure.facecolor': '#E5E8E8',
    'axes.facecolor': '#E5E8E8',
}

# Activate the theme: white grid, 'deep' palette and the overrides above
sns.set_theme(rc=theme_parameters,
              palette=sns.color_palette('deep'),
              style='whitegrid')

# Read Data

In [None]:
# Load the train and the test splits from the local S3E1 data folder
train_data, test_data = (
    pd.read_csv('./../../data/S3E1/california_housing_train.csv'),
    pd.read_csv('./../../data/S3E1/california_housing_test.csv'),
)

In [None]:
# Preview the first rows of the training data
train_data.head()

In [None]:
# Print the column dtypes and non-null counts of the training data
train_data.info()

# Exploratory Data Analysis (EDA)

## Train Features & Label Distribution

In [None]:
# Plot the histogram of every column except the first one ('id') on a 3x3 grid
# NOTE(review): assumes at most 9 columns remain after 'id' (features + label)
figure, ax = plt.subplots(3, 3, figsize=(16, 9))
ax = ax.flatten()

# Fetch the data to plot (exclude the 'id' column)
for index, column_name in enumerate(train_data.columns[1:]):

    # Plot data
    sns.histplot(data=train_data[column_name],
                 ax=ax[index])

    ax[index].set_title(column_name,
                        fontsize=14,
                        fontweight='bold')

    ax[index].tick_params(labelrotation=45)

# Figure-level title (typo 'Distrubtion' fixed)
plt.suptitle('Feature & Label Distribution',
             fontweight='bold',
             fontsize=30)

plt.tight_layout()

# Render explicitly, consistently with the other plotting cells
plt.show()

### Average Occupancy Distribution

In [None]:
# Draw a box plot of 'AveOccup' to expose its extreme values
ax = sns.boxplot(x='AveOccup', data=train_data)
ax.set_title('Average Occupancy Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Drop the 'AveOccup' outlier: keep only the rows with AveOccup < 100.
# The explicit .copy() makes the result an independent DataFrame, so that
# columns can later be added to it in place without pandas warning about
# writing to a slice of the original frame.
train_data = train_data.loc[train_data['AveOccup'] < 100].copy()

- `HouseAge` has some strange peaks. Probably some rounding operations
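- `AveOccup` has a huge outlier. The rows with `AveOccup` >= 100 have been dropped from `train_data`, so they are excluded from every following step, not only from the EDA.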
- `MedHouseVal` has a strange peak at the end. Probably a cap

## Pearson Correlation

In [None]:
# Pairwise correlation of every column except the first one ('id')
correlation_matrix = train_data.iloc[:, 1:].corr()

# Boolean mask that hides the upper triangle (diagonal included) of the matrix
correlation_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Create the figure that hosts the heatmap
figure, ax = plt.subplots(figsize=(12, 8))

# Draw the annotated lower-triangle heatmap on a symmetric [-1, 1] colour range
sns.heatmap(correlation_matrix,
            mask=correlation_mask,
            annot=True,
            annot_kws={'fontsize': 8},
            cmap='mako',
            vmin=-1.0,
            vmax=1.0,
            center=0,
            square=True,
            linewidths=.5,
            cbar_kws={"shrink":.8, 'orientation':'vertical'})

# Title of the plot
ax.set_title('Pearson Correlation', fontweight='bold', fontsize=20)

plt.tight_layout()
plt.show()

Positive correlation between:
- `AveRooms` and `MedInc`

## Train Data Geography

In [None]:
# Two-stop colour scale: orange at the low end, red at the high end
color_scale = [(0, 'orange'), (1,'red')]

# Scatter every training record on a map; both the colour and the size of a
# dot are driven by 'MedHouseVal'
figure = px.scatter_mapbox(train_data,
                           lon="Longitude",
                           lat="Latitude",
                           color="MedHouseVal",
                           size="MedHouseVal",
                           color_continuous_scale=color_scale,
                           hover_name="MedHouseVal",
                           hover_data=["MedHouseVal"],
                           zoom=8,
                           width=600,
                           height=600)

# OpenStreetMap tiles as background, no margins around the map
figure.update_layout(mapbox_style="open-street-map",
                     margin={"r":0,"t":0,"l":0,"b":0})
figure.show()

Define a `HotArea` flag for the houses located in the following rectangles:
- (-123 <= lon <= -121.45) and (36.6 <= lat <= 38.6)
- (-119.90 <= lon <= -119.42) and (34.25 <= lat <= 34.5)
- (-119.35 <= lon <= -117.55) and (33.36 <= lat <= 34.52)
- (-117.29 <= lon <= -117.18) and (32.84 <= lat <= 33.01)

## Train vs Test Feature & Label Distribution

In [None]:
# Overlay the train and the test KDE of each feature on a 3x3 grid
figure, ax = plt.subplots(3, 3, figsize=(16, 12))
ax = ax.flatten()

# Skip the first column ('id') and the last column
# (presumably the 'MedHouseVal' label at this point of the notebook, which the test set lacks)
for index, column_name in enumerate(train_data.columns[1:-1]):

    # Train distribution
    sns.kdeplot(data=train_data[column_name], ax=ax[index], label='Train')

    # Test distribution
    sns.kdeplot(data=test_data[column_name], ax=ax[index], label='Test')

    ax[index].set_title(column_name, fontsize=14)
    ax[index].tick_params(labelrotation=45)

    # Keep the legend entries for the shared figure legend, then drop the
    # per-axis legend
    handles, labels = ax[index].get_legend_handles_labels()
    ax[index].legend().remove()

# The last axis of the grid is removed
figure.delaxes(ax[-1])

# Single legend shared by all the subplots, placed above them
figure.legend(handles,
              labels,
              ncol=2,
              fontsize=12,
              loc='upper center',
              bbox_to_anchor=(0.5, 1.03))

plt.tight_layout()

No difference in the train and test data distributions.

## Count Outliers with the Z-Score

In [None]:
# Column-wise Z-Score of every column except the first ('id') and the last one
z_scores = train_data[train_data.columns[1:-1]].apply(zscore)

In [None]:
# A record counts as an 'outlier' for a column when its absolute Z-Score is
# at least 2; count them per column and sort the counts in ascending order
outliers = (
    z_scores
    .abs()
    .ge(2)
    .sum()
    .to_frame('Count')
    .sort_values(by='Count')
)

In [None]:
# Plot the number of outliers found for each column
ax = sns.barplot(data=outliers,
                 x=outliers.index.tolist(),
                 y='Count')

# Set title (typo 'Outlisers' fixed)
ax.set_title('Outliers Count',
             fontsize=20,
             fontweight='bold')

plt.xticks(fontsize=12,
           rotation=45)

plt.show()

`HouseAge` is confirmed to have the strange spike at the end of its range. It is necessary to bin this feature into classes.

## Train Features Pairplot

In [None]:
# Pairwise regression plots between the features ('id' and the label excluded);
# only the lower triangle is drawn, with KDEs on the diagonal
sns.pairplot(train_data.drop(columns=['id', 'MedHouseVal']),
             corner=True,
             diag_kind='kde',
             kind="reg",
             plot_kws={'line_kws':{'color':'red'}})

# Figure-level title
plt.suptitle('Train Feature Pairplot', fontweight='bold', fontsize=20)

plt.tight_layout()
plt.show()

Positive correlations:
- `AveRooms` and `MedInc`
- `AveBedrms` and `AveRooms`

## Conclusions

- Bin `HouseAge` into 3 classes
- Create a feature `AveRooms per MedInc`
- Create a feature `HotArea`
- Create a feature `AveBedrms per AveRooms`

# Data Preparation

## Feature Engineering

In [None]:
def compute_engineered_features(data: pd.DataFrame) -> pd.DataFrame:

    """
    Add a pre-defined set of engineered features to the input DataFrame.

    The new columns are written to `data` in place (existing callers rely on
    this) and the same DataFrame is also returned, as the signature promises.

    Added columns:
        - 'HouseAgeClass': 1 if HouseAge <= 17, 2 if 17 < HouseAge < 52,
          3 if HouseAge == 52, 0 for any other value
        - 'AveRooms per MedInc': ratio AveRooms / MedInc
        - 'HotArea': 1 if the point lies inside one of the hot rectangles, else 0
        - 'AveBedrms per AveRooms': ratio AveBedrms / AveRooms

    Args:
        data Pandas.DataFrame input with the columns 'HouseAge', 'AveRooms',
             'MedInc', 'AveBedrms', 'Longitude' and 'Latitude'

    Returns:
        data Pandas.DataFrame the same object, with the additional engineered features
    """

    # Define the conditions for the `HouseAgeClass` categories
    house_age = data['HouseAge']
    house_age_class_conditions = {
        1: house_age <= 17,
        2: (house_age > 17) & (house_age < 52),
        3: house_age == 52
    }

    # Define a categorical variable called `HouseAgeClass`.
    # np.select is documented to take lists, so the dict views are converted
    # explicitly; rows matching no condition get the np.select default (0).
    data['HouseAgeClass'] = np.select(list(house_age_class_conditions.values()),
                                      list(house_age_class_conditions.keys()))

    # Create a feature `AveRooms per MedInc`.
    # "per" denotes a ratio: the previous implementation multiplied the two columns.
    # A zero denominator produces inf/NaN, as usual with pandas division.
    data['AveRooms per MedInc'] = data['AveRooms'] / data['MedInc']

    # Create a feature `HotArea`
    # Rectangular areas of interest as (min lon, max lon, min lat, max lat)
    # NOTE(review): the EDA notes list 33.36 as the lower latitude of the third
    # rectangle while the code uses 33.86 - confirm which one is intended
    hot_areas = [(-123, -121.45, 36.6, 38.6),
                 (-119.90, -119.42, 34.25, 34.5),
                 (-119.35, -117.55, 33.86, 34.52),
                 (-117.29, -117.18, 32.84, 33.01)]

    # Check if each point is inside any of the hot areas (bounds included)
    is_in_hot_area = pd.Series(False, index=data.index)
    for min_lon, max_lon, min_lat, max_lat in hot_areas:
        is_in_hot_area |= (data['Longitude'].between(min_lon, max_lon) &
                           data['Latitude'].between(min_lat, max_lat))

    # Assign a binary value to indicate if a point is in a hot area or not
    data['HotArea'] = np.where(is_in_hot_area, 1, 0)

    # Create a feature `AveBedrms per AveRooms` (ratio, previously a product)
    data['AveBedrms per AveRooms'] = data['AveBedrms'] / data['AveRooms']

    return data


In [None]:
# Add the engineered columns to both splits; the function writes the new
# columns directly into the DataFrame it receives
for dataset in (train_data, test_data):
    compute_engineered_features(dataset)

## Features and Labels Definition

In [None]:
# Column positions 1 to 8: the columns after the first one ('id')
# (presumably the original numerical features - verify against the CSV layout)
numerical_features = list(train_data.columns[1:9])

# Column positions 10 onwards (presumably the engineered features appended after the label)
numerical_engineered_featuers = list(train_data.columns[10:])

# Target column
labels = ['MedHouseVal']

## Numerical Features Preprocessing Pipeline

In [None]:
# Preprocessing of the numerical columns: a single standardisation step
numerical_features_pipeline = Pipeline([('numerical_scaler', StandardScaler())])

## Bundle Data Preprocessing Steps

In [None]:
# Bundle the data preprocessing steps: the numerical pipeline is applied to
# both the original and the engineered numerical columns
data_preprocessor = ColumnTransformer(transformers=[
    ('numerical_preprocessing',
     numerical_features_pipeline,
     numerical_features + numerical_engineered_featuers),
])

In [None]:
# Train & Test Split