# California Housing Challenge

This notebook predicts the average house value from the provided house features.

In [None]:
# Import Standard Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

In [None]:
# Matplotlib rc overrides handed to Seaborn for every figure in the notebook
theme_parameters = {
    # Drop the right and top borders of each plot
    **dict.fromkeys(('axes.spines.right', 'axes.spines.top'), False),
    # Grid line transparency
    'grid.alpha': 0.3,
    # Default figure size (width, height)
    'figure.figsize': (16, 6),
    # Font family and axes title size
    'font.family': 'Andale Mono',
    'axes.titlesize': 24,
    # Same background colour for the figure and the plotting area
    **dict.fromkeys(('figure.facecolor', 'axes.facecolor'), '#E5E8E8'),
}

# Apply the theme: 'whitegrid' style, 'deep' colour palette, overrides above
sns.set_theme(rc=theme_parameters,
              style='whitegrid',
              palette=sns.color_palette('deep'))

# Read Data

In [None]:
# Load the training set from its CSV file into a DataFrame
train_data = pd.read_csv(
    filepath_or_buffer='./../../data/S3E1/california_housing_train.csv'
)

In [None]:
# Preview the first rows of the training data
train_data.head()

In [None]:
# Print a summary of the training DataFrame (columns, non-null counts, dtypes)
train_data.info()

# Exploratory Data Analysis (EDA)

## Train Features & Label Distribution

In [None]:
# Plot the histogram of every feature and of the label

# Fetch the columns to plot (exclude the first column, 'id')
plot_columns = train_data.columns[1:]

# Size the grid from the number of plotted columns instead of assuming
# exactly nine: more than nine would raise an IndexError on ax[index] and
# fewer would leave empty axes. Nine columns still give a 3x3 grid (16x9).
grid_columns = 3
grid_rows = max(1, -(-len(plot_columns) // grid_columns))  # ceiling division

# squeeze=False always returns a 2D array of axes, so flatten() is safe
figure, ax = plt.subplots(grid_rows,
                          grid_columns,
                          figsize=(16, 3 * grid_rows),
                          squeeze=False)
ax = ax.flatten()

for index, column_name in enumerate(plot_columns):

    # Plot data
    sns.histplot(data=train_data[column_name],
                 ax=ax[index])

    ax[index].set_title(column_name,
                        fontsize=14,
                        fontweight='bold')

    # Rotate tick labels so long numeric labels do not overlap
    ax[index].tick_params(labelrotation=45)

# Hide the axes left over when the grid has more cells than columns
for unused_axis in ax[len(plot_columns):]:
    unused_axis.set_visible(False)

plt.suptitle('Feature & Label Distribution',
             fontweight='bold',
             fontsize=30)

plt.tight_layout()