In [1]:
import numpy as np
import pandas as pd

from scipy.stats import zscore
from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.inspection import permutation_importance


import plotly.express as px
import plotly.graph_objects as go

In [2]:
def generate_statistical_summary(input_df:pd.DataFrame):
    """Display the descriptive statistics of a DataFrame as a Plotly table.

    The statistics are taken from ``input_df.describe()``. The table has one
    row per described column and shows Count, Mean, ST-Dev, Min, Median (the
    ``50%`` quantile) and Max, all rounded to three decimals.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``; nothing is returned.
    """
    # Transpose so each described column becomes one table row, and round every
    # statistic the same way (previously "max" was the only unrounded column).
    summary = input_df.describe().T.round(3)

    # Table header label -> column name produced by ``describe()``.
    stat_columns = {
        "Count": "count",
        "Mean": "mean",
        "ST-Dev": "std",
        "Min": "min",
        "Median": "50%",
        "Max": "max",
    }
    headers = ["Variables", *stat_columns]

    fig = go.Figure(data=[go.Table(
        header=dict(values=[f"<b>{item}</b>" for item in headers],
                    fill_color='paleturquoise',
                    font=dict(color='black', size=15),
                    align='center'),
        cells=dict(values=[summary.index,
                           *(summary[name] for name in stat_columns.values())],
                   # First colour applies to the variable-name column, the
                   # second to the remaining (statistic) columns.
                   fill=dict(color=['grey', 'cyan']),
                   font=dict(color='black',
                             family=["Times New Roman", "helvetica"],
                             size=15),
                   height=25,
                   align='center'))
    ])

    fig.update_layout(
        title={
            'text': "<b>Statistical Summary</b>",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        font=dict(
            family="Times New Roman",
            size=20,
            color="RebeccaPurple"
        ))

    fig.show()

In [3]:
# Download the dataset with id 162 from the UCI ML repository.
forest_fires = fetch_ucirepo(id=162)

# Feature and target tables (both pandas DataFrames).
X, y = forest_fires.data.features, forest_fires.data.targets

In [4]:
# Random seeds: one for splitting the data, one for the estimators.
DATA_RS = 100
MODEL_RS = 100

# Q1

- Randomly split the data into Train, Validation and Test sets

In [5]:
# Preprocessing
features = ['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain']
target = ["area"]

# Log-transform the target as log(1 + value). The transform is computed from the
# fetched targets instead of from `y` itself, so re-running this cell does not
# apply the logarithm a second time. `log1p` is the numerically safer spelling
# of log(1 + x).
y = np.log1p(forest_fires.data.targets)

# Features and transformed target side by side, aligned on the shared index.
df = pd.concat([X,y], axis=1, join="inner")

def deobjectify_df(X:pd.DataFrame):
    """One-hot encode the categorical columns of ``X`` and keep the others as-is.

    Columns whose dtype is ``object``, a pandas string dtype or ``category``
    are treated as categorical and encoded with ``pd.get_dummies`` using
    ``drop_first=True`` (the first level of each column is dropped). All other
    columns are passed through unchanged. Both column lists are printed.

    Parameters
    ----------
    X : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with the dummy columns first, followed by the non-categorical
        columns in their original order, on the index of ``X``.
    """

    def _is_categorical(dtype) -> bool:
        # `object` alone misses category columns and the dedicated pandas
        # string dtype, which would then reach the models unencoded.
        return dtype == "object" or isinstance(
            dtype, (pd.CategoricalDtype, pd.StringDtype)
        )

    # Columns to be one-hot encoded
    cat_cols = [col for col, dtype in X.dtypes.items() if _is_categorical(dtype)]
    print("Categorical Columns: ",cat_cols)

    # Columns passed through untouched
    num_cols = [col for col, dtype in X.dtypes.items() if not _is_categorical(dtype)]
    print("\nNumerical Columns:" ,num_cols)

    # Nothing to encode: return a copy of the pass-through columns rather than
    # handing an empty frame to `get_dummies`.
    if not cat_cols:
        return X[num_cols].copy()

    # `columns=` is passed explicitly so every detected categorical column is
    # encoded regardless of which dtypes `get_dummies` encodes by default.
    x_dummy = pd.get_dummies(X[cat_cols], columns=cat_cols, drop_first=True)

    # Dummy columns first, then the untouched columns
    x_new = pd.concat([x_dummy, X[num_cols]], axis=1, join='inner')
    return x_new

# Encode the selected feature columns; `x_new` is what gets split and fed to the models.
x_new = deobjectify_df(X=df[features])

def get_train_val_test(X,y, holdout_size=0.5, test_size=0.5, random_state=None):
    """Split ``X`` and ``y`` into train, validation and test parts.

    The data is split twice with ``train_test_split``: first into a training
    part and a hold-out part, then the hold-out part into validation and test.
    With the defaults this is a 50% / 25% / 25% split.

    Parameters
    ----------
    X, y
        Features and targets with the same number of rows.
    holdout_size : float or int, default 0.5
        ``test_size`` of the first split, i.e. the share of the data set aside
        for validation and test together.
    test_size : float or int, default 0.5
        ``test_size`` of the second split, i.e. the share of the hold-out part
        that becomes the test set.
    random_state : int or None, default None
        Seed used for both splits. ``None`` means "use the notebook-level
        ``DATA_RS``", which is looked up at call time.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    seed = DATA_RS if random_state is None else random_state
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        X, y, random_state=seed, test_size=holdout_size
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, random_state=seed, test_size=test_size
    )
    return x_train,x_val,x_test, y_train,y_val,y_test

# Split the encoded features and the (log-transformed) target column(s) listed in `target`.
x_train,x_val,x_test, y_train,y_val,y_test = get_train_val_test(X=x_new,y=df[target])

Categorical Columns:  ['month', 'day']

Numerical Columns: ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']


In [6]:
# Summary table for the combined frame (features plus the log-transformed target).
generate_statistical_summary(df)

# Q2 

- Train the three Linear Models (**_Linear Regression_**, **_LASSO Regression_**, **_Elastic-Net_**) and three non-Linear Models (**_Support Vector Machines_**, **_Random Forest_**, **_Gradient Boosting_**).
- Report Training and Prediction error for all the models (MSE and RMSE)


In [7]:
# Linear models
lr = LinearRegression(n_jobs=-1)
lasso_lr = ElasticNet(l1_ratio=1, random_state=MODEL_RS) # LASSO (pure L1 penalty)
mix_lr = ElasticNet(l1_ratio=0.5, random_state=MODEL_RS) # L1+L2

# Non-linear models
svm_sigmoid = SVR(kernel="sigmoid")
# The tree ensembles are randomised; seed them like the linear models so that
# the reported errors are reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=MODEL_RS)
gb = GradientBoostingRegressor(n_estimators=100, random_state=MODEL_RS)

In [8]:
# Display name -> estimator, in the order used for fitting and reporting.
_models_by_name = {
    "Linear Regression": lr,
    "Lasso Regression": lasso_lr,
    "Elastic Net": mix_lr,
    "SVM-Sigmoid": svm_sigmoid,
    "Random Forest": rf,
    "Gradient Boosting": gb,
}
model_names = list(_models_by_name)
models = list(_models_by_name.values())

In [9]:
# (display name, estimator) pairs consumed by the reporting helpers.
all_models = [(name, estimator) for name, estimator in zip(model_names, models)]

In [10]:
# Fit every estimator on the training split; the target frame is flattened to 1-D once.
y_train_flat = np.ravel(y_train)
for model in models:
    model.fit(x_train, y_train_flat)

In [11]:
def get_mse_rmse(model, x_true, y_true):
    """Return the MSE and RMSE of ``model``'s predictions on ``x_true``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x_true
        Features to predict on.
    y_true
        True target values for ``x_true``.

    Returns
    -------
    tuple
        ``(mse, rmse)`` as numpy floating scalars, with ``rmse = sqrt(mse)``.
        For a single target this equals what ``squared=False`` used to return;
        for multi-output targets it is the root of the averaged MSE.
    """
    y_pred = model.predict(x_true)
    # The `squared` keyword of `mean_squared_error` is deprecated and removed in
    # recent scikit-learn releases, so it is not passed at all; RMSE is derived
    # from MSE instead. Wrapping in np.float64 keeps `.round()` available to
    # callers even when the metric comes back as a plain Python float.
    mse = np.float64(mean_squared_error(y_true=y_true, y_pred=y_pred))
    rmse = np.sqrt(mse)
    return mse, rmse

In [91]:
def generate_report(models, x_true, y_true):
    """Build a table of MSE and RMSE for several models on one data split.

    Parameters
    ----------
    models
        Iterable of ``(model_name, fitted_model)`` pairs.
    x_true, y_true
        Features and true targets of the split to evaluate on.

    Returns
    -------
    pd.DataFrame
        One row per model name, with the columns ``"Mean-Squared Error"`` and
        ``"RMSE"``, both rounded to four decimals.
    """
    report_dict = dict()
    for model_name, model in models:
        mse, rmse = get_mse_rmse(model, x_true, y_true)
        # np.round works for numpy scalars and plain Python floats alike,
        # whereas `.round()` only exists on the former.
        report_dict[f"{model_name}"] = {
            "Mean-Squared Error": np.round(mse, 4),
            "RMSE": np.round(rmse, 4),
        }
    # Outer keys are model names, so transpose to get one row per model.
    return pd.DataFrame(report_dict).T

In [13]:
# Same report, computed once per data split.
report_train, report_val, report_test = (
    generate_report(all_models, x_split, y_split)
    for x_split, y_split in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
)

In [94]:
def generate_regression_model_results(report_df, split):
    """Display a model-comparison report as a Plotly table.

    Parameters
    ----------
    report_df : pd.DataFrame
        One row per model (the index holds the model names) and one column per
        metric, e.g. the frame returned by ``generate_report``.
    split : str
        Name of the data split, shown in the title as
        ``"Regression Report: <split>"``.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``; nothing is returned.
    """
    rounded = report_df.round(3)
    # Header and cells are both derived from the frame's columns so they always
    # line up (previously the cells were hard-coded to two metric columns).
    headers = ["Models"] + rounded.columns.to_list()
    cell_values = [rounded.index] + [
        rounded.iloc[:, position] for position in range(rounded.shape[1])
    ]

    fig = go.Figure(data=[go.Table(
        header=dict(values=[f"<b>{item}</b>" for item in headers],
                    fill_color='paleturquoise',
                    font=dict(color='black', size=15),
                    align='center'),
        cells=dict(values=cell_values,
                   # First colour applies to the model-name column, the second
                   # to the metric columns.
                   fill=dict(color=['grey', 'white']),
                   font=dict(color='black', size=15),
                   height=25,
                   align='center'))
    ])

    fig.update_layout(
        title={
            'text': f"Regression Report: {split}",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        font=dict(
            family="Courier New, monospace",
            size=20,
            color="RebeccaPurple"
        ))
    fig.show()

In [95]:
# Render the training and validation reports, in that order.
for split_report, split_label in ((report_train, "Training"), (report_val, "Validation")):
    generate_regression_model_results(split_report, split_label)

# Q3

- Which model will be selected based on MSE? Based on RMSE?
- Comment on the MSE of Training and Validation
- Is there any difference between Linear and Non-Linear Models?

In [97]:
# Render the training and validation reports, in that order.
for split_report, split_label in ((report_train, "Training"), (report_val, "Validation")):
    generate_regression_model_results(split_report, split_label)

# Q4 

- Select one Linear and one Non-Linear Model. Visualize the _Feature Weights_ for both the models. Comment.


In [67]:
def generate_lin_reg_model_weights(model):
    """Show the coefficients of a fitted linear model as a sorted bar chart.

    Parameters
    ----------
    model
        Fitted estimator exposing ``coef_`` and ``feature_names_in_``. The
        chart title uses "Linear Regression" for ``LinearRegression``,
        "LASSO Regression" for an ``ElasticNet`` (or subclass) whose
        ``l1_ratio`` equals 1, "Elastic-Net" for any other ``ElasticNet``,
        and the class name for every other estimator.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``; nothing is returned.
    """
    # isinstance (rather than an exact type comparison) also labels subclasses
    # correctly, and unknown estimators are no longer presented as Elastic-Net.
    if isinstance(model, LinearRegression):
        model_name = "Linear Regression"
    elif isinstance(model, ElasticNet):
        model_name = "LASSO Regression" if model.l1_ratio == 1 else "Elastic-Net"
    else:
        model_name = type(model).__name__

    # Flatten so a (1, n_features) coefficient array (single target fitted with
    # a 2-D y) is handled like the usual 1-D case.
    weights = np.ravel(model.coef_)
    feature_names = np.asarray(model.feature_names_in_)
    order = weights.argsort()  # ascending by weight
    df_coef = pd.DataFrame({"Feature Weight": weights[order], "Feature": feature_names[order]})

    fig = px.bar(df_coef, x="Feature", y="Feature Weight", height=1000)
    fig.update_layout(
        title={
            'text': f"Feature Weights: {model_name}",
            'y': 1,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        font=dict(
            family="Courier New, monospace",
            size=20,
            color="RebeccaPurple"
        ))
    fig.show()

In [70]:
# generate_lin_reg_model_weights(model=lr)
# generate_lin_reg_model_weights(model=lasso_lr)
# generate_lin_reg_model_weights(model=mix_lr)

In [52]:
def generate_non_linear_reg_model_weights(model,x_val, y_val, n_repeats=100, random_state=None):
    """Show the feature importances of a fitted non-linear model as a bar chart.

    For ``RandomForestRegressor`` and ``GradientBoostingRegressor`` the model's
    own ``feature_importances_`` are plotted. For every other estimator
    (e.g. ``SVR``) the mean permutation importance on ``(x_val, y_val)`` is
    plotted instead.

    Parameters
    ----------
    model
        Fitted estimator.
    x_val : pd.DataFrame
        Features used for permutation importance; its columns provide the
        feature names in that case.
    y_val
        Targets used for permutation importance.
    n_repeats : int, default 100
        Number of permutation repeats per feature.
    random_state : int or None, default None
        Seed for the permutations. ``None`` means "use the notebook-level
        ``MODEL_RS``", which is looked up at call time.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``; nothing is returned.
    """
    if isinstance(model, (RandomForestRegressor, GradientBoostingRegressor)):
        importances = np.asarray(model.feature_importances_)
        feature_names = np.asarray(model.feature_names_in_)
        if isinstance(model, RandomForestRegressor):
            model_name = "Random Forest"
        else:
            model_name = "Gradient Boosting"
    else:
        seed = MODEL_RS if random_state is None else random_state
        # Evaluate the model that was passed in (the original used the global
        # `svm_sigmoid` here) and seed the shuffling so the chart is reproducible.
        perm_imp = permutation_importance(
            model, x_val, y_val, n_repeats=n_repeats, random_state=seed
        )
        importances = perm_imp.importances_mean
        feature_names = np.array(x_val.columns)
        if isinstance(model, SVR):
            kernel = model.kernel
            model_name = f"SVM ({kernel.capitalize()})" if isinstance(kernel, str) else "SVM"
        else:
            model_name = type(model).__name__

    order = importances.argsort()  # ascending by importance
    df_coef = pd.DataFrame({"Feature Weight": importances[order], "Feature": feature_names[order]})

    fig = px.bar(df_coef, x="Feature", y="Feature Weight", height=1000)
    fig.update_layout(
        title={
            'text': f"Feature Weights: {model_name}",
            'y': 1,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        font=dict(
            family="Courier New, monospace",
            size=20,
            color="RebeccaPurple"
        ))
    fig.show()

In [71]:
# generate_non_linear_reg_model_weights(gb, x_val,y_val)
# generate_non_linear_reg_model_weights(rf, x_val,y_val)
# generate_non_linear_reg_model_weights(svm_sigmoid, x_val,y_val)

# Q5

- Retrain the best model on Intermediate Dataset (Training+Validation) and use it to estimate Generalization error (Test error). Does this model underfit or overfit?

In [84]:
def combine_train_val(x_train,x_val,y_train,y_val):
    """Stack the training and validation splits row-wise.

    Returns a ``(features, targets)`` pair in which the training rows come
    first and the validation rows second; the original index labels are kept.
    """
    feature_parts = [x_train, x_val]
    target_parts = [y_train, y_val]
    return pd.concat(feature_parts, axis=0), pd.concat(target_parts, axis=0)

In [85]:
# Intermediate dataset for Q5: training and validation rows combined.
x_full_train, y_train_full = combine_train_val(x_train,x_val,y_train,y_val)

In [86]:
# Sanity check: both parts must have the same number of rows.
x_full_train.shape, y_train_full.shape

((387, 27), (387, 1))

In [87]:
# Refit the Elastic-Net on train+validation.
# NOTE(review): this refits `mix_lr` in place, and the same object is held in
# `models` / `all_models`; re-running the earlier report cells after this one
# would therefore score the refitted model. Consider fitting a fresh copy.
mix_lr.fit(x_full_train,np.ravel(y_train_full))

ElasticNet(random_state=100)

In [88]:
# Single (display name, estimator) pair in the format `generate_report` expects.
new_models = [("Best Model", mix_lr)]

In [92]:
# Errors of the refitted model on the test split and on the data it was fitted on.
report_test_df, report_train_df = (
    generate_report(new_models, x_split, y_split)
    for x_split, y_split in (
        (x_test, y_test),
        (x_full_train, y_train_full),
    )
)

In [96]:
# Render the test report first, then the train report.
for final_report, split_label in ((report_test_df, "Test"), (report_train_df, "Train")):
    generate_regression_model_results(report_df=final_report, split=split_label)