# Generalized Additive Model (GAM) for regression tasks

This example illustrates the use of GAMs for regression tasks. We'll use:

### Regression Dataset 

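The code below loads the Gas Turbine CO and NOx Emission Data Set (yearly files `gt_2011.csv`–`gt_2015.csv`) and predicts the `TEY` column; 2011–2014 are used for training/validation and 2015 is held out as the test set.

Note: this notebook was adapted from a version that used https://www.kaggle.com/datasets/mirichoi0218/insurance, which is **not** the dataset loaded here.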




In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score,PrecisionRecallDisplay, RocCurveDisplay, classification_report
import os
from pygam import LinearGAM, s, f
import joblib

np.random.seed(42)
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Folder holding the yearly gas-turbine emission CSV files (relative to this notebook)
folder_path = '../../../Post-hoc/datasets/gas+turbine+co+and+nox+emission+data+set'

# 2011-2014 form the training/validation pool; 2015 is held out as the test set
files_to_load = ['gt_2011.csv', 'gt_2012.csv', 'gt_2013.csv', 'gt_2014.csv']
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in files_to_load]

# Raw (unscaled) training/validation pool; it is not overwritten below, so the
# preview shown here stays consistent with the variable's content
train_val_data = pd.concat(dataframes, ignore_index=True)
display(train_val_data.head())

# Raw (unscaled) held-out test data
test_raw = pd.read_csv(os.path.join(folder_path, 'gt_2015.csv'))

# Split BEFORE scaling so that the scaler statistics come from the training
# rows only and no information from the validation rows leaks into preprocessing
train_raw, val_raw = train_test_split(train_val_data, test_size=0.2, random_state=42)

# Standardize every column (features and target alike) with training statistics.
# The original row indices are kept so rows can be traced back to the raw frames.
scaler = StandardScaler()
train_data = pd.DataFrame(
    scaler.fit_transform(train_raw), columns=train_raw.columns, index=train_raw.index
)
val_data = pd.DataFrame(
    scaler.transform(val_raw), columns=val_raw.columns, index=val_raw.index
)
test_data = pd.DataFrame(
    scaler.transform(test_raw), columns=test_raw.columns, index=test_raw.index
)

# Use column TEY as the regression target; every other column is a feature
target = 'TEY'
X_train = train_data.drop(target, axis=1)
y_train = train_data[target]
X_val = val_data.drop(target, axis=1)
y_val = val_data[target]
X_test = test_data.drop(target, axis=1)
y_test = test_data[target]

# Sanity check: the split must not lose or duplicate rows
assert len(train_data) + len(val_data) == len(train_val_data)

print(y_train.shape, y_val.shape, y_test.shape)






Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,0.32663,81.952
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,0.44784,82.377
2,3.9045,1018.4,84.858,3.5828,23.99,1086.5,550.19,135.1,12.042,0.45144,83.776
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.99,0.23107,82.505
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.0,134.67,11.91,0.26747,82.028


(23479,) (5870,) (7384,)


## **3. Define a GAM model for regression**



In [2]:
# Define and fit a GAM for the regression task.
#
# Exactly one term is created per feature column, in column order, so that
# term i of the fitted model corresponds to column i of X_train:
#   * floating-point columns (continuous features, which is what every column
#     is after standardization) get a smooth spline term s(i)
#   * any other column (e.g. integer-coded categories) gets a factor term f(i)
# The hyper-parameters are set on the terms themselves.
N_SPLINES = 5  # number of basis functions per spline term
LAM = 0.1      # smoothing penalty strength

term = None
for i, col in enumerate(X_train.columns):
    if pd.api.types.is_float_dtype(X_train[col]):
        feature_term = s(i, n_splines=N_SPLINES, lam=LAM)
    else:
        feature_term = f(i, lam=LAM)
    # Start the term list with the first feature instead of a hard-coded s(0),
    # which would give feature 0 a second term
    term = feature_term if term is None else term + feature_term

gam = LinearGAM(terms=term).fit(X_train.values, y_train.values)

# summary() prints the table itself and returns None, so it is not wrapped in print()
gam.summary()


: 

### Warning

A large dataset or a large `n_splines` value can sharply increase training time and memory usage. If you hit an out-of-memory (OOM) error or the Jupyter kernel crashes, reduce `n_splines` or fit the model on a subsample of the training data.

In [None]:
# Score the held-out test set with the fitted GAM
y_pred = gam.predict(X_test)

# Regression error metrics, all derived from a single residual vector.
# Note: the target column was standardized together with the features, so
# these errors are expressed in standardized units, not original TEY units.
residuals = y_test - y_pred
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

for metric_name, metric_value in (
    ("Mean Squared Error", mse),
    ("Mean Absolute Error", mae),
    ("Root Mean Squared Error", rmse),
):
    print(f"{metric_name}: {metric_value:.2f}")




In [None]:
# Plot the shape functions (partial dependence with 95% intervals) of up to
# four GAM terms in a 2x2 grid, with the x-axis mapped back to the original
# (unscaled) feature units.

# Feature names, in the same column order the model was fitted on
feature_names = X_train.columns

# Terms to plot: the first four feature terms of the model. The intercept term
# is skipped because it has no feature to plot against.
top_features_indices = [
    i for i, model_term in enumerate(gam.terms) if not model_term.isintercept
][:4]
print(top_features_indices)

# Name of the feature behind each selected term. The feature column is read
# from the term itself rather than assumed to equal the term index.
top_features = [feature_names[gam.terms[i].feature] for i in top_features_indices]

# Scaling statistics come from the in-memory `scaler` fitted in the data
# preparation cell. That scaler was fitted on a frame that also contains the
# target column, so statistics are looked up by column name, not by position.
scaler_columns = list(train_val_data.columns)
feature_positions = [scaler_columns.index(name) for name in feature_names]
feature_mean = pd.Series(scaler.mean_[feature_positions], index=feature_names)
feature_scale = pd.Series(scaler.scale_[feature_positions], index=feature_names)

# Test features mapped back to their original scale (column-wise inverse transform)
X_test_rescaled = X_test * feature_scale + feature_mean

fig, axs = plt.subplots(2, 2, figsize=(10, 8))
for i, ax in enumerate(axs.flatten()):
    # Hide unused panels when the model has fewer than four feature terms
    if i >= len(top_features_indices):
        ax.axis('off')
        continue

    term_index = top_features_indices[i]
    feature_index = gam.terms[term_index].feature
    feature_name = top_features[i]

    # Evaluate the term on a grid over its feature; the same grid is used for
    # the x-axis so that x and y values line up
    XX = gam.generate_X_grid(term=term_index)
    pdep, confi = gam.partial_dependence(term=term_index, X=XX, width=.95)

    # Undo the standardization of the plotted feature for a readable x-axis
    x_original = XX[:, feature_index] * feature_scale[feature_name] + feature_mean[feature_name]

    ax.plot(x_original, pdep)
    ax.plot(x_original, confi, c='r', ls='--')
    ax.set_title(f"Feature: {feature_name}")
    ax.set_xlabel(feature_name)
    # The y-axis stays in the model's (standardized target) units
    ax.set_ylabel("Partial Dependence")
plt.tight_layout()
plt.show()

print("Done")

