# Check GAM modeling results

In [1]:
# Enable automatic re-import of edited modules (mode 2 = all modules) so that
# changes to local code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Switch to the project root; the relative `results/...` paths in later cells
# resolve against this directory.
# NOTE(review): this is a hard-coded absolute path on a local Windows drive -
# adjust it (or make it configurable) before running on another machine.
%cd D:\netmob25

D:\netmob25


In [2]:
# Load libs
import pandas as pd
import numpy as np
import pyperclip
from lib import helpers as helpers
import shap
import pickle

In [8]:
# Predictor names, grouped by how they are treated: continuous first, then categorical.
var_con = ['d2h_nh', 'Age', 'access_h']
var_cat = [
    'time_threshold', 'amenity', 'mode', 'Gender', 'Education', 'Household_type',
    'Car_no', 'Bike_no', 'Two_wheeler_no', 'Escooter_no', 'pt_sub', 'main_mode',
]

# Read the modelling table and append the natural log of `gap` as `log_disparity`.
# NOTE(review): np.log yields -inf / NaN for zero or negative inputs - confirm
# that `gap` is strictly positive in this table.
print('Load data.')
df = pd.read_parquet("results/activity_access_ind_model.parquet").assign(
    log_disparity=lambda table: np.log(table['gap'])
)

# Full predictor list (continuous + categorical). It is copied to the system
# clipboard one name per line and echoed as a comma-separated string.
features = [*var_con, *var_cat]
pyperclip.copy('\n'.join(features))
print(', '.join(features))

Load data.
d2h_nh, Age, access_h, time_threshold, amenity, mode, Gender, Education, Household_type, Car_no, Bike_no, Two_wheeler_no, Escooter_no, pt_sub, main_mode


## 1. Check the model results

In [9]:
# Model selection: `md` is the label of the fitted model to inspect and
# `path2result` is the folder holding the pickled results.
md = 'all'
path2result = 'results/ebm/'
perf_dict_list = []  # not used in the cells shown here; kept for later cells

# Open the pickled results for the chosen model through the project helper.
gam_results = helpers.EBMResultsOrganizer(file_loc=f'{path2result}model_{md}.p')
gam_results.load_raw_data(select='all')

# Train/test fit metrics, tagged with the model label before printing.
perf = gam_results.performance()
perf['model'] = md
print(perf)


Load data.
{'rmse_train': 0.9420542971272369, 'r2_train': 0.4756669053057755, 'rmse_test': 0.9944761429059229, 'r2_test': 0.37724456390862293, 'model': 'all'}


In [14]:
# Importance score of every model term, saved to CSV for later use.
df_f = gam_results.feature_importance()
df_f.to_csv('results/ebm/f_score.csv', index=False)

# Number of terms whose importance score is above 0.01.
score_above_threshold = df_f['Score'] > 0.01
print(int(score_above_threshold.sum()))

# Also place the table on the system clipboard and show it in the cell output.
df_f.to_clipboard()
print(df_f)

12
                                             Name      Color     Score
2                                   Access (home)  steelblue  0.560609
11                         Main mode of transport  steelblue  0.224028
12  Distance to Home Neighborhood & Access (home)      black  0.190082
0                   Distance to Home Neighborhood  steelblue  0.181680
10                  Public Transport Subscription  steelblue  0.126418
13         Access (home) & Main mode of transport      black  0.079116
1                                             Age      coral  0.074552
5                                  Household type      coral  0.071370
6                                      Car number  steelblue  0.055251
4                                       Education      coral  0.050368
7                                     Bike number  steelblue  0.038633
8                              Two-wheeler number  steelblue  0.011938
9                                E-scooter number  steelblue  0.003595
3  

In [11]:
# Compute the per-feature score tables on the results object, then export the
# single-feature effects to CSV.
# NOTE(review): `feature_scores()` is called only for its side effect here;
# presumably it populates the data `single_feature_effect()` reads - confirm.
gam_results.feature_scores()
df_fscore_f = gam_results.single_feature_effect()
df_fscore_f.to_csv('results/ebm/features.csv', index=False)

In [34]:
# Export the pairwise interaction effects through the project helper.
# The output folder is given relative to the working directory set by the
# `%cd` call at the top of the notebook, matching the other `results/ebm/...`
# paths, instead of repeating the absolute drive path.
# NOTE(review): the saved output of this cell ends with
# `IndexError: list index out of range`; the error appears to come from inside
# the helper - investigate there before relying on the exported files.
interaction_path = 'results/ebm/interactions/'
gam_results.interection_effect(path2save=interaction_path)

 35.47388251]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  # Get the frequency of the interaction cells
  if var1 not in labels_cat:
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.22422471 0.22422471 0.22422471 0.22422471
 0.22422471 0.22422471 0.22422471 0.22422471 0.22422471 0.22422471
 0.22422471 0.22422471 0.22422471 0.22422471 0.22422471 0.22422471
 0.22422471 0.22422471 0.22422471 0.22422471 0.22422471 0.22422471
 0.22422471 0.22422471 

IndexError: list index out of range

In [12]:
def _axis_values(raw_axis):
    """Return the coordinates used for one axis of an interaction term.

    Non-numeric input is treated as categorical labels and returned unchanged
    (as an array). Numeric input is treated as bin edges and replaced by the
    midpoints of consecutive edges.
    """
    axis = np.array(raw_axis)
    if np.issubdtype(axis.dtype, np.number):
        return (axis[:-1] + axis[1:]) / 2
    return axis


# Flatten every pairwise interaction term into long format:
# one record per (x, y) cell holding the effect value of that cell.
interaction_data = []

for pair_name, payload in gam_results.all_fscore["interaction"].items():
    # Term names have the form "<feature1> & <feature2>".
    feature1, feature2 = pair_name.split(" & ")

    # payload[0] / payload[1] describe the two axes, payload[2] holds the effects.
    x_vals = _axis_values(payload[0])
    y_vals = _axis_values(payload[1])
    effect_matrix = np.array(payload[2])

    # Decide once per term whether rows of the matrix run along y or along x.
    # NOTE(review): when both axes have the same length the shape test cannot
    # tell the two layouts apart and the (y, x) layout is used - confirm that
    # this matches how the helper stores the matrix.
    rows_follow_y = effect_matrix.shape == (len(y_vals), len(x_vals))

    interaction_data.extend(
        {
            "feature1": feature1,
            "feature2": feature2,
            "x": x,
            "y": y,
            "effect": effect_matrix[row, col] if rows_follow_y else effect_matrix[col, row],
        }
        for row, y in enumerate(y_vals)
        for col, x in enumerate(x_vals)
    )

df_interactions = pd.DataFrame(interaction_data)

In [13]:
# Save the long-format interaction effects next to the other EBM result files.
df_interactions.to_csv(path_or_buf="results/ebm/interactions.csv", index=False)