In [None]:
#https://www.kaggle.com/code/dansbecker/partial-plots
import os
import pandas as pd
import shap
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

# Make sure the output directory for the saved plots exists up front;
# exist_ok=True keeps re-running this cell from raising if it is already there.
plots_dir = "pdp_plots"
os.makedirs(plots_dir, exist_ok=True)

In [None]:
from sklearn.inspection import partial_dependence

# Directory holding the pickled train/test DataFrames; only split 0 is used here.
folder_path = "train_test_pickles"
train_pickle_path = os.path.join(folder_path, "train_df_0.pickle")
test_pickle_path = os.path.join(folder_path, "test_df_0.pickle")

# NOTE(review): unpickling executes whatever the file contains, so these
# pickles must come from a trusted source.
train_df = pd.read_pickle(train_pickle_path)
test_df = pd.read_pickle(test_pickle_path)

# Column used as the regression target.
target_column = 'normalizzed_rtt'

# Columns excluded from the feature matrix. The target itself is in this list,
# so it never ends up among the features.
cols_dropped = [
    'date',
    'last_rtt',
    'prb_id',
    'dst_id',
    'normalizzed_rtt',
    'src_names',
    'distance',
    'Latitude_source',
    'Longitude_source',
    'Latitude_destination',
    'Longitude_destination',
    'Public_destination',
    'Public_source',
    'norm_storedtimestamp',
]

# Features (X) are everything that is left after dropping the columns above;
# the target (y) is read from the untouched frames.
X_train = train_df.drop(columns=cols_dropped)
y_train = train_df[target_column]

X_test = test_df.drop(columns=cols_dropped)
y_test = test_df[target_column]

# Build a 10-tree random forest regressor with a fixed seed (random_state=42)
# and fit it on the training split in one go; fit() returns the estimator
# itself, so rand_forest is the trained model.
rand_forest = RandomForestRegressor(
    n_estimators=10,
    criterion='squared_error',
    random_state=42,
).fit(X_train, y_train)

# Number of training rows fed to the SHAP explainer and number of features
# kept for plotting. Named here so the comments and the code cannot drift
# apart (the previous comments said "top 6" while the code selected 10).
shap_sample_size = 10000
n_top_features = 10

# Compute SHAP values on the first `shap_sample_size` rows of the training
# data. `.iloc` makes the row slice explicitly positional, independent of the
# DataFrame's index type.
explainer_shap = shap.Explainer(rand_forest)
shap_values = explainer_shap(X_train.iloc[:shap_sample_size])

# Rank features by mean absolute SHAP value (averaged over rows, axis=0).
# argsort() is ascending, so the last `n_top_features` positions are the
# highest-ranked features, ordered from least to most important.
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
top_features_idx = mean_abs_shap.argsort()[-n_top_features:]

# Create and save one partial dependence plot per selected feature.
# Each from_estimator() call opens a new matplotlib figure, so the figure is
# saved through its own handle and then closed; merely clearing it
# (plt.clf()) would leave one open figure per iteration.
for feature_idx in top_features_idx:
    # NOTE(review): to draw markers instead of a connected line, from_estimator
    # takes a `line_kw` dict that is forwarded to matplotlib's plot call, e.g.
    # line_kw={"linestyle": "none", "marker": "o"} — confirm against the
    # installed scikit-learn version before relying on it.
    pd_display = PartialDependenceDisplay.from_estimator(
        estimator=rand_forest,
        X=X_train,
        features=[feature_idx],
    )

    # Save into the 'pdp_plots' folder, named after the feature's column index.
    pd_display.figure_.savefig(f"pdp_plots/pdp_plot_feature_{feature_idx}.png")

    # Release the figure so memory use stays flat across iterations.
    plt.close(pd_display.figure_)