In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=f08e7232b275f2532f37a0d4b0cb41d4bc1fae764fc2a46a80fca566e7504a08
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


pip install optuna

In [None]:
pip install shap



In [None]:
import numpy as np
import pandas as pd
from pathlib import Path


# Scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import lime
from lime.lime_tabular import LimeTabularExplainer
import shap
import optuna
import joblib

In [None]:
DATA_DIR = Path('drive/MyDrive/retail')
required_files = ['races.csv','results.csv','drivers.csv','constructors.csv','circuits.csv']

races = pd.read_csv(DATA_DIR / 'races.csv')
results = pd.read_csv(DATA_DIR / 'results.csv')
drivers = pd.read_csv(DATA_DIR / 'drivers.csv')
constructors = pd.read_csv(DATA_DIR / 'constructors.csv')
circuits = pd.read_csv(DATA_DIR / 'circuits.csv')


# create target: podium (top3)
df = results.copy()
df['position'] = pd.to_numeric(df['position'], errors='coerce').fillna(0).astype(int)
df['podium'] = (df['position'] <= 3).astype(int)


if 'raceId' in races.columns:
  df = df.merge(races[['raceId','year','circuitId']], on='raceId', how='left')



drv_merge_cols = ['driverId']
if 'nationality' in drivers.columns:
  drv_merge_cols.append('nationality')
if 'dob' in drivers.columns:
  drv_merge_cols.append('dob')
df = df.merge(drivers[drv_merge_cols], on='driverId', how='left')



if 'constructorId' in constructors.columns and 'name' in constructors.columns:
  df = df.merge(constructors[['constructorId','name']], on='constructorId', how='left')
  df.rename(columns={'name':'constructor_name'}, inplace=True)
else:
  df['constructor_name'] = df['constructorId'].astype(str)


# Select features — keep it simple and reproducible
features = ['grid','year','circuitId','constructor_name','driverId']
df = df[df['grid'].notna()]
data = df[features + ['podium']].copy()



data['driverId'] = data['driverId'].astype(str)
data['constructor_name'] = data['constructor_name'].astype(str)
data['circuitId'] = data['circuitId'].astype(str)


data = data.dropna()
print('\nPrepared dataset shape:', data.shape)
display(data.head())


Prepared dataset shape: (26759, 6)


Unnamed: 0,grid,year,circuitId,constructor_name,driverId,podium
0,1,2008,1,McLaren,1,1
1,5,2008,1,BMW Sauber,2,1
2,7,2008,1,Williams,3,1
3,11,2008,1,Renault,4,0
4,3,2008,1,McLaren,5,0


In [None]:
numeric_features = ['grid','year']
categorical_features = ['circuitId','constructor_name','driverId']


preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
])


base_clf = RandomForestClassifier(random_state=42)

pipe = Pipeline(steps=[('pre', preprocessor), ('clf', base_clf)])


X = data[features]
y = data['podium']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print('\nTrain/Test split:', X_train.shape, X_test.shape)


Train/Test split: (21407, 5) (5352, 5)


In [None]:
param_grid = {
  'clf__n_estimators': [100, 200],
  'clf__max_depth': [10, 20, None],
  'clf__max_features': ['sqrt', 'log2']
}


from scipy.stats import randint
param_dist = {
  'clf__n_estimators': randint(50, 300),
  'clf__max_depth': [5,10,15,20,None],
  'clf__min_samples_split': randint(2,10),
  'clf__max_features': ['sqrt','log2', None]
}


print('\nDefined hyperparameter search spaces.')


Defined hyperparameter search spaces.


In [None]:
gs = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
print('\nStarting GridSearchCV (this may take a while)...')
gs.fit(X_train, y_train)
print('\nGridSearch best score:', gs.best_score_)
print('GridSearch best params:', gs.best_params_)


rs = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=20, cv=3, n_jobs=-1, scoring='accuracy', verbose=1, random_state=42)
print('\nStarting RandomizedSearchCV...')
rs.fit(X_train, y_train)
print('\nRandomSearch best score:', rs.best_score_)
print('RandomSearch best params:', rs.best_params_)


Starting GridSearchCV (this may take a while)...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

GridSearch best score: 0.6736114864617148
GridSearch best params: {'clf__max_depth': 20, 'clf__max_features': 'sqrt', 'clf__n_estimators': 100}

Starting RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

RandomSearch best score: 0.6765544097760152
RandomSearch best params: {'clf__max_depth': 10, 'clf__max_features': None, 'clf__min_samples_split': 7, 'clf__n_estimators': 239}


In [None]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 50, 200)
  max_depth = trial.suggest_categorical('max_depth', [5,10,15,20,None])
  max_features = trial.suggest_categorical('max_features', ['sqrt','log2', None])
  min_samples_split = trial.suggest_int('min_samples_split', 2, 10)


  pipe.set_params(clf__n_estimators=n_estimators,
                  clf__max_depth=max_depth,
                  clf__max_features=max_features,
                  clf__min_samples_split=min_samples_split)
  from sklearn.model_selection import cross_val_score
  scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
  return float(np.mean(scores))


study = optuna.create_study(direction='maximize')
print('\nStarting Optuna study (10 trials example)...')
study.optimize(objective, n_trials=10)
print('\nOptuna best value:', study.best_value)
print('Optuna best params:', study.best_params)

[I 2025-11-24 21:09:02,383] A new study created in memory with name: no-name-4c6627db-415f-4a3f-8d0f-8b838491a51e



Starting Optuna study (10 trials example)...


[I 2025-11-24 21:14:51,979] Trial 0 finished with value: 0.6760872750384167 and parameters: {'n_estimators': 178, 'max_depth': 15, 'max_features': None, 'min_samples_split': 7}. Best is trial 0 with value: 0.6760872750384167.
[I 2025-11-24 21:15:28,621] Trial 1 finished with value: 0.673004230288594 and parameters: {'n_estimators': 170, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 6}. Best is trial 0 with value: 0.6760872750384167.
[I 2025-11-24 21:19:15,314] Trial 2 finished with value: 0.6744523132770412 and parameters: {'n_estimators': 101, 'max_depth': 20, 'max_features': None, 'min_samples_split': 8}. Best is trial 0 with value: 0.6760872750384167.
[I 2025-11-24 21:19:55,278] Trial 3 finished with value: 0.6624467875051195 and parameters: {'n_estimators': 69, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 3}. Best is trial 0 with value: 0.6760872750384167.
[I 2025-11-24 21:20:04,057] Trial 4 finished with value: 0.6587564538481119 and parameters: 


Optuna best value: 0.6760872750384167
Optuna best params: {'n_estimators': 178, 'max_depth': 15, 'max_features': None, 'min_samples_split': 7}


In [None]:
candidates = {}
if 'gs' in locals():
  candidates['GridSearch'] = gs.best_estimator_
if 'rs' in locals():
  candidates['RandomSearch'] = rs.best_estimator_
if optuna is not None and 'study' in locals():
  pipe.set_params(**{f'clf__{k}': v for k,v in study.best_params.items()})
  pipe.fit(X_train, y_train)
  candidates['Optuna'] = pipe



best_name = None
best_acc = -1
for name, model in candidates.items():
  preds = model.predict(X_test)
  acc = accuracy_score(y_test, preds)
  print(f"{name} test accuracy: {acc:.4f}")
  if acc > best_acc:
    best_acc = acc
    best_name = name
    final_model = model


print('\nSelected best model:', best_name, 'with test acc =', best_acc)

GridSearch test accuracy: 0.6779
RandomSearch test accuracy: 0.6794
Optuna test accuracy: 0.6762

Selected best model: RandomSearch with test acc = 0.679372197309417


In [None]:
if not isinstance(final_model, Pipeline):
  final_model = Pipeline([('pre', preprocessor), ('clf', final_model)])
  final_model.fit(X_train, y_train)


pre = final_model.named_steps['pre']
clf = final_model.named_steps['clf']


X_train_trans = pre.transform(X_train)

numeric_features = ['grid', 'year']

ohe = pre.named_transformers_['cat']

ohe_input_features = list(ohe.feature_names_in_)
ohe_feature_names = list(ohe.get_feature_names_out(ohe_input_features))
transformed_feature_names = numeric_features + ohe_feature_names
print('Transformed feature count =', len(transformed_feature_names))

Transformed feature count = 1104


In [None]:
try:
  print('\nComputing SHAP values on a sample (fast)...')
  background = shap.sample(X_train_trans, 200, random_state=42)
  explainer_shap = shap.TreeExplainer(clf)
  shap_values_sample = explainer_shap.shap_values(background)
  print('Computed SHAP (sample).')
except Exception as e:
    print('Error computing SHAP:', e)
    explainer_shap = None


Computing SHAP values on a sample (fast)...
Computed SHAP (sample).


In [None]:
if LimeTabularExplainer is not None:
  lime_explainer = LimeTabularExplainer(
  training_data=X_train_trans,
  feature_names=transformed_feature_names,
  class_names=['not_podium','podium'],
  mode='classification'
  )
else:
  lime_explainer = None

In [None]:
def calculator(raw_instance: dict, top_k: int = 7):
  """
  raw_instance: dict with keys matching the raw feature names (columns of X_train), e.g.
  {'grid': 2, 'year': 2019, 'circuitId':'1', 'constructor_name':'Ferrari', 'driverId':'1'}
  Returns: dict with prediction, probability, lime list, shap top_k
  """
  inst_df = pd.DataFrame([raw_instance])

  missing_cols = [c for c in X_train.columns if c not in inst_df.columns]
  if missing_cols:

    for c in missing_cols:
      inst_df[c] = np.nan
  inst_df = inst_df[X_train.columns]



  pred_proba = final_model.predict_proba(inst_df)[0,1]
  pred_label = int(final_model.predict(inst_df)[0])



  inst_trans = pre.transform(inst_df)


  lime_res = None
  if lime_explainer is not None:
    try:
      def predict_proba_transformed(X_trans):
        return clf.predict_proba(X_trans)


      exp = lime_explainer.explain_instance(inst_trans[0], predict_proba_transformed, num_features=top_k)
      lime_res = exp.as_list(label=1)
    except Exception as e:
      lime_res = f'LIME error: {e}'


  shap_res = None
  if explainer_shap is not None:
    try:
      sv = explainer_shap.shap_values(inst_trans)
      # For binary classification, TreeExplainer returns values for each class.
      # It can be a list of 2D arrays or a single 3D array.
      # We want the SHAP values for class 1 ('podium') for the single instance.
      if isinstance(sv, list):
          # If sv is a list of arrays: [shap_values_class0, shap_values_class1]
          # Each inner array has shape (num_instances, num_features)
          shap_vals_instance = sv[1][0] # Get class 1, first instance
      elif isinstance(sv, np.ndarray) and sv.ndim == 3:
          # If sv is a 3D array: (num_instances, num_classes, num_features)
          shap_vals_instance = sv[0, 1, :] # Get first instance, class 1, all features
      else:
          raise ValueError(f"Unexpected SHAP values format: {type(sv)} with ndim={getattr(sv, 'ndim', 'N/A')}")

      shap_df = pd.DataFrame({'feature': transformed_feature_names, 'shap_value': shap_vals_instance})
      shap_df = shap_df.reindex(shap_df['shap_value'].abs().sort_values(ascending=False).index)
      shap_res = shap_df.head(top_k).to_dict(orient='records')
    except Exception as e:
      shap_res = f'SHAP error: {e}'


  return {
        'pred_label': pred_label,
        'pred_proba': float(pred_proba),
        'lime': lime_res,
        'shap': shap_res
  }


if 'X_test' in locals():
  example_inst = X_test.iloc[0].to_dict()
  print('\nCalculator example input (raw):')
  print(example_inst)
  print('\nCalculator example output:')
  print(calculator(example_inst))


Calculator example input (raw):
{'grid': 7, 'year': 1950, 'circuitId': '14', 'constructor_name': 'Alfa Romeo', 'driverId': '579'}

Calculator example output:
{'pred_label': 1, 'pred_proba': 0.5945665214508943, 'lime': [('driverId_740 <= 0.00', 0.2674565751950144), ('driverId_597 <= 0.00', 0.26724071205091493), ('driverId_467 <= 0.00', 0.1651952136490291), ('constructor_name_Martini <= 0.00', -0.13604822945112358), ('driverId_834 <= 0.00', -0.12064576512618812), ('driverId_729 <= 0.00', -0.10634500435967205), ('constructor_name_BRM-Ford <= 0.00', -0.013412996607359853)], 'shap': 'SHAP error: All arrays must be of the same length'}
