In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
import pickle as pkl
from sklearn import tree as tr
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from xgboost import XGBClassifier
import shap
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix

import pickle as pkl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold


In [4]:
def run_on_splits(func):
    def _run_loop(*args, **kwargs):
        for x,y,nsplit in zip([X_train, X_val, X_test],
                              [y_train, y_val, y_test],
                              ['train', 'val', 'test']):
            func(*args, X=x, y=y, nsplit=nsplit, **kwargs)
    return _run_loop

In [5]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
@run_on_splits
def evaluate_2(model, X, y, nsplit, model_name):
    ''' Evaluates the performance of a model 
    Args:
        model (sklearn.Estimator): fitted sklearn estimator
        X (np.array): predictors
        y (np.array): true outcome
        nsplit (str): name of the split
        model_name (str): an identifier for the model
    '''
    predict_y = model.predict(X)
    precision = precision_score(y, predict_y, average='macro')
    recall = recall_score(y, predict_y, average='macro')
    f1 = f1_score(y, predict_y, average='macro')
    accuracy = accuracy_score(y, predict_y)

    performances.append({
        'model': model_name,
        'split': nsplit,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })

# Example usage:
# evaluate_2(some_model, X_test, y_test, 'test_split', 'model_name')


In [6]:
def fit_and_evaluate_models_2(X_train, y_train, subset_name):
    ''' 
    Fits and evaluates models on a given subset of data 
    Args:
        X_train (pd.DataFrame): Training features
        y_train (pd.Series): Training labels
        subset_name (str): Name of the subset
    '''

    def fit_and_evaluate_2(model, param_grid, model_name):
        if len(param_grid) * min([len(v) for v in param_grid.values()]) <= 20:
            search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=StratifiedKFold(5), n_jobs=-1)
        else:
            search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring='f1', n_iter=20, cv=StratifiedKFold(5), n_jobs=-1)
        reg = search.fit(X_train, y_train)
        with open(f'/Users/sylvainestebe/Code/data_child/models/Q2_{model_name}_{subset_name}.pkl', 'wb') as file:
            pkl.dump(reg, file)  # save the model
        evaluate_2(model=search.best_estimator_, model_name=f'{model_name}_{subset_name}')

    # Dummy Classifier
    dummy_clf = DummyClassifier(strategy="constant", constant=1)
    dummy_clf.fit(X_train, y_train)
    evaluate_2(model=dummy_clf,model_name=f'dummy_{subset_name}')

    # Logistic Regression
    logistic_pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Adding scaler
        ('logistic', LogisticRegression(random_state=42, max_iter=5000))
    ])
    fit_and_evaluate_2(
        logistic_pipeline, 
        {'logistic__C': [0.01, 0.1, 1, 10, 100]}, 
        'logistic'
    )

    # SVM
    fit_and_evaluate_2(
        svm.SVC(), 
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}, 
        'svm'
    )

    # KNN
    knn_pipeline = Pipeline([("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
    fit_and_evaluate_2(
        knn_pipeline, 
        {'knn__n_neighbors': [3, 5, 7, 11]}, 
        'knn'
    )

    # Decision Tree
    fit_and_evaluate_2(
        tree.DecisionTreeClassifier(), 
        {'max_depth': [2, 5, 10, None], 'min_samples_split': [2, 5, 10]}, 
        'tree'
    )

    # Random Forest
    fit_and_evaluate_2(
        RandomForestClassifier(random_state=42), 
        {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]        }, 
        'randomforest'
    )

    # XGBoost
    fit_and_evaluate_2(
        XGBClassifier(random_state=42), 
        {
            'n_estimators': [10, 50, 100, 200,300],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'reg_alpha': [0, 0.1, 1],
            'reg_lambda': [1, 10, 100],
        }, 
        'xgboost'
    )

    # Neural Network
    nn_pipeline = Pipeline([("scaler", StandardScaler()), ("mlp", MLPClassifier(max_iter=1100, random_state=1))])
    
    # Call fit_and_evaluate with the correctly prefixed parameters
    fit_and_evaluate_2(
        nn_pipeline, 
        {
            'mlp__hidden_layer_sizes': [(50, 50), (100,), (100, 50)],
            'mlp__activation': ['tanh', 'relu'],
            'mlp__solver': ['adam', 'sgd'],
            'mlp__alpha': [0.0001, 0.001, 0.01],
            'mlp__learning_rate': ['constant', 'adaptive'],
            'mlp__max_iter': [200, 400, 600]        
        }, 
        'neural_network'
    )
        # HistGradientBoostingClassifier
    fit_and_evaluate_2(
        HistGradientBoostingClassifier(random_state=42),
        {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_iter': [100, 200,300],
            'max_depth': [3, 6, 9],
            'l2_regularization': [0, 0.1, 1, 10]

        },
        'histgradientboosting'
    )


# Start

In [7]:
# Load dataset 
train = pd.read_csv(f'/Users/sylvainestebe/Code/data_child/data/train_question2.csv', index_col=0)
val = pd.read_csv(f'/Users/sylvainestebe/Code/data_child/data/val_question2.csv', index_col=0)
test = pd.read_csv(f'/Users/sylvainestebe/Code/data_child/data/test_question2.csv', index_col=0)
performances = []


# Load dataset features selections
train_features = pd.read_csv(f'/Users/sylvainestebe/Code/data_child/data/train_question2_features.csv', index_col=0)
val_features = pd.read_csv(f'/Users/sylvainestebe/Code/data_child/data/val_question2_features.csv', index_col=0)
test_features = pd.read_csv(f'/Users/sylvainestebe/Code/data_child/data/test_question2_features.csv', index_col=0)
performances = []

In [8]:
## Features selections
X_train, y_train = train_features.iloc[:,:-1], train_features.iloc[:,-1]
X_val, y_val = val_features.iloc[:,:-1], val_features.iloc[:,-1]
X_test, y_test = test_features.iloc[:,:-1], test_features.iloc[:,-1]

fit_and_evaluate_models_2(X_train,y_train,subset_name="features selections")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "/Users/sylvainestebe/Code/data_child/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sylvainestebe/Code/data_child/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sylvainestebe/Code/data_child/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 