In [1]:
import config
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

In [2]:
# Read the preprocessed features (X) and target (y) from their pickle files.
X, y = map(pd.read_pickle, ("../data/clean/X_data.pkl", "../data/clean/y_data.pkl"))

In [3]:
# Hold out a test split; config.stratify switches class-stratified splitting on or off.
stratify_labels = y if config.stratify else None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=config.test_size,
    random_state=config.random_state,
    stratify=stratify_labels,
)

# Persist the held-out split so the model_evaluation notebook can reuse it.
X_test.to_pickle("../data/clean/X_test.pkl")
y_test.to_pickle("../data/clean/y_test.pkl")

## Random Forest

In [4]:
# Fit the baseline random forest with the hyperparameters from config.rf_params.
rf_model = RandomForestClassifier(**config.rf_params)
rf_model.fit(X=X_train, y=y_train)

In [5]:
# Predict the held-out split with the baseline forest and print its metrics.
rf_preds = rf_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=rf_preds))
print(
    "Random Forest - Classification Report:",
    classification_report(y_true=y_test, y_pred=rf_preds),
    sep="\n",
)

Confusion Matrix:
 [[3771   40]
 [  84  156]]
Random Forest - Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3811
           1       0.80      0.65      0.72       240

    accuracy                           0.97      4051
   macro avg       0.89      0.82      0.85      4051
weighted avg       0.97      0.97      0.97      4051



In [6]:
# Randomized hyperparameter search for the random forest.
# Candidates are drawn from config.rf_param_dist; the remaining search settings
# come from config.search_cv_params.
rf_search_model = RandomizedSearchCV(
    RandomForestClassifier(random_state=config.random_state),
    config.rf_param_dist,
    **config.search_cv_params,
)
rf_search_model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [7]:
# Predict the held-out split with the tuned forest and print its metrics.
rf_search_preds = rf_search_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=rf_search_preds))
print(
    "Random Forest (Tuned) - Classification Report:",
    classification_report(y_true=y_test, y_pred=rf_search_preds),
    sep="\n",
)

Confusion Matrix:
 [[3742   69]
 [  79  161]]
Random Forest (Tuned) - Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3811
           1       0.70      0.67      0.69       240

    accuracy                           0.96      4051
   macro avg       0.84      0.83      0.83      4051
weighted avg       0.96      0.96      0.96      4051



## Logistic Regression

In [8]:
# Standardize the features: statistics are learned from the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the model_evaluation notebook can apply the same
# transform (needed only if the final chosen model is a logistic regression).
with open("../models/logreg_scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, file=scaler_file)

In [9]:
# Fit the baseline logistic regression on the standardized training features,
# with hyperparameters taken from config.logreg_params.
logreg_model = LogisticRegression(**config.logreg_params)
logreg_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predict the standardized held-out split with the baseline logistic regression
# and print its metrics.
logreg_preds = logreg_model.predict(X_test_scaled)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=logreg_preds))
print(
    "Logistic Regression - Classification Report:",
    classification_report(y_true=y_test, y_pred=logreg_preds),
    sep="\n",
)

Confusion Matrix:
 [[3765   46]
 [  81  159]]
Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3811
           1       0.78      0.66      0.71       240

    accuracy                           0.97      4051
   macro avg       0.88      0.83      0.85      4051
weighted avg       0.97      0.97      0.97      4051



In [11]:
# Randomized hyperparameter search for logistic regression.
# Candidates are drawn from config.logreg_param_dist; the remaining search
# settings come from config.search_cv_params.
# The base estimator is seeded from config.random_state (as the random forest
# search already is) so that any stochastic solver sampled by the search gives
# repeatable results across runs.
# NOTE(review): X_train_scaled was standardized with a scaler fit on the whole
# training split, so every CV fold's validation part contributed to the scaling
# statistics. Wrapping scaler + model in a Pipeline would avoid that; confirm
# whether it matters for this data.
logreg_search_model = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=config.random_state),
    param_distributions=config.logreg_param_dist,
    **config.search_cv_params
)
logreg_search_model.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits




In [12]:
# Predict the standardized held-out split with the tuned logistic regression
# and print its metrics.
logreg_search_preds = logreg_search_model.predict(X_test_scaled)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=logreg_search_preds))
print(
    "Logistic Regression (Tuned) - Classification Report:",
    classification_report(y_true=y_test, y_pred=logreg_search_preds),
    sep="\n",
)

Confusion Matrix:
 [[3765   46]
 [  81  159]]
Logistic Regression (Tuned) - Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3811
           1       0.78      0.66      0.71       240

    accuracy                           0.97      4051
   macro avg       0.88      0.83      0.85      4051
weighted avg       0.97      0.97      0.97      4051



## Compare Models

In [13]:
# F1, precision, and recall for all four models, scored on the held-out test split.
# NOTE(review): the model choice below is made from these test-split scores, so
# they are an optimistic estimate of the chosen model's performance.
model_preds = (rf_preds, rf_search_preds, logreg_preds, logreg_search_preds)

rf_f1, rf_search_f1, logreg_f1, logreg_search_f1 = (
    f1_score(y_test, preds) for preds in model_preds
)
rf_precision, rf_search_precision, logreg_precision, logreg_search_precision = (
    precision_score(y_test, preds) for preds in model_preds
)
rf_recall, rf_search_recall, logreg_recall, logreg_search_recall = (
    recall_score(y_test, preds) for preds in model_preds
)

# One summary line per model, in the same order as model_preds.
print(
    f"Random Forest - F1: {rf_f1:.3f}, Precision: {rf_precision:.3f}, Recall: {rf_recall:.3f}",
    f"Tuned Random Forest - F1: {rf_search_f1:.3f}, Precision: {rf_search_precision:.3f}, Recall: {rf_search_recall:.3f}",
    f"Logistic Regression - F1: {logreg_f1:.3f}, Precision: {logreg_precision:.3f}, Recall: {logreg_recall:.3f}",
    f"Tuned Logistic Regression - F1: {logreg_search_f1:.3f}, Precision: {logreg_search_precision:.3f}, Recall: {logreg_search_recall:.3f}",
    sep="\n",
)


Random Forest - F1: 0.716, Precision: 0.796, Recall: 0.650
Tuned Random Forest - F1: 0.685, Precision: 0.700, Recall: 0.671
Logistic Regression - F1: 0.715, Precision: 0.776, Recall: 0.662
Tuned Logistic Regression - F1: 0.715, Precision: 0.776, Recall: 0.662


The base logistic regression model is my preferred model for predicting All-Stars. Its recall (0.662) is more than 1 percentage point better than the base random forest's (0.650). While the tuned RF model boasts the highest recall (0.671), I don't believe this justifies the significant drop in precision (0.700, which also gives it the lowest F1 score of all the models). I prefer the higher recall of the logistic regression model over the base random forest, because the point of the model is to correctly classify the minority class, and I find the slightly lower precision (0.776 vs. 0.796) justified. The tuned logistic regression scored identically to the base model, so tuning added nothing here. I will move forward with the base logistic regression model.

In [14]:
# save the best model
with open("../models/all_star_model.pkl", "wb") as f:
    pickle.dump(logreg_model, f)