In [12]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Read the two pre-split CSV files: the first is used for fitting, the second for scoring
train_csv = '../split_data/train_test_data.csv'
held_out_csv = '../split_data/validation_data.csv'
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_csv, held_out_csv))

# Quick sanity check on how many rows each file contributed
print(f"Training data size: {len(train_data)}, Test data size: {len(test_data)}")

Training data size: 9960, Test data size: 1107


## Pre-process the data, for both training and validation

In [13]:
# Preprocessing function
def preprocess_data(data, label_encoders=None):
    """Turn a raw DataFrame into an all-numeric frame ready for the model.

    Steps, in order:
      1. Drop the non-predictive 'id' and 'date' columns (ignored if absent).
      2. Integer-encode every object-dtype column (values are cast to str
         first, so missing values become the literal category 'nan').
      3. Fill remaining missing values with the per-column median of the
         frame passed in.

    Args:
        data: DataFrame to preprocess. It is not modified; a new frame is
            returned.
        label_encoders: Optional dict mapping column name -> already fitted
            encoder (anything exposing ``classes_``, e.g. the dict returned
            by a previous call on the training data). When given, those
            encoders are reused so that a category receives the same integer
            code as it did at training time; labels the encoder has never
            seen are encoded as -1. Columns without an entry are fitted
            afresh. When None (default) every column gets a newly fitted
            LabelEncoder, which is the original behaviour.

    Returns:
        (processed DataFrame, dict of column name -> encoder used).
    """
    # Drop non-predictive columns
    drop_cols = ['id', 'date']
    data = data.drop(columns=drop_cols, errors='ignore')

    # Handle categorical variables
    categorical_cols = data.select_dtypes(include=['object']).columns
    # Copy so the caller's dict is never mutated when new columns get fitted.
    encoders = {} if label_encoders is None else dict(label_encoders)
    for col in categorical_cols:
        values = data[col].astype(str)
        if col in encoders:
            # Reuse the fitted encoder: look codes up in its class list rather
            # than calling transform(), which raises on unseen labels.
            code_of = {label: code for code, label in enumerate(encoders[col].classes_)}
            data[col] = values.map(code_of).fillna(-1).astype(int)
        else:
            le = LabelEncoder()
            data[col] = le.fit_transform(values)
            encoders[col] = le  # Save encoders if needed later

    # Handle missing values
    data = data.fillna(data.median(numeric_only=True))

    return data, encoders

## Training the model and saving it to a file

In [14]:
# Preprocess train and test data
train_data, train_encoders = preprocess_data(train_data)
test_data, test_encoders = preprocess_data(test_data)

# Each preprocess_data call above fits its own LabelEncoders, so the integer
# code of a category in the test frame can differ from its code in the training
# frame whenever the two files do not contain exactly the same set of values.
# Re-express every encoded test column in the training encoder's codes so the
# model sees consistent inputs; labels never seen in training become -1.
for col, train_le in train_encoders.items():
    if col not in test_encoders or col not in test_data.columns:
        continue
    original_labels = test_encoders[col].inverse_transform(test_data[col])
    code_of = {label: code for code, label in enumerate(train_le.classes_)}
    test_data[col] = (
        pd.Series(original_labels, index=test_data.index)
        .map(code_of)
        .fillna(-1)
        .astype(int)
    )

# Split features and target
X_train = train_data.drop(columns=['home_team_win'])
y_train = train_data['home_team_win']

X_test = test_data.drop(columns=['home_team_win'])
y_test = test_data['home_team_win']

# Build a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('model', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# Hyperparameter tuning: 3 * 3 * 3 * 3 = 81 candidates, each scored with 5-fold CV
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

# Best model (already refit on the full training set by GridSearchCV)
best_model = grid_search.best_estimator_

# Save the trained model
model_filename = 'best_random_forest_model.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved to {model_filename}")

# Cross-validation performance
# The grid search already scored the winning candidate on the same 5 folds with
# the same metric and fixed random_state, so read those per-fold scores back
# instead of refitting the model five more times with cross_val_score.
best_idx = grid_search.best_index_
cv_scores = pd.Series([
    grid_search.cv_results_[f'split{fold}_test_score'][best_idx]
    for fold in range(grid_search.n_splits_)
]).to_numpy()
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Load the model and evaluate on test data (round-trips through the file to
# confirm the saved artefact is usable)
loaded_model = joblib.load(model_filename)
y_pred = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature Importance
importances = loaded_model.named_steps['model'].feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Top 10 Important Features:\n", importance_df.head(10))


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Model saved to best_random_forest_model.pkl
Cross-Validation Accuracy: 0.5472 ± 0.0073
Test Accuracy: 0.5709
Classification Report:
               precision    recall  f1-score   support

       False       0.53      0.49      0.51       504
        True       0.60      0.64      0.62       603

    accuracy                           0.57      1107
   macro avg       0.57      0.56      0.56      1107
weighted avg       0.57      0.57      0.57      1107

Top 10 Important Features:
                                  Feature  Importance
25   away_pitching_SO_batters_faced_10RA    0.011180
113  away_pitching_SO_batters_faced_mean    0.010860
149   away_pitcher_SO_batters_faced_mean    0.008849
44                 home_team_spread_mean    0.008672
23   home_pitching_BB_batters_faced_10RA    0.008520
92     home_pitching_earned_run_avg_mean    0.008229
77         away_batting_onbase_perc_mean    0.008216
47                 away_te

# Evaluating Performance via Validation Data

In [15]:
# Load the trained model
model_filename = 'best_random_forest_model.pkl'
try:
    loaded_model = joblib.load(model_filename)
except FileNotFoundError:
    print(f"Model file not found: {model_filename}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts down
    # the kernel and discards all state, whereas raising just stops this cell.
    raise
else:
    print(f"Model loaded successfully from {model_filename}")

# Load test data here
try:
    test_data = pd.read_csv('../split_data/validation_data.csv')
except FileNotFoundError as fe:
    print(f"Test data file not found {fe}")
    raise
else:
    print("Test data loaded successfully")

# Preprocess test data
# NOTE: the encoders fitted during training are not persisted with the model,
# so preprocess_data re-fits them on this file. The resulting integer codes
# match the training codes only if this file contains the same set of values in
# every categorical column as the training data did.
test_data, _ = preprocess_data(test_data)

# Separate features and target (the target may be absent for unlabeled data)
X_test = test_data.drop(columns=['home_team_win'], errors='ignore')
y_test = test_data['home_team_win'] if 'home_team_win' in test_data else None

# Predict and evaluate
y_pred = loaded_model.predict(X_test)

if y_test is not None:
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
else:
    print("Target variable not found in test data. Predictions only:")
    print(y_pred)


Model loaded successfully from best_random_forest_model.pkl
Test data loaded successfully
Test Accuracy: 0.5709
Classification Report:
               precision    recall  f1-score   support

       False       0.53      0.49      0.51       504
        True       0.60      0.64      0.62       603

    accuracy                           0.57      1107
   macro avg       0.57      0.56      0.56      1107
weighted avg       0.57      0.57      0.57      1107

