In [54]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Load data
train_data = pd.read_csv('../split_data/train_test_data.csv')
test_data = pd.read_csv('../split_data/validation_data.csv')

print(f"Training data size: {len(train_data)}, Test data size: {len(test_data)}")

Training data size: 9960, Test data size: 1107


## Pre-process the data, for both training and validation

In [55]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def preprocess_data(data, target_column):
    """
    Preprocesses data for use in machine learning pipelines.

    Args:
        data (pd.DataFrame): Input data.
        target_column (str): The name of the target column.

    Returns:
        pd.DataFrame: Preprocessed data with numeric and encoded features.
        pd.Series: Target values.
        dict: Encoders for categorical columns.
    """
    # Handle missing values and convert categorical columns
    label_encoders = {}
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns
    
    # Encode categorical columns
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str).fillna("missing"))
        label_encoders[col] = le  # Save the encoder for future use
    
    # Fill missing numeric values with the median
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())
    
    le = LabelEncoder()
    # Split features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]
    y = le.fit_transform(y)
    
    return X, y, label_encoders


## Training the data and saving the model to a file

In [56]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score


X_train, y_train, label_encoders = preprocess_data(train_data, "home_team_win")


# Build a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('feature_selection', SelectFromModel(XGBClassifier(eval_metric='logloss'))),
    ('model', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter tuning
param_grid = {
    'feature_selection__estimator__n_estimators': [50, 100, 200],
    'feature_selection__estimator__max_depth': [3, 5, 7],
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

# Save and evaluate
model_filename = 'best_xgboost_model.pkl'
joblib.dump(grid_search.best_estimator_, model_filename)
print(f"Model saved to {model_filename}")


Fitting 5 folds for each of 243 candidates, totalling 1215 fits




Model saved to best_xgboost_model.pkl


# Evaluating Performance via Validation Data

In [60]:
# Load the trained model
model_filename = 'best_random_forest_model.pkl'
try:
    loaded_model = joblib.load(model_filename)
    print(f"Model loaded successfully from {model_filename}")
except FileNotFoundError:
    print(f"Model file not found: {model_filename}")
    exit()

# Load test data here
try:
    test_data = pd.read_csv('../split_data/validation_data.csv')
    print(f"Test data loaded successfully")
except FileNotFoundError as fe:
    print(f"Test data file not found {fe}")
    exit()

# Preprocess test data
# Assuming label_encoders were not saved; preprocessing will handle encoding afresh
test_data, _, _ = preprocess_data(train_data, "home_team_win")

# Separate features and target
X_test = test_data.drop(columns=['home_team_win'], errors='ignore')
y_test = test_data['home_team_win'] if 'home_team_win' in test_data else None

# Predict and evaluate
y_pred = loaded_model.predict(X_test)

if y_test is not None:
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
else:
    print("Target variable not found in test data. Predictions only:")
    print(y_pred)


Model loaded successfully from best_random_forest_model.pkl
Test data loaded successfully


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- away_batting_RBI_10RA
- away_batting_RBI_mean
- away_batting_RBI_skew
- away_batting_RBI_std
- away_batting_batting_avg_10RA
- ...
