In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Load data: the first CSV becomes the training frame, the second the
# held-out frame used for the final evaluation.
train_data, test_data = map(
    pd.read_csv,
    ('../split_data/train_test_data.csv', '../split_data/validation_data.csv'),
)

# Preprocessing
def preprocess_data(data, label_encoders=None):
    """Drop identifier columns, integer-encode categoricals, impute numerics.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    label_encoders : dict or None, optional
        Mapping of column name -> already-fitted encoder (any object exposing
        a ``classes_`` sequence, e.g. a fitted ``LabelEncoder``). Columns
        listed here are encoded with the given encoder instead of a freshly
        fitted one, so a second frame (e.g. the test set) receives the same
        integer codes as the frame the encoders were fitted on. Values that
        are absent from ``classes_`` are encoded as ``-1``. When omitted,
        every object-dtype column gets a newly fitted ``LabelEncoder``.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The processed frame and a dict of column name -> encoder covering
        every column that was encoded (supplied encoders plus newly fitted
        ones). The dict passed in by the caller is never mutated.
    """
    # Drop non-predictive columns. ``drop`` returns a new frame, so the column
    # assignments below never touch the caller's object.
    drop_cols = ['id', 'date']
    data = data.drop(columns=drop_cols, errors='ignore')

    # Work on a shallow copy so the caller's mapping is left unchanged.
    encoders = dict(label_encoders) if label_encoders else {}

    # Handle categorical variables: every object-dtype column, plus any column
    # that has a supplied encoder even if its dtype was inferred differently
    # in this frame.
    categorical_cols = list(data.select_dtypes(include=['object']).columns)
    categorical_cols += [
        col for col in encoders
        if col in data.columns and col not in categorical_cols
    ]

    for col in categorical_cols:
        # Missing values become the literal string 'nan' and are encoded as
        # their own category.
        as_text = data[col].astype(str)
        if col in encoders:
            # Reuse the fitted label -> code assignment; unseen labels -> -1.
            code_of = {
                label: code
                for code, label in enumerate(encoders[col].classes_)
            }
            data[col] = as_text.map(code_of).fillna(-1).astype(int)
        else:
            le = LabelEncoder()
            data[col] = le.fit_transform(as_text)
            encoders[col] = le  # Save encoders so other frames can reuse them

    # Handle missing values: fill numeric NaNs with this frame's column medians.
    data = data.fillna(data.median(numeric_only=True))

    return data, encoders

# Preprocess train and test data.
# Encoders are fitted on the training frame only. The test frame's categorical
# columns are mapped through those same fitted encoders *before* it is
# preprocessed, so a given category gets the same integer code in both frames.
# (A LabelEncoder fitted separately on each frame numbers categories from that
# frame's own set of values, so the codes would not line up.)
train_data, train_encoders = preprocess_data(train_data)

test_data = test_data.copy()  # keep the frame returned by read_csv untouched
for col, encoder in train_encoders.items():
    if col not in test_data.columns:
        continue
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    # Categories never seen during training get the sentinel code -1.
    test_data[col] = (
        test_data[col].astype(str).map(code_of).fillna(-1).astype(int)
    )
# The remaining steps (dropping id/date, median imputation) still run here.
test_data, _ = preprocess_data(test_data)

# Split features and target
X_train = train_data.drop(columns=['home_team_win'])
y_train = train_data['home_team_win']

X_test = test_data.drop(columns=['home_team_win'])
y_test = test_data['home_team_win']

# Build a pipeline: scaling step followed by a class-balanced random forest.
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', forest),
])

# Hyperparameter tuning: the 'model__' prefix routes each setting to the
# pipeline step named 'model'. 3 x 3 x 3 x 3 = 81 candidates.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold CV,
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best model (GridSearchCV refits the winning candidate on all of X_train).
best_model = grid_search.best_estimator_

# Cross-validation performance.
# The grid search already scored the winning candidate on each of its 5 folds,
# so read those per-fold scores from cv_results_ instead of refitting the
# model five more times with cross_val_score.
# NOTE: this is the score the candidate was *selected* on, so it is an
# optimistic estimate; the held-out evaluation below is the unbiased one.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_scores = (
    cv_results.loc[grid_search.best_index_]
    .filter(regex=r'^split\d+_test_score$')
    .to_numpy(dtype=float)
)
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Evaluate on test data
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature Importance: impurity-based importances of the fitted forest,
# paired with the training column names and sorted from most to least.
importances = best_model.named_steps['model'].feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Top 10 Important Features:\n", importance_df.head(10))

Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 