In [1]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 1: Load the dataset
# The path is relative, so it resolves against the notebook's working directory.
DATASET_CSV = 'understat_per_game.csv'
data = pd.read_csv(DATASET_CSV)

In [3]:
# Step 2: Handle outliers in the target variable ('scored') using the IQR method.
# Rows whose 'scored' value falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed.
Q1 = data['scored'].quantile(0.25)
Q3 = data['scored'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers (both bounds are inclusive).
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignments performed in later cells do not raise
# pandas' SettingWithCopyWarning.
within_bounds = data['scored'].between(lower_bound, upper_bound)
data = data.loc[within_bounds].copy()
print(f"Dataset shape after removing outliers: {data.shape}")

Dataset shape after removing outliers: (24420, 29)


In [4]:
# Step 3: Create target classes for binary classification (under 2.5 goals = 0, over 2.5 goals = 1)
def classify_binary_goals(goals):
    """Map a goal count to a binary class label.

    Args:
        goals: Numeric goal count for one row.

    Returns:
        1 if ``goals`` is strictly greater than 2.5, otherwise 0.
    """
    if goals > 2.5:
        return 1
    return 0

# Label every row by passing its 'scored' value through classify_binary_goals.
binary_labels = data['scored'].apply(classify_binary_goals)
data['scored_binary'] = binary_labels

In [None]:
# Step 4: Drop high-correlation features and non-useful columns.
# 'scored' is dropped too: the target 'scored_binary' was computed from it, so
# leaving it in the feature set would hand the model the answer.
high_corr_features = [
    'pts', 'npxG', 'npxGD', 'xG', 'xG_diff', 'wins',
    'xpts', 'xGA_diff', 'xGA', 'npxGA', 'xpts_diff', 'loses', 'draws',
]
non_feature_columns = ['result', 'date', 'team', 'scored']
data = data.drop(columns=high_corr_features + non_feature_columns)

In [None]:
# Step 5: Feature Engineering
# All derived columns are added in one assign() chain. Each lambda receives the
# frame including the columns created before it, which is why the later ratios
# and differences can reference the earlier derived columns.
# NOTE(review): the "+ 1" in each denominator presumably guards against
# division by zero — confirm against the value ranges of the source columns.
data = data.assign(
    ppda_efficiency=lambda df: df['ppda_att'] / (df['ppda_def'] + 1),
    oppda_efficiency=lambda df: df['oppda_att'] / (df['oppda_def'] + 1),
    relative_ppda_efficiency=lambda df: df['ppda_efficiency'] / (df['oppda_efficiency'] + 1),
    ppda_intensity=lambda df: df['ppda_coef'] * df['ppda_att'],
    oppda_intensity=lambda df: df['oppda_coef'] * df['oppda_att'],
    intensity_diff=lambda df: df['ppda_intensity'] - df['oppda_intensity'],
)

In [None]:
# Apply one-hot encoding to categorical columns ('h_a', 'league').
# drop_first=True omits the first level of each column from the dummy set.
categorical_columns = ['h_a', 'league']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [None]:
# Step 6: Define target and features for binary classification
X = data.drop(columns=['scored_binary'])  # Features
y = data['scored_binary']  # Target variable (binary classes)

# Step 7: Split the data into training and testing sets (80% / 20%).
# stratify=y keeps the class ratio identical in both splits. The classifier is
# configured for an imbalanced target, and cross_val_score with an integer cv
# already uses stratified folds for classifiers, so this keeps the hold-out
# evaluation consistent with the cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 8: Initialize Random Forest Classifier
# Hyper-parameters are collected in one dict so they can be inspected together.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': "balanced",  # Handle class imbalance
}
rf_model = RandomForestClassifier(**rf_params)

# Step 9: Train the model on the training split
rf_model.fit(X_train, y_train)

# Step 10: Predict class labels for the held-out test split
y_pred = rf_model.predict(X_test)

# Step 11: Evaluate the model on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# confusion_matrix orders rows/columns by sorted label value, so class 0
# ('Under 2.5') comes before class 1 ('Over 2.5') on both axes.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['Under 2.5', 'Over 2.5']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Binary Classification - Random Forest)')
plt.show()

# Step 12: Cross-Validation Accuracy
# 5-fold cross-validation on the training split only; the test split is not used here.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print("\nCross-Validation Accuracy:")
print(f"Mean: {cv_mean:.4f}, Std: {cv_std:.4f}")
print(f"Scores: {cv_scores}")

In [None]:
# Step 13: Feature Importance
# Pair every feature column with the importance reported by the fitted forest,
# then rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop 20 Feature Importance based on Random Forest:")
print(feature_importance.head(20))

# Plot Feature Importance (first 20 rows of the descending-sorted table)
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the documented replacement is hue='Feature', legend=False.
# Confirm the installed seaborn version before switching.
top_features = feature_importance.head(20)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Top 20 Feature Importances (Binary Classification - Random Forest)')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()