# Football Match Outcome Prediction - Model Training

Before running this notebook, make sure the following dependencies are installed:

In [None]:
# NOTE(review): the packages below are installed without version pins, so a fresh
# install may pull newer releases than the ones that produced the saved output
# further down in this notebook; consider pinning them for reproducibility.
%pip install pandas scikit-learn xgboost joblib


Run the cell below to train an XGBoost classifier that predicts the full-time result of a football match (home win, draw, or away win) from historical match data.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from os import path
import warnings

# Silence UserWarnings raised from the xgboost package so they do not flood the cell output.
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")

# ---------------------------------------------------------------------------
# Data gathering
# ---------------------------------------------------------------------------
# One folder per league; each is expected to contain data/season-XXYY_csv.csv files.
data_folders = [
    'english-premier-league_zip',
    'spanish-la-liga_zip',
    'french-ligue-1_zip',
    'german-bundesliga_zip',
    'italian-serie-a_zip'
]
# Two-digit start years of the first and last season (both inclusive):
# (9, 18) covers the files season-0910 through season-1819.
season_range = (9, 18)

# Candidate CSV paths for every league/season combination.
data_files = [
    f'data/{folder}/data/season-{season:02d}{season + 1:02d}_csv.csv'
    for folder in data_folders
    for season in range(season_range[0], season_range[1] + 1)
]

# Collect data from available files; seasons that are missing on disk are skipped.
data_frames = [pd.read_csv(data_file) for data_file in data_files if path.exists(data_file)]

if not data_frames:
    # Raise instead of calling exit(): an exception reliably stops the cell and
    # surfaces the message, whereas exit() inside a notebook acts on the kernel.
    raise FileNotFoundError(
        "No data files found. Please ensure data files are in the correct path."
    )

# Combine all data into one frame with a fresh 0..n-1 index.
data = pd.concat(data_frames, ignore_index=True)
print("Data loaded successfully.")

# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
# NOTE(review): apart from the two team codes, these columns look like in-match
# statistics (goals at half-time, shots, red cards) — confirm that the model is
# not expected to predict results before kick-off.
input_features = ['home_encoded', 'away_encoded', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HR', 'AR']
output_feature = 'FTR'  # Full-Time Result: 'H', 'A', 'D'

# Drop incomplete rows BEFORE encoding. Encoding first would turn a missing team
# name or result into a label of its own (or fail inside the encoder), and the
# later dropna could no longer see it because the encoded column is numeric.
raw_feature_columns = [
    column for column in input_features if column not in ('home_encoded', 'away_encoded')
]
required_columns = ['HomeTeam', 'AwayTeam', output_feature] + raw_feature_columns
data = data.dropna(subset=required_columns)

# Encode team names with ONE encoder fitted on every team seen in either column,
# so a club always maps to the same integer whether it plays at home or away.
encoder = LabelEncoder()
encoder.fit(pd.concat([data['HomeTeam'], data['AwayTeam']], ignore_index=True))
team_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
# Both names are kept for backward compatibility; they now hold the same mapping.
home_mapping = dict(team_mapping)
away_mapping = dict(team_mapping)

# Encode target variable (fitted after the dropna, so only real results become classes).
target_encoder = LabelEncoder()

# assign() returns a new frame, which avoids in-place writes on a filtered frame.
data = data.assign(
    home_encoded=encoder.transform(data['HomeTeam']),
    away_encoded=encoder.transform(data['AwayTeam']),
    FTR_encoded=target_encoder.fit_transform(data[output_feature]),
)

# Filter relevant columns; no missing values can remain at this point.
data = data[input_features + ['FTR_encoded']]
assert not data.isna().any().any(), "unexpected missing values after preprocessing"

# ---------------------------------------------------------------------------
# Split data
# ---------------------------------------------------------------------------
X = data[input_features]
y = data['FTR_encoded']

# 80/20 split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Build and train the XGBoost model
# ---------------------------------------------------------------------------
# NOTE(review): use_label_encoder is presumably only meaningful on older xgboost
# releases — confirm against the installed version before removing it.
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Evaluate the model
# ---------------------------------------------------------------------------
y_train_pred = xgb_classifier.predict(X_train)
y_test_pred = xgb_classifier.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")
print("\nClassification Report on Test Data:")
# target_names maps the integer classes back to the original result labels.
print(classification_report(y_test, y_test_pred, target_names=target_encoder.classes_))

# ---------------------------------------------------------------------------
# Save the model (optional)
# ---------------------------------------------------------------------------
# Only the classifier is written to disk; the fitted encoders are not persisted here.
from joblib import dump
dump(xgb_classifier, 'xgb_classifier.model')
print("XGBoost Model saved successfully.")


Data loaded successfully.
Training Accuracy: 0.82
Testing Accuracy: 0.65

Classification Report on Test Data:
              precision    recall  f1-score   support

           A       0.65      0.69      0.67      1020
           D       0.45      0.30      0.36       916
           H       0.71      0.82      0.76      1640

    accuracy                           0.65      3576
   macro avg       0.60      0.60      0.60      3576
weighted avg       0.63      0.65      0.63      3576

XGBoost Model saved successfully.
