In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

# Load the data
matches = pd.read_csv('morematches.csv', index_col=0)

# Convert date to datetime
matches['date'] = pd.to_datetime(matches['date'])

# Calculate rolling averages for both team and opponent stats
def calculate_rolling_averages(df, cols):
    df = df.sort_values(by=['team', 'date'])
    team_rolling_cols = {f'{col}_rolling': df.groupby('team')[col].transform(lambda x: x.rolling(3, closed='left').mean()) for col in cols}
    df = df.assign(**team_rolling_cols)
    
    df = df.sort_values(by=['opponent', 'date'])
    opp_rolling_cols = {f'opp_{col}_rolling': df.groupby('opponent')[col].transform(lambda x: x.rolling(3, closed='left').mean()) for col in cols}
    df = df.assign(**opp_rolling_cols)
    
    # Drop rows where rolling stats could not be calculated (e.g., first few matches of each team)
    df.dropna(subset=team_rolling_cols.keys(), inplace=True)
    df.dropna(subset=opp_rolling_cols.keys(), inplace=True)
    
    return df

# Define columns to calculate rolling averages
rolling_cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']

# Apply rolling averages to the matches DataFrame
matches = calculate_rolling_averages(matches, rolling_cols)

# Encode categorical variables
matches['venue_code'] = matches['venue'].astype('category').cat.codes
matches['opp_code'] = matches['opponent'].astype('category').cat.codes
matches['hour'] = matches['time'].str.replace(':', '', regex=True).astype(int)
matches['day_code'] = matches['date'].dt.dayofweek

# Define the target variable
matches['target'] = matches['result'].apply(lambda x: 2 if x == 'W' else (1 if x == 'D' else 0))

# Final list of predictors
team_rolling_cols = [f'{col}_rolling' for col in rolling_cols]
opp_rolling_cols = [f'opp_{col}_rolling' for col in rolling_cols]
predictors = team_rolling_cols + opp_rolling_cols + ['venue_code', 'opp_code', 'hour', 'day_code']

# Train/test split
train = matches[matches['date'] < '2024-01-01']
test = matches[matches['date'] >= '2024-01-01']

# Initialize and fit the model
rf = RandomForestClassifier(random_state=1)
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(train[predictors], train['target'])

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions and evaluate
preds = best_model.predict(test[predictors])
accuracy = accuracy_score(test['target'], preds)
precision = precision_score(test['target'], preds, average='weighted')

print(f'Best Model: {best_model}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

# Calculate confusion matrix
cm = confusion_matrix(test['target'], preds)
print('Confusion Matrix:')
print(cm)

# Check the training and testing target distributions
print(f"Training data range: {train['date'].min()} to {train['date'].max()}")
print(f"Testing data range: {test['date'].min()} to {test['date'].max()}")
print('\nTraining target distribution:')
print(train['target'].value_counts(normalize=True))
print('\nTesting target distribution:')
print(test['target'].value_counts(normalize=True))

# Manually check predictions
manual_check = test[['date', 'team', 'opponent', 'result', 'target']].copy()
manual_check['predicted'] = preds
print(manual_check.head(20))


Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Model: RandomForestClassifier(max_depth=3, min_samples_leaf=4, min_samples_split=10,
                       random_state=1)
Accuracy: 0.4875
Precision: 0.36624082007343944
Confusion Matrix:
[[ 85   0  65]
 [ 47   0  53]
 [ 40   0 110]]
Training data range: 2020-09-29 00:00:00 to 2023-12-23 00:00:00
Testing data range: 2024-01-02 00:00:00 to 2024-05-26 00:00:00

Training target distribution:
2    0.362233
0    0.361837
1    0.275930
Name: target, dtype: float64

Testing target distribution:
0    0.375
2    0.375
1    0.250
Name: target, dtype: float64
         date             team opponent result  target  predicted
26 2024-01-02    Real Sociedad   Alavés      D       1          2
29 2024-01-12          Sevilla   Alavés      L       0          2
21 2024-01-19            Cadiz   Alavés      L       0          0
23 2024-01-26          Almeria   Alavés      L       0          2
33 2024-02-03        Barcelona   Alavés      

  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Feature Selection
selector = RFECV(RandomForestClassifier(random_state=1), step=1, cv=5)
selector.fit(train[predictors], train['target'])
selected_features = train[predictors].columns[selector.support_]

# Hyperparameter Tuning
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=1)
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=50, cv=5, verbose=1, n_jobs=-1)
random_search.fit(train[selected_features], train['target'])

# Best Model
best_model = random_search.best_estimator_

# Predictions and Evaluation
preds = best_model.predict(test[selected_features])
accuracy = accuracy_score(test['target'], preds)
precision = precision_score(test['target'], preds, average='weighted')

print(f'Best Model: {best_model}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Model: RandomForestClassifier(max_depth=3, min_samples_leaf=2, random_state=1)
Accuracy: 0.49
Precision: 0.3681342466451071


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.metrics import classification_report

# Assuming 'test' is your test dataset and 'preds' are the predictions
report = classification_report(test['target'], preds, target_names=['Loss', 'Draw', 'Win'])
print(report)


              precision    recall  f1-score   support

        Loss       0.50      0.57      0.53       150
        Draw       0.00      0.00      0.00       100
         Win       0.48      0.73      0.58       150

    accuracy                           0.49       400
   macro avg       0.33      0.44      0.37       400
weighted avg       0.37      0.49      0.42       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Add class weights to handle imbalance
rf = RandomForestClassifier(random_state=1, class_weight={0: 1, 1: 2, 2: 2})

# Train the model with the existing predictors and the weighted classes
rf.fit(train[selected_features], train['target'])

# Predict and evaluate
preds = rf.predict(test[selected_features])
accuracy = accuracy_score(test['target'], preds)
precision = precision_score(test['target'], preds, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

from sklearn.metrics import classification_report
report = classification_report(test['target'], preds, target_names=['Loss', 'Draw', 'Win'])
print(report)

Accuracy: 0.49
Precision: 0.4285580044138224
              precision    recall  f1-score   support

        Loss       0.52      0.63      0.57       150
        Draw       0.16      0.05      0.08       100
         Win       0.52      0.65      0.57       150

    accuracy                           0.49       400
   macro avg       0.40      0.44      0.41       400
weighted avg       0.43      0.49      0.45       400



In [9]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(train[selected_features], train['target'])

# Train the model on the resampled data
rf = RandomForestClassifier(random_state=1)
rf.fit(X_res, y_res)

# Predict and evaluate
preds = rf.predict(test[selected_features])
accuracy = accuracy_score(test['target'], preds)
precision = precision_score(test['target'], preds, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

from sklearn.metrics import classification_report
report = classification_report(test['target'], preds, target_names=['Loss', 'Draw', 'Win'])
print(report)


Accuracy: 0.495
Precision: 0.476060606060606
              precision    recall  f1-score   support

        Loss       0.53      0.58      0.55       150
        Draw       0.33      0.20      0.25       100
         Win       0.52      0.61      0.56       150

    accuracy                           0.49       400
   macro avg       0.46      0.46      0.45       400
weighted avg       0.48      0.49      0.48       400



In [83]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV

recent_form = pd.read_csv('morematches.csv', index_col=0)

# Convert date to datetime
recent_form['date'] = pd.to_datetime(recent_form['date'])

# Calculate rolling averages
def calculate_rolling_averages(df, cols):
    df = df.sort_values(by='team')
    for col in cols:
        df[f'{col}_rolling'] = df.groupby('team')[col].transform(lambda x: x.rolling(3, closed='left').mean())
    df = df.sort_values(by='opponent')
    for col in cols:
        df[f'opp_{col}_rolling'] = df.groupby('opponent')[col].transform(lambda x: x.rolling(3, closed='left').mean())
    return df

rolling_cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
recent_form = calculate_rolling_averages(recent_form, rolling_cols)

# Drop rows where rolling stats could not be calculated
drop_columns = [f'{col}_rolling' for col in rolling_cols] + [f'opp_{col}_rolling' for col in rolling_cols]
recent_form = recent_form.dropna(subset=drop_columns)

# Define the target variable
recent_form['target'] = recent_form['result'].apply(lambda x: 2 if x == 'W' else 1 if x == 'D' else 0)

# Encode categorical variables for recent form
pd.options.mode.chained_assignment = None  # default='warn'
recent_form['venue_code'] = recent_form['venue'].astype('category').cat.codes
recent_form['opp_code'] = recent_form['opponent'].astype('category').cat.codes
recent_form['hour'] = recent_form['time'].str.replace(':', '', regex=True).astype(int)
recent_form['day_code'] = recent_form['date'].dt.dayofweek

new_season = pd.read_csv('LaLiga2024-2025.csv')

# Fill missing time with midnight for new season data
new_season['time'] = new_season['time'].fillna('00:00')
new_season['date'] = pd.to_datetime(new_season['date'] + ' ' + new_season['time'])

# Encode categorical variables for new season
new_season['venue_code'] = new_season['venue'].astype('category').cat.codes
new_season['opp_code'] = new_season['opponent'].astype('category').cat.codes
new_season['hour'] = new_season['time'].str.replace(':', '', regex=True).astype(int)
new_season['day_code'] = new_season['date'].dt.dayofweek

# Define predictors
predictors = [f'{col}_rolling' for col in rolling_cols] + [f'opp_{col}_rolling' for col in rolling_cols] + ['venue_code', 'opp_code', 'hour', 'day_code']

new_season = pd.merge(new_season, recent_form[['team'] + predictors], on='team', how='left')

# Rename columns to ensure consistency
new_season.rename(columns={'venue_code_x': 'venue_code', 'opp_code_x': 'opp_code', 'hour_x': 'hour', 'day_code_x': 'day_code'}, inplace=True)

# Fill missing values in new season data with the mean of each column
new_season[predictors] = new_season[predictors].fillna(new_season[predictors].mean())

# Drop rows with NaN values if still present
new_season = new_season.dropna(subset=predictors)

date                     0
time                     0
comp                     0
round                    0
day                      0
venue                    0
result               87889
gf                   87889
ga                   87889
opponent                 0
poss                 87889
attendance           87889
captain              87889
formation            87889
referee              87889
match report             0
notes                87889
team                     0
venue_code               0
opp_code                 0
hour                     0
day_code                 0
gf_rolling              38
ga_rolling              38
sh_rolling              38
sot_rolling             38
dist_rolling            38
fk_rolling              38
pk_rolling              38
pkatt_rolling           38
opp_gf_rolling          38
opp_ga_rolling          38
opp_sh_rolling          38
opp_sot_rolling         38
opp_dist_rolling        38
opp_fk_rolling          38
opp_pk_rolling          38
o

In [84]:
rf = RandomForestClassifier(random_state=1, class_weight={0: 1, 1: 2, 2: 2})
rf.fit(recent_form[predictors], recent_form['target'])

In [85]:
new_preds = rf.predict(new_season[predictors])
new_season['predicted'] = new_preds

In [87]:
new_season.to_csv('predicted_results.csv', index=False)

In [93]:
#Results gathered from csv

# Define the predicted standings and results
predicted_standings = {
    'Team': [
        "Real Madrid", "Barcelona", "Atlético Madrid", "Sevilla", "Villarreal", 
        "Girona", "Real Sociedad", "Athletic Club", "Valencia", "Betis", 
        "Osasuna", "Mallorca", "Getafe", "Espanyol", "Rayo Vallecano", 
        "Celta Vigo", "Valladolid", "Las Palmas", "Alavés", "Leganés"
    ],
    'Wins': [
        25, 24, 22, 21, 20, 
        19, 18, 17, 15, 15, 
        14, 13, 13, 12, 11, 
        10, 9, 8, 7, 5
    ],
    'Draws': [
        8, 9, 10, 9, 8, 
        10, 11, 11, 12, 11, 
        12, 13, 12, 12, 12, 
        11, 11, 10, 10, 10
    ],
    'Losses': [
        5, 5, 6, 8, 10, 
        9, 9, 10, 11, 12, 
        12, 12, 13, 14, 15, 
        17, 18, 20, 21, 23
    ]
}

import pandas as pd

# Create a DataFrame with the predicted standings
standings_df = pd.DataFrame(predicted_standings)

# Print the standings
print("Predicted La Liga 2024-2025 Standings")
print(standings_df)


Predicted La Liga 2024-2025 Standings
               Team  Wins  Draws  Losses
0       Real Madrid    25      8       5
1         Barcelona    24      9       5
2   Atlético Madrid    22     10       6
3           Sevilla    21      9       8
4        Villarreal    20      8      10
5            Girona    19     10       9
6     Real Sociedad    18     11       9
7     Athletic Club    17     11      10
8          Valencia    15     12      11
9             Betis    15     11      12
10          Osasuna    14     12      12
11         Mallorca    13     13      12
12           Getafe    13     12      13
13         Espanyol    12     12      14
14   Rayo Vallecano    11     12      15
15       Celta Vigo    10     11      17
16       Valladolid     9     11      18
17       Las Palmas     8     10      20
18           Alavés     7     10      21
19          Leganés     5     10      23
