In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# Set random seed for reproducibility
np.random.seed(42)

# Map each season label to the Excel workbook that is loaded for it.
# The label becomes the value of the 'Season' column for every row read
# from that workbook, so it must be a clean 'YYYY-YYYY' string.
# NOTE(review): the nine season labels point at only three distinct
# workbooks (2019-2020, 2020-2021, 2021-2022), so each workbook is loaded
# three times under different labels — confirm the intended file per season.
files = {
    '2014-2015': 'all-euro-data-2019-2020.xlsx',
    '2015-2016': 'all-euro-data-2020-2021.xlsx',
    '2016-2017': 'all-euro-data-2021-2022.xlsx',
    '2017-2018': 'all-euro-data-2019-2020.xlsx',
    '2018-2019': 'all-euro-data-2020-2021.xlsx',
    '2019-2020': 'all-euro-data-2021-2022.xlsx',
    '2020-2021': 'all-euro-data-2019-2020.xlsx',
    '2021-2022': 'all-euro-data-2020-2021.xlsx',
    '2022-2023': 'all-euro-data-2021-2022.xlsx'
}

In [None]:
# Short division code -> full display name.
# The loading cell looks each worksheet name up in this mapping and falls
# back to the raw code when there is no entry.
division_names = dict(
    # England
    E0='English Premier League',
    E1='English Championship',
    E2='English League 1',
    E3='English League 2',
    EC='English Conference',
    # Scotland
    SC0='Scottish Premiership',
    SC1='Scottish Championship',
    SC2='Scottish League 1',
    SC3='Scottish League 2',
    # Germany
    D1='German Bundesliga',
    D2='German 2. Bundesliga',
    # Spain
    SP1='Spanish La Liga',
    SP2='Spanish Segunda Division',
    # Italy
    I1='Italian Serie A',
    I2='Italian Serie B',
    # France
    F1='French Ligue 1',
    F2='French Ligue 2',
    # Other top divisions
    B1='Belgian Pro League',
    N1='Dutch Eredivisie',
    P1='Portuguese Primeira Liga',
    T1='Turkish Super Lig',
    G1='Greek Super League',
)

In [None]:
# Load every worksheet of every workbook and tag its rows with the season
# label and the division's full name (the raw sheet name is kept when it has
# no entry in division_names). sheet_name=None makes read_excel return a
# {sheet name: DataFrame} mapping covering all sheets.
all_data = [
    sheet_frame.assign(
        Season=season,
        Division=division_names.get(sheet_name, sheet_name),
    )
    for season, workbook in files.items()
    for sheet_name, sheet_frame in pd.read_excel(workbook, sheet_name=None).items()
]

# Stack the per-sheet frames into one table with a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)
combined_data.head()


In [None]:
# Replace the team-name columns with integer labels. Home and away sides get
# their own encoder, so each encoder can decode its own column later.
# Caution: the columns are overwritten in place, so running this cell a
# second time would fit the encoders on the integer codes instead of names.
le_home, le_away = LabelEncoder(), LabelEncoder()
for team_column, team_encoder in (('HomeTeam', le_home), ('AwayTeam', le_away)):
    combined_data[team_column] = team_encoder.fit_transform(combined_data[team_column])


In [None]:
# Calculate points
def calculate_points(row):
    """Return the (home, away) points awarded for one match.

    Parameters
    ----------
    row : mapping
        A match record exposing an 'FTR' entry (e.g. a DataFrame row).
        Recognised codes are 'H' (home win), 'A' (away win) and 'D' (draw).

    Returns
    -------
    tuple
        (3, 0) for 'H', (0, 3) for 'A', (1, 1) for 'D'. Any other value,
        including a missing result, yields (nan, nan) so that unplayed or
        malformed rows are not silently counted as draws.
    """
    points_by_result = {'H': (3, 0), 'A': (0, 3), 'D': (1, 1)}
    unknown = (float('nan'), float('nan'))
    return points_by_result.get(row['FTR'], unknown)

# Score every match row; result_type="expand" spreads each returned pair
# over two columns, which are then stored as HomePoints / AwayPoints.
match_points = combined_data.apply(calculate_points, axis=1, result_type="expand")
combined_data[['HomePoints', 'AwayPoints']] = match_points


In [None]:
pip install pandas matplotlib seaborn scikit-learn numpy openpyxl


In [None]:
# Train and evaluate models for each division
#
# For every division that has data, each candidate regressor is scored with
# 5-fold cross-validated R^2 on the division's training split, once for the
# home-points target and once for the away-points target.
#   results[division_name][model_name] -> {'Mean R2 Home': ..., 'Mean R2 Away': ...}
#   best_models[division_name]         -> unfitted estimator with the best home score
#
# NOTE(review): the features include FTHG and FTAG while the targets are
# derived from FTR; if those columns are the final goal counts the targets
# are fully determined by the features — confirm this is intended.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'SVR': SVR()
}

results = {}
best_models = {}
for division_name in division_names.values():
    div_data = combined_data[combined_data['Division'] == division_name]
    if div_data.empty:
        continue  # nothing was loaded for this division

    div_features = div_data[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]
    div_targets = div_data[['HomePoints', 'AwayPoints']]

    # Caution: X_train / y_train are module-level names, so after the loop
    # they hold the split of the LAST division processed only.
    X_train, X_test, y_train, y_test = train_test_split(div_features, div_targets, test_size=0.2, random_state=42)

    div_results = {}
    for name, model in models.items():
        # cross_val_score fits clones internally, so the template estimators
        # in `models` are never fitted here.
        scores_home = cross_val_score(model, X_train, y_train['HomePoints'], scoring='r2', cv=5)
        scores_away = cross_val_score(model, X_train, y_train['AwayPoints'], scoring='r2', cv=5)
        div_results[name] = {
            'Mean R2 Home': scores_home.mean(),
            'Mean R2 Away': scores_away.mean()
        }

    results[division_name] = div_results
    # Select the best model based on Mean R2 Home score for each division
    best_model_name = max(div_results, key=lambda candidate: div_results[candidate]['Mean R2 Home'])
    # Store an independent copy: several divisions can pick the same model
    # type, and fitting one division's entry must not touch another's or the
    # shared template in `models`.
    best_models[division_name] = clone(models[best_model_name])

In [None]:
# Part 1: Generate predicted vs. actual plots for the latest available season
#
# For each division: fit the division's selected model on that division's own
# training split, predict home/away points for every match of the latest
# season, sum them per team and compare with the actual totals (top 8 teams).
latest_season = combined_data['Season'].max()

# Same feature / target columns as the training cell.
feature_columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']
target_columns = ['HomePoints', 'AwayPoints']

for division_name in division_names.values():
    division_data = combined_data[combined_data['Division'] == division_name]
    # .copy() gives an independent frame, so the column assignments below do
    # not write to a slice of combined_data (SettingWithCopyWarning).
    latest_season_data = division_data[division_data['Season'] == latest_season].copy()
    if latest_season_data.empty:
        continue

    best_model = best_models[division_name]

    # Rebuild THIS division's training split with the same parameters and
    # seed as the training cell. The global X_train / y_train left over from
    # that cell only describe the last division it processed.
    # NOTE(review): division_data contains the latest season too, so some of
    # the rows predicted below may also be in the training split.
    division_X_train, division_X_holdout, division_y_train, division_y_holdout = train_test_split(
        division_data[feature_columns], division_data[target_columns],
        test_size=0.2, random_state=42)

    # Predict while the team columns are still label-encoded.
    latest_features = latest_season_data[feature_columns]
    latest_season_data['PredictedHomePoints'] = (
        best_model.fit(division_X_train, division_y_train['HomePoints']).predict(latest_features))
    latest_season_data['PredictedAwayPoints'] = (
        best_model.fit(division_X_train, division_y_train['AwayPoints']).predict(latest_features))

    # Decode team names before grouping so the tables show real names
    latest_season_data['HomeTeam'] = le_home.inverse_transform(latest_season_data['HomeTeam'].astype(int))
    latest_season_data['AwayTeam'] = le_away.inverse_transform(latest_season_data['AwayTeam'].astype(int))

    # Aggregate predictions to get total points per team
    home_points = latest_season_data.groupby('HomeTeam')['PredictedHomePoints'].sum()
    away_points = latest_season_data.groupby('AwayTeam')['PredictedAwayPoints'].sum()
    total_points = home_points.add(away_points, fill_value=0).reset_index()
    total_points.columns = ['Team', 'TotalPredictedPoints']

    # Calculate actual points
    actual_home_points = latest_season_data.groupby('HomeTeam')['HomePoints'].sum()
    actual_away_points = latest_season_data.groupby('AwayTeam')['AwayPoints'].sum()
    actual_total_points = actual_home_points.add(actual_away_points, fill_value=0).reset_index()
    actual_total_points.columns = ['Team', 'TotalActualPoints']

    # Merge predicted and actual points
    comparison = pd.merge(total_points, actual_total_points, on='Team')

    # Sort teams by TotalActualPoints in descending order and select top 8 teams
    comparison_sorted = comparison.sort_values(by='TotalActualPoints', ascending=False).head(8)

    # Plot the comparison with annotations on bars
    fig, ax = plt.subplots(figsize=(12, 10))
    comparison_melted = comparison_sorted.melt(id_vars=['Team'], value_vars=['TotalPredictedPoints', 'TotalActualPoints'], var_name='Type', value_name='Points')
    sns.barplot(x='Points', y='Team', hue='Type', data=comparison_melted, palette='viridis', ax=ax)

    # Annotate bars with values
    for bar in ax.patches:
        bar_length = bar.get_width()
        # Skip patches that are not real bars: non-finite lengths would break
        # the formatting, and zero-thickness patches are placeholders that
        # some seaborn versions appear to add (e.g. for the legend).
        if not np.isfinite(bar_length) or bar.get_height() == 0:
            continue
        ax.text(bar_length + 1,  # just past the end of the bar
                bar.get_y() + bar.get_height() / 2,  # vertical centre of the bar
                f'{bar_length:.0f}',  # rounded (not truncated) to whole points
                ha='center',
                va='center')

    ax.set_title(f'Predicted vs Actual Standings for {latest_season} Season - {division_name}')
    ax.set_xlabel('Total Points')
    ax.set_ylabel('Team')
    ax.legend(title='Type')
    fig.tight_layout()  # Ensures all elements fit within the figure area
    plt.show()
    plt.close(fig)  # one figure per division: release it once shown

    # Print top 8 teams with their actual and predicted points
    print(f"Top 8 teams for {division_name} in {latest_season} season:")
    print(comparison_sorted[['Team', 'TotalPredictedPoints', 'TotalActualPoints']])


In [None]:
# Part 2: Predict the next season's results
#
# NOTE(review): the "next season" table is produced by predicting on the
# latest season's match rows (including their FTHG / FTAG values) and
# relabelling the totals — confirm this is the intended forecasting method.
season_start, season_end = (int(year) for year in latest_season.split('-'))
next_season = f'{season_start + 1}-{season_end + 1}'  # e.g. '2022-2023' -> '2023-2024'

# Same feature / target columns as the training cell.
feature_columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']
target_columns = ['HomePoints', 'AwayPoints']

# Full predicted ranking per division: {division name: DataFrame[Team, TotalPredictedPoints]}
forecast_results = {}

for division_name in division_names.values():
    division_data = combined_data[combined_data['Division'] == division_name]
    # .copy() gives an independent frame, so the column assignments below do
    # not write to a slice of combined_data (SettingWithCopyWarning).
    latest_season_data = division_data[division_data['Season'] == latest_season].copy()
    if latest_season_data.empty:
        continue

    best_model = best_models[division_name]

    # Rebuild THIS division's training split with the same parameters and
    # seed as the training cell. The global X_train / y_train left over from
    # that cell only describe the last division it processed.
    division_X_train, division_X_holdout, division_y_train, division_y_holdout = train_test_split(
        division_data[feature_columns], division_data[target_columns],
        test_size=0.2, random_state=42)

    # Predict while the team columns are still label-encoded.
    latest_features = latest_season_data[feature_columns]
    latest_season_data['PredictedHomePoints'] = (
        best_model.fit(division_X_train, division_y_train['HomePoints']).predict(latest_features))
    latest_season_data['PredictedAwayPoints'] = (
        best_model.fit(division_X_train, division_y_train['AwayPoints']).predict(latest_features))

    # Decode team names before grouping so the tables show real names
    latest_season_data['HomeTeam'] = le_home.inverse_transform(latest_season_data['HomeTeam'].astype(int))
    latest_season_data['AwayTeam'] = le_away.inverse_transform(latest_season_data['AwayTeam'].astype(int))

    # Aggregate predictions to get total points per team
    home_points = latest_season_data.groupby('HomeTeam')['PredictedHomePoints'].sum()
    away_points = latest_season_data.groupby('AwayTeam')['PredictedAwayPoints'].sum()
    total_points = home_points.add(away_points, fill_value=0).reset_index()
    total_points.columns = ['Team', 'TotalPredictedPoints']

    # Rank all teams by predicted points; keep the full ranking for later use
    # and the top 8 for the plot and printout.
    ranked_points = total_points.sort_values(by='TotalPredictedPoints', ascending=False)
    forecast_results[division_name] = ranked_points.reset_index(drop=True)
    total_points_sorted = ranked_points.head(8)

    # Plot the predicted standings with annotations on bars
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.barplot(x='TotalPredictedPoints', y='Team', data=total_points_sorted, palette='viridis', ax=ax)

    # Annotate bars with values
    for bar in ax.patches:
        bar_length = bar.get_width()
        # Skip patches that are not real bars: non-finite lengths would break
        # the formatting, and zero-thickness patches are placeholders that
        # some seaborn versions appear to add.
        if not np.isfinite(bar_length) or bar.get_height() == 0:
            continue
        ax.text(bar_length + 1,  # just past the end of the bar
                bar.get_y() + bar.get_height() / 2,  # vertical centre of the bar
                f'{bar_length:.0f}',  # rounded (not truncated) to whole points
                ha='center',
                va='center')

    ax.set_title(f'Predicted Standings for {next_season} Season - {division_name}')
    ax.set_xlabel('Total Points')
    ax.set_ylabel('Team')
    fig.tight_layout()  # Ensures all elements fit within the figure area
    plt.show()
    plt.close(fig)  # one figure per division: release it once shown

    # Print top 8 teams with their predicted points
    print(f"Top 8 teams for {division_name} predicted for {next_season} season:")
    print(total_points_sorted[['Team', 'TotalPredictedPoints']])

# The predicted points for the next season are now available in `forecast_results`
