In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Columns a match row must have populated before it can be used.
_REQUIRED_COLUMNS = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']

# Output schema, in column order (also used when there is nothing to return).
_FEATURE_COLUMNS = [
    'Date', 'HomeTeam', 'AwayTeam', 'Result',
    'Home_Rk', 'Home_Pts_MP', 'Home_GD', 'Home_H_Pts_MP', 'Home_Gls', 'Home_GA',
    'Away_Rk', 'Away_Pts_MP', 'Away_GD', 'Away_A_Pts_MP', 'Away_Gls', 'Away_GA',
]

# Running per-team counters tracked within one season.
_STAT_KEYS = ('MP', 'Pts', 'Gls', 'GA', 'H_MP', 'H_Pts', 'A_MP', 'A_Pts')


def _match_points(result, side):
    """Return the points earned by `side` ('H' or 'A') for full-time result `result`."""
    if result == side:
        return 3
    return 1 if result == 'D' else 0


def _per_match(total, matches):
    """Return total / matches, or 0 when no matches have been played yet."""
    return total / matches if matches > 0 else 0


def _current_ranks(team_stats):
    """Map each team to its 1-based table position (points, then goal difference, then goals).

    `sorted` is stable, so fully tied teams keep the insertion order of `team_stats`.
    """
    ordered = sorted(
        team_stats,
        key=lambda team: (
            team_stats[team]['Pts'],
            team_stats[team]['Gls'] - team_stats[team]['GA'],
            team_stats[team]['Gls'],
        ),
        reverse=True,
    )
    return {team: position for position, team in enumerate(ordered, start=1)}


def _season_features(season_df, min_matches):
    """Walk one season in row order and return a list of pre-match feature dicts."""
    teams = pd.concat([season_df['HomeTeam'], season_df['AwayTeam']]).unique()
    team_stats = {team: dict.fromkeys(_STAT_KEYS, 0) for team in teams}

    rows = []
    # Restrict to the needed columns so itertuples yields clean attribute names.
    for match in season_df[_REQUIRED_COLUMNS].itertuples(index=False):
        home, away = match.HomeTeam, match.AwayTeam
        h_stat, a_stat = team_stats[home], team_stats[away]

        # Features describe the state BEFORE this match; early-season rows are
        # skipped because ranks and points-per-match are volatile at that stage.
        if h_stat['MP'] >= min_matches and a_stat['MP'] >= min_matches:
            ranks = _current_ranks(team_stats)
            rows.append({
                'Date': match.Date,
                'HomeTeam': home,
                'AwayTeam': away,
                # Target variable: H, D or A
                'Result': match.FTR,

                # Home team pre-match features
                'Home_Rk': ranks[home],
                'Home_Pts_MP': _per_match(h_stat['Pts'], h_stat['MP']),
                'Home_GD': h_stat['Gls'] - h_stat['GA'],
                'Home_H_Pts_MP': _per_match(h_stat['H_Pts'], h_stat['H_MP']),
                'Home_Gls': h_stat['Gls'],
                'Home_GA': h_stat['GA'],

                # Away team pre-match features
                'Away_Rk': ranks[away],
                'Away_Pts_MP': _per_match(a_stat['Pts'], a_stat['MP']),
                'Away_GD': a_stat['Gls'] - a_stat['GA'],
                'Away_A_Pts_MP': _per_match(a_stat['A_Pts'], a_stat['A_MP']),
                'Away_Gls': a_stat['Gls'],
                'Away_GA': a_stat['GA'],
            })

        # Update the running stats only after the pre-match snapshot was taken.
        # Goal columns may load as floats when the file contains blank cells.
        home_goals, away_goals = int(match.FTHG), int(match.FTAG)
        home_pts = _match_points(match.FTR, 'H')
        away_pts = _match_points(match.FTR, 'A')

        h_stat['MP'] += 1
        h_stat['Pts'] += home_pts
        h_stat['Gls'] += home_goals
        h_stat['GA'] += away_goals
        h_stat['H_MP'] += 1
        h_stat['H_Pts'] += home_pts

        a_stat['MP'] += 1
        a_stat['Pts'] += away_pts
        a_stat['Gls'] += away_goals
        a_stat['GA'] += home_goals
        a_stat['A_MP'] += 1
        a_stat['A_Pts'] += away_pts

    return rows


def process_ml_data(file_paths, min_matches=3):
    """Build a pre-match feature table from one results CSV per season.

    Each file is processed independently (running stats restart for every
    file). For every match, the features are the home and away teams' table
    rank, points per match, goal difference, venue-specific points per match,
    goals scored and goals conceded, all computed from matches that precede it
    in the (date-sorted) file.

    Parameters
    ----------
    file_paths : iterable of str
        CSV files containing at least the columns Date, HomeTeam, AwayTeam,
        FTHG, FTAG and FTR. Dates are parsed day-first.
    min_matches : int, default 3
        A match is emitted only when both teams have already played at least
        this many matches in the season.

    Returns
    -------
    pandas.DataFrame
        One row per retained match with the columns listed in
        ``_FEATURE_COLUMNS``; empty (but with those columns) when nothing
        qualifies.
    """
    records = []

    for path in file_paths:
        season = pd.read_csv(path)
        season['Date'] = pd.to_datetime(season['Date'], dayfirst=True)
        season = (
            season
            # Blank/incomplete rows would otherwise become a phantom NaN "team"
            # that takes part in the ranking.
            .dropna(subset=_REQUIRED_COLUMNS)
            # Stable sort: same-day fixtures keep their order from the file, so
            # the standings snapshot is deterministic.
            # NOTE(review): presumably the file lists same-day matches in
            # kickoff order - confirm against the data source.
            .sort_values('Date', kind='mergesort')
        )
        records.extend(_season_features(season, min_matches))

    return pd.DataFrame(records, columns=_FEATURE_COLUMNS)

# Season result files, in processing order: the past three seasons, then the current one
csv_files = [
    'SP1 (3).csv',
    'SP1 (2).csv',
    'SP1 (1).csv',
    'SP1.csv',
]

# Build the ML training table from every listed season
train_df = process_ml_data(file_paths=csv_files)

# Leave the frame as the cell's last expression so the notebook renders it
train_df

Unnamed: 0,Date,HomeTeam,AwayTeam,Result,Home_Rk,Home_Pts_MP,Home_GD,Home_H_Pts_MP,Home_Gls,Home_GA,Away_Rk,Away_Pts_MP,Away_GD,Away_A_Pts_MP,Away_Gls,Away_GA
0,2022-09-02,Celta,Cadiz,H,12,1.333333,-2,0.5,4,6,20,0.000000,-7,0.000000,0,7
1,2022-09-03,Sevilla,Barcelona,A,15,0.333333,-2,1.0,3,5,3,2.333333,7,3.000000,8,1
2,2022-09-03,Sociedad,Ath Madrid,D,9,2.000000,-1,0.0,3,4,8,2.000000,2,3.000000,4,2
3,2022-09-03,Real Madrid,Betis,H,2,3.000000,6,0.0,9,3,3,3.000000,5,3.000000,6,1
4,2022-09-03,Mallorca,Girona,D,10,1.333333,1,0.0,3,2,13,1.000000,0,0.000000,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,2026-02-01,Getafe,Celta,D,17,1.047619,-11,1.1,16,27,7,1.523810,6,1.800000,29,23
1233,2026-02-01,Ath Bilbao,Sociedad,D,14,1.142857,-10,1.6,20,30,8,1.285714,0,1.000000,29,29
1234,2026-02-01,Real Madrid,Vallecano,H,2,2.428571,28,2.7,45,17,17,1.047619,-11,0.909091,17,28
1235,2026-02-01,Betis,Valencia,H,7,1.523810,7,1.9,34,27,15,1.095238,-11,0.600000,22,33


In [19]:
# Persist the training table to 'train.csv' without the row index;
# a later cell reads this file back with pd.read_csv.
train_df.to_csv('train.csv', index=False)

In [None]:
# Load the engineered pre-match features saved to disk
df = pd.read_csv('train.csv')
sns.set_theme(style="whitegrid")

# 1. Distribution of Match Results
result_order = ['H', 'D', 'A']
fig, ax = plt.subplots()
# Assign the x variable to hue as well: passing `palette` without `hue` is
# deprecated in recent seaborn releases. dodge=False keeps full-width bars.
sns.countplot(
    data=df, x='Result', order=result_order,
    hue='Result', hue_order=result_order,
    palette='viridis', dodge=False, ax=ax,
)
# The hue legend only repeats the x tick labels, so drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Distribution of Match Results (H=Home, D=Draw, A=Away)')
ax.set_xlabel('Result')
ax.set_ylabel('Matches')
fig.savefig('match_results_dist.png')

# 2. Distribution of season-to-date goals
# Home_Gls / Away_Gls hold each team's running goals-scored total before the
# match, not the goals scored in the match itself, so label them accordingly.
fig, ax = plt.subplots()
# Shared bin edges make the two overlaid histograms directly comparable.
goal_bins = np.histogram_bin_edges(
    df[['Home_Gls', 'Away_Gls']].to_numpy().ravel(), bins=20)
ax.hist(df['Home_Gls'], bins=goal_bins, alpha=0.5,
        label='Home team goals to date', color='blue')
ax.hist(df['Away_Gls'], bins=goal_bins, alpha=0.5,
        label='Away team goals to date', color='red')
ax.set_title('Distribution of Season-to-Date Goals Scored (Home vs Away Team)')
ax.set_xlabel('Goals scored before the match')
ax.set_ylabel('Matches')
ax.legend()
fig.savefig('goals_distribution.png')

# 3. Correlation Heatmap
fig, ax = plt.subplots(figsize=(12, 8))
# 'number' selects every numeric dtype rather than only float64/int64.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
fig.savefig('correlation_heatmap.png')

# 4. Home Rank vs Away Rank Scatter
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Home_Rk', y='Away_Rk', hue='Result', alpha=0.6, ax=ax)
ax.set_title('Home Rank vs Away Rank')
ax.set_xlabel('Home team rank before the match')
ax.set_ylabel('Away team rank before the match')
fig.savefig('rank_vs_rank_scatter.png')