In [1]:
import pandas as pd

df = pd.read_csv("matches_clean.csv")

df["date"] = pd.to_datetime(df["date"], errors="coerce", dayfirst=True)

df = df.dropna(subset=["date", "home_team", "away_team"])

# Create / overwrite these columns
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.weekday

df[['home_goals', 'away_goals']] = df['score'].str.split(':', expand=True).astype(float)

df['result'] = df.apply(
    lambda row: "1" if row['home_goals'] > row['away_goals']
    else ("2" if row['away_goals'] > row['home_goals'] else "X"),
    axis=1
)

# Encode categorical results
result_map = {"1": 0, "X": 1, "2": 2}
df["target"] = df["result"].map(result_map)

print(df[["home_goals", "away_goals", "result", "target"]].head())
print(df["target"].value_counts())



   home_goals  away_goals result  target
0         2.0         3.0      2       2
1         3.0         2.0      1       0
2         5.0         0.0      1       0
3         1.0         1.0      X       1
4         0.0         2.0      2       2
target
0    10219
1     5315
2     4152
Name: count, dtype: int64


In [2]:
#just checking for division 1 games in a specific timeframe 
df = df[df["division"] == 1]
df = df[df["season"] >= "1990-1991"]


In [9]:
def rolling_team_stats(df, team_col, goals_for_col, goals_against_col, window=5):
    stats = []
    team_histories = {}

    for _, row in df.iterrows():
        team = row[team_col]
        gf, ga = row[goals_for_col], row[goals_against_col]

        if team not in team_histories:
            team_histories[team] = {'gf': [], 'ga': []}

        past = team_histories[team]
        if len(past['gf']) < window:
            avg_gf = sum(past['gf']) / len(past['gf']) if past['gf'] else None
            avg_ga = sum(past['ga']) / len(past['ga']) if past['ga'] else None
        else:
            avg_gf = sum(past['gf'][-window:]) / window
            avg_ga = sum(past['ga'][-window:]) / window

        stats.append((avg_gf, avg_ga))

        # Append current game AFTER computing averages (so it’s truly "past" stats)
        past['gf'].append(gf)
        past['ga'].append(ga)

    return pd.DataFrame(stats, columns=[f"{team_col}_avg_goals_for", f"{team_col}_avg_goals_against"])


# --- Apply to your dataset ---
# Sort by date so history builds correctly
df = df.sort_values("date").reset_index(drop=True)

# Compute rolling stats for both home and away teams
home_stats = rolling_team_stats(df, "home_team", "home_goals", "away_goals")
away_stats = rolling_team_stats(df, "away_team", "away_goals", "home_goals")

# Combine into the dataframe
df = pd.concat([df, home_stats, away_stats], axis=1)

# Replace missing early values with dataset mean or 1.5 as fallback
df = df.fillna(1.5)
df = df.loc[:, ~df.columns.duplicated()]

# Compute combined features
df["avg_goals_for_combined"] = (
    df["home_team_avg_goals_for"] + df["away_team_avg_goals_for"]
) / 2
df["avg_goals_against_combined"] = (
    df["home_team_avg_goals_against"] + df["away_team_avg_goals_against"]
) / 2

print(df.head(20))


       season  division  matchday       date        home_team  \
0   1990-1991         1         1 1990-09-01         Espanyol   
1   1990-1991         1         1 1990-09-02      CD Logroñés   
2   1990-1991         1         1 1990-09-02      Real Madrid   
3   1990-1991         1         1 1990-09-02  Real Valladolid   
4   1990-1991         1         1 1990-09-02       Real Betis   
5   1990-1991         1         1 1990-09-02    Real Sociedad   
6   1990-1991         1         1 1990-09-02      Real Oviedo   
7   1990-1991         1         1 1990-09-02         Valencia   
8   1990-1991         1         1 1990-09-02      CD Tenerife   
9   1990-1991         1         1 1990-09-02        Burgos CF   
10  1990-1991         1         2 1990-09-08       Sevilla FC   
11  1990-1991         1         2 1990-09-09     CD Castellón   
12  1990-1991         1         2 1990-09-09  Atlético Madrid   
13  1990-1991         1         2 1990-09-09     RCD Mallorca   
14  1990-1991         1  

In [4]:

df = df.sort_values("date")
home_stats = rolling_team_stats(df, "home_team", "home_goals", "away_goals")
away_stats = rolling_team_stats(df, "away_team", "away_goals", "home_goals")

df = pd.concat([df, home_stats, away_stats], axis=1)


In [5]:


df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4805
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   season                       4806 non-null   object        
 1   division                     4806 non-null   int64         
 2   matchday                     4806 non-null   int64         
 3   date                         4806 non-null   datetime64[ns]
 4   home_team                    4806 non-null   object        
 5   away_team                    4806 non-null   object        
 6   score                        4806 non-null   object        
 7   year                         4806 non-null   int32         
 8   month                        4806 non-null   int32         
 9   weekday                      4806 non-null   int32         
 10  home_goals                   4806 non-null   float64       
 11  away_goals                   4806 non-null   flo

In [6]:
df.to_csv("matches_features.csv", index=False)


In [7]:
print(df[[
    "date", "home_team", "away_team", "home_goals", "away_goals",
    "home_team_avg_goals_for", "home_team_avg_goals_against",
    "away_team_avg_goals_for", "away_team_avg_goals_against",
    "avg_goals_for_combined", "avg_goals_against_combined"
]].head(15))

         date        home_team        away_team  home_goals  away_goals  \
0  1990-09-01         Espanyol        Barcelona         0.0         1.0   
1  1990-09-02      CD Logroñés     RCD Mallorca         1.0         2.0   
2  1990-09-02      Real Madrid     CD Castellón         1.0         0.0   
3  1990-09-02  Real Valladolid       CA Osasuna         1.0         1.0   
4  1990-09-02       Real Betis   Sporting Gijón         2.0         2.0   
5  1990-09-02    Real Sociedad    Real Zaragoza         1.0         0.0   
6  1990-09-02      Real Oviedo       Sevilla FC         0.0         0.0   
7  1990-09-02         Valencia  Atlético Madrid         1.0         1.0   
8  1990-09-02      CD Tenerife         Athletic         1.0         0.0   
9  1990-09-02        Burgos CF         Cádiz CF         1.0         0.0   
10 1990-09-08       Sevilla FC      Real Madrid         2.0         0.0   
19 1990-09-09   Sporting Gijón  Real Valladolid         4.0         0.0   
17 1990-09-09  Atlético M