In [186]:
#Libraries
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import pandas as pd
import glob
import os
import re



In [3]:
# Specify the folder path where CSV files are stored
folder_path = 'data/'

# Use glob to find all CSV files in the specified folder
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Use a list comprehension to read each CSV file into a DataFrame and ensure 'Date' is string
df_list = []
for file in all_files:
    try:
        # Read each CSV and convert 'Date' to string format
        buli_df = pd.read_csv(file, encoding='ISO-8859-1', dtype={'Date': str})
        df_list.append(buli_df)
    except pd.errors.ParserError as e:
        print(f"ParserError parsing {file}: {e}")
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError in {file}: {e}")

# Concatenate all DataFrames in the list into a single DataFrame
buli_df = pd.concat(df_list, ignore_index=True)

# Standardize and parse the 'Date' column
buli_df['Date'] = buli_df['Date'].str.strip()  # Remove extra whitespace
buli_df['Date'] = buli_df['Date'].replace(r'[/-]', '-', regex=True)  # Replace separators with '-'

# Attempt to parse dates as `dayfirst` and handle both `dd/mm/yyyy` and `dd/mm/yy`
buli_df['Date'] = buli_df['Date'].apply(lambda x: re.sub(r'(\d{2}/\d{2}/)(\d{2})$', r'\120\2', x))
buli_df['Date'] = pd.to_datetime(buli_df['Date'], dayfirst=True, errors='coerce')

# Check for any remaining NaT values in 'Date' after parsing
missing_dates = buli_df[buli_df['Date'].isna()]
if not missing_dates.empty:
    print("Warning: Some dates could not be parsed after concatenation.")
    print(missing_dates)

# Display the combined DataFrame
print("Final combined DataFrame with parsed dates:")
buli_df

Final combined DataFrame with parsed dates:


  buli_df['Date'] = pd.to_datetime(buli_df['Date'], dayfirst=True, errors='coerce')


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BFECAHA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA,Unnamed: 70,Unnamed: 71,Unnamed: 72
0,D1,2015-08-14,Bayern Munich,Hamburg,5,0,H,1,0,H,...,,,,,,,,,,
1,D1,2015-08-15,Augsburg,Hertha,0,1,A,0,0,D,...,,,,,,,,,,
2,D1,2015-08-15,Darmstadt,Hannover,2,2,D,1,0,H,...,,,,,,,,,,
3,D1,2015-08-15,Dortmund,M'gladbach,4,0,H,3,0,H,...,,,,,,,,,,
4,D1,2015-08-15,Leverkusen,Hoffenheim,2,1,H,1,1,D,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5575,D1,2013-05-18,Hamburg,Leverkusen,0,1,A,0,0,D,...,,,,,,,,,,
5576,D1,2013-05-18,Hannover,Fortuna Dusseldorf,3,0,H,1,0,H,...,,,,,,,,,,
5577,D1,2013-05-18,M'gladbach,Bayern Munich,3,4,A,3,2,H,...,,,,,,,,,,
5578,D1,2013-05-18,Nurnberg,Werder Bremen,3,2,H,0,1,A,...,,,,,,,,,,


In [4]:
#formatting the date column to datetime format and sorting by date
#buli_df['Date'] = pd.to_datetime(buli_df['Date'])

buli_df = buli_df.sort_values(['Date']).reset_index(drop=True)
buli_df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BFECAHA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA,Unnamed: 70,Unnamed: 71,Unnamed: 72
0,D1,2006-08-11,Bayern Munich,Dortmund,2,0,H,1,0,H,...,,,,,,,,,,
1,D1,2006-08-12,Leverkusen,Aachen,3,0,H,2,0,H,...,,,,,,,,,,
2,D1,2006-08-12,Mainz,Bochum,2,1,H,1,0,H,...,,,,,,,,,,
3,D1,2006-08-12,M'gladbach,Cottbus,2,0,H,0,0,D,...,,,,,,,,,,
4,D1,2006-08-12,Schalke 04,Ein Frankfurt,1,1,D,1,0,H,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5575,,2024-10-26,St Pauli,Wolfsburg,0,0,D,0,0,D,...,2.08,,,,,,,,,
5576,,2024-10-26,RB Leipzig,Freiburg,3,1,H,0,1,A,...,1.90,,,,,,,,,
5577,,2024-10-27,Heidenheim,Hoffenheim,0,0,D,0,0,D,...,1.93,,,,,,,,,
5578,,2024-10-27,Union Berlin,Ein Frankfurt,1,1,D,0,1,A,...,2.25,,,,,,,,,


In [5]:
#checking for null values
buli_df.isna().sum()

Div              72
Date              0
HomeTeam          0
AwayTeam          0
FTHG              0
               ... 
VCCD           4050
VCCA           4050
Unnamed: 70    5580
Unnamed: 71    5580
Unnamed: 72    5580
Length: 168, dtype: int64

In [6]:
#dropping rows & columns with all null values
buli_df.dropna(axis=1, how='all', inplace=True) #dropped 3 columns
buli_df.dropna(axis=0, how='all',inplace=True) #0 rows dropped

In [7]:
buli_df.isna().sum()

Div           72
Date           0
HomeTeam       0
AwayTeam       0
FTHG           0
            ... 
IWCD        4223
IWCA        4223
VCCH        4050
VCCD        4050
VCCA        4050
Length: 165, dtype: int64

In [8]:
#code for including all games (first games of the season take the last games of the last season as past games)

import pandas as pd

buli_df_red = buli_df[['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']]

df = buli_df_red

# Define stats dictionary with the specified columns
stats = {
    'goals': {'scored': ('FTHG', 'FTAG'), 'conceded': ('FTAG', 'FTHG')},
    'shots': {'taken': ('HS', 'AS'), 'conceded': ('AS', 'HS')},
    'shots_on_target': {'taken': ('HST', 'AST'), 'conceded': ('AST', 'HST')},
    'fouls': {'fouls': ('HF', 'AF'), 'fouled': ('AF', 'HF')},
    'corners': {'taken': ('HC', 'AC'), 'conceded': ('AC', 'HC')},
    'yellow_cards': {'received': ('HY', 'AY'), 'provoked': ('AY', 'HY')},
    'red_cards': {'received': ('HR', 'AR'), 'provoked': ('AR', 'HR')},
}

# Define `npm` for the number of past matches to consider
npm = 7

# Initialize an empty list to accumulate each row's data as a dictionary
rows_list = []

# Iterate through each row to calculate rolling stats based on home and away perspectives
for index, row in df.iterrows():
    team_h = row['HomeTeam']
    team_a = row['AwayTeam']
    date = row['Date']
    
    # Get the past `npm` games for the home team, filtered by games before the current match date
    past_matches_home = df[((df['HomeTeam'] == team_h) | (df['AwayTeam'] == team_h)) & (df['Date'] < date)]
    past_matches_home = past_matches_home.tail(npm)

    # Get the past `npm` games for the away team, filtered by games before the current match date
    past_matches_away = df[((df['HomeTeam'] == team_a) | (df['AwayTeam'] == team_a)) & (df['Date'] < date)]
    past_matches_away = past_matches_away.tail(npm)

    # Initialize a dictionary to store the calculated stats for each row
    row_stats = {
        'Date': date,
        'HomeTeam': team_h,
        'AwayTeam': team_a,
        'FTR': row['FTR'],
        'FTHG': row['FTHG'],
        'FTAG': row['FTAG'],
    }
    
    # Calculate stats for the home team based on whether they played home or away in past matches
    for stat, subcategories in stats.items():
        for subcategory, columns in subcategories.items():
            home_column, away_column = columns
            # Sum the stat when the home team was actually playing at home
            stat_home_as_home = past_matches_home.loc[past_matches_home['HomeTeam'] == team_h, home_column].sum()
            # Sum the stat when the home team was actually playing as the away team
            stat_home_as_away = past_matches_home.loc[past_matches_home['AwayTeam'] == team_h, away_column].sum()
            row_stats[f'p_home_{stat}_{subcategory}_last_{npm}'] = stat_home_as_home + stat_home_as_away
            
    # Calculate stats for the away team based on whether they played home or away in past matches
    for stat, subcategories in stats.items():
        for subcategory, columns in subcategories.items():
            home_column, away_column = columns
            # Sum the stat when the away team was actually playing at home
            stat_away_as_home = past_matches_away.loc[past_matches_away['HomeTeam'] == team_a, home_column].sum()
            # Sum the stat when the away team was actually playing as the away team
            stat_away_as_away = past_matches_away.loc[past_matches_away['AwayTeam'] == team_a, away_column].sum()
            row_stats[f'p_away_{stat}_{subcategory}_last_{npm}'] = stat_away_as_home + stat_away_as_away

    # Calculate points for the home team in the past `npm` games
    points_home = (
        (past_matches_home.loc[past_matches_home['HomeTeam'] == team_h, 'FTR'] == 'H').sum() * 3 +
        (past_matches_home.loc[past_matches_home['AwayTeam'] == team_h, 'FTR'] == 'A').sum() * 3 +
        (past_matches_home['FTR'] == 'D').sum() * 1
    )
    row_stats[f'p_home_points_last_{npm}'] = points_home
    
    # Calculate points for the away team in the past `npm` games
    points_away = (
        (past_matches_away.loc[past_matches_away['HomeTeam'] == team_a, 'FTR'] == 'H').sum() * 3 +
        (past_matches_away.loc[past_matches_away['AwayTeam'] == team_a, 'FTR'] == 'A').sum() * 3 +
        (past_matches_away['FTR'] == 'D').sum() * 1
    )
    row_stats[f'p_away_points_last_{npm}'] = points_away
    
    # Append the dictionary for this row to the list
    rows_list.append(row_stats)

# Convert the list of dictionaries to a DataFrame
rolling_stats = pd.DataFrame(rows_list)

# Display the final DataFrame with only the desired columns
print("Final DataFrame with selected initial columns and past 7 games stats:")
rolling_stats_with_first_games_of_season = rolling_stats

Final DataFrame with selected initial columns and past 7 games stats:


In [9]:
rolling_stats_with_first_games_of_season = rolling_stats

In [10]:
#columns_to_check = ['p_away_goals_conceded_last_7', 'p_away_goals_scored_last_7', 'p_home_corners_conceived_last_7','p_away_corners_conceived_last_7', 'p_home_yellow_cards_received_last_7','p_home_yellow_cards_provoked_last_7','p_away_corners_taken_last_7']
rolling_stats_with_first_games_of_season.loc[5461]

Date                                      2024-04-14 00:00:00
HomeTeam                                            Darmstadt
AwayTeam                                             Freiburg
FTR                                                         A
FTHG                                                        0
FTAG                                                        1
p_home_goals_scored_last_7                                  6
p_home_goals_conceded_last_7                               22
p_home_shots_taken_last_7                                  85
p_home_shots_conceded_last_7                              139
p_home_shots_on_target_taken_last_7                        29
p_home_shots_on_target_conceded_last_7                     44
p_home_fouls_fouls_last_7                                  79
p_home_fouls_fouled_last_7                                 68
p_home_corners_taken_last_7                                26
p_home_corners_conceded_last_7                             41
p_home_y

In [11]:
#code for starting every season only with the 8th games, so that every game of the season that is taken into account has
# 7 past games that were played within the very same season

import pandas as pd
import numpy as np

# Sample data setup (make sure 'Date' column is in datetime format)
df['Date'] = pd.to_datetime(df['Date'])

# Step 1: Identify season breaks by detecting gaps of 2 months or more
df = df.sort_values(by='Date').reset_index(drop=True)
df['Date_Diff'] = df['Date'].diff().dt.days
season_breaks = df[df['Date_Diff'] >= 60].index  # Gaps of 60+ days signify a new season

# Step 2: Assign a season identifier
df['Season'] = 0
current_season = 1
for i in range(len(df)):
    if i in season_breaks:
        current_season += 1
    df.at[i, 'Season'] = current_season

# Drop the Date_Diff column as it's no longer needed
df = df.drop(columns=['Date_Diff'])

# Define `npm` for the number of past matches to consider
npm = 7

# Initialize an empty list to accumulate each row's data as a dictionary
rows_list = []

# Iterate through each row to calculate rolling stats based on home and away perspectives
for index, row in df.iterrows():
    team_h = row['HomeTeam']
    team_a = row['AwayTeam']
    date = row['Date']
    
    # Get the past `npm` games for the home team within the same season
    past_matches_home = df[((df['HomeTeam'] == team_h) | (df['AwayTeam'] == team_h)) & 
                           (df['Date'] < date) & 
                           (df['Season'] == row['Season'])]
    past_matches_home = past_matches_home.tail(npm)

    # Get the past `npm` games for the away team within the same season
    past_matches_away = df[((df['HomeTeam'] == team_a) | (df['AwayTeam'] == team_a)) & 
                           (df['Date'] < date) & 
                           (df['Season'] == row['Season'])]
    past_matches_away = past_matches_away.tail(npm)

    # Initialize a dictionary to store the calculated stats for each row
    row_stats = {
        'Date': date,
        'HomeTeam': team_h,
        'AwayTeam': team_a,
        'FTR': row['FTR'],
        'FTHG': row['FTHG'],  # Include Full Time Home Goals directly
        'FTAG': row['FTAG'],  # Include Full Time Away Goals directly
        'Season': row['Season']
    }
    
    # Calculate stats for the home team based on whether they played home or away in past matches
    for stat, subcategories in stats.items():
        for subcategory, columns in subcategories.items():
            home_column, away_column = columns
            # Sum the stat when the home team was actually playing at home
            stat_home_as_home = past_matches_home.loc[past_matches_home['HomeTeam'] == team_h, home_column].sum()
            # Sum the stat when the home team was actually playing as the away team
            stat_home_as_away = past_matches_home.loc[past_matches_home['AwayTeam'] == team_h, away_column].sum()
            row_stats[f'p_home_{stat}_{subcategory}_last_{npm}'] = stat_home_as_home + stat_home_as_away
            
    # Calculate stats for the away team based on whether they played home or away in past matches
    for stat, subcategories in stats.items():
        for subcategory, columns in subcategories.items():
            home_column, away_column = columns
            # Sum the stat when the away team was actually playing at home
            stat_away_as_home = past_matches_away.loc[past_matches_away['HomeTeam'] == team_a, home_column].sum()
            # Sum the stat when the away team was actually playing as the away team
            stat_away_as_away = past_matches_away.loc[past_matches_away['AwayTeam'] == team_a, away_column].sum()
            row_stats[f'p_away_{stat}_{subcategory}_last_{npm}'] = stat_away_as_home + stat_away_as_away

    # Calculate points for the home team in the past `npm` games
    points_home = (
        (past_matches_home.loc[past_matches_home['HomeTeam'] == team_h, 'FTR'] == 'H').sum() * 3 +
        (past_matches_home.loc[past_matches_home['AwayTeam'] == team_h, 'FTR'] == 'A').sum() * 3 +
        (past_matches_home['FTR'] == 'D').sum() * 1
    )
    row_stats[f'p_home_points_last_{npm}'] = points_home
    
    # Calculate points for the away team in the past `npm` games
    points_away = (
        (past_matches_away.loc[past_matches_away['HomeTeam'] == team_a, 'FTR'] == 'H').sum() * 3 +
        (past_matches_away.loc[past_matches_away['AwayTeam'] == team_a, 'FTR'] == 'A').sum() * 3 +
        (past_matches_away['FTR'] == 'D').sum() * 1
    )
    row_stats[f'p_away_points_last_{npm}'] = points_away
    
    # Append the dictionary for this row to the list
    rows_list.append(row_stats)

# Convert the list of dictionaries to a DataFrame
buli_df = pd.DataFrame(rows_list)

# Filter out the first `npm` games of each season from the final output
buli_df_first_games_skipped = buli_df.groupby('Season').apply(lambda x: x.iloc[npm:]).reset_index(drop=True)

# Display the final DataFrame with selected columns and past 7 games stats
buli_df_first_games_skipped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'])
  buli_df_first_games_skipped = buli_df.groupby('Season').apply(lambda x: x.iloc[npm:]).reset_index(drop=True)


Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Season,p_home_goals_scored_last_7,p_home_goals_conceded_last_7,p_home_shots_taken_last_7,...,p_away_fouls_fouls_last_7,p_away_fouls_fouled_last_7,p_away_corners_taken_last_7,p_away_corners_conceded_last_7,p_away_yellow_cards_received_last_7,p_away_yellow_cards_provoked_last_7,p_away_red_cards_received_last_7,p_away_red_cards_provoked_last_7,p_home_points_last_7,p_away_points_last_7
0,2006-08-13,Wolfsburg,Hertha,D,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2006-08-13,Hannover,Werder Bremen,A,2,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2006-08-18,Nurnberg,M'gladbach,H,1,0,1,3,0,11,...,11,24,3,5,0,4,0,1,3,3
3,2006-08-19,Hertha,Hannover,H,4,0,1,0,0,10,...,18,14,2,11,1,3,0,0,1,0
4,2006-08-19,Aachen,Schalke 04,A,0,1,1,0,3,8,...,17,19,7,5,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5428,2024-10-26,Werder Bremen,Leverkusen,D,2,2,21,12,14,72,...,59,67,61,22,16,13,0,1,11,14
5429,2024-10-26,RB Leipzig,Freiburg,H,3,1,21,11,2,85,...,58,77,38,27,9,11,0,0,17,15
5430,2024-10-27,Union Berlin,Ein Frankfurt,D,1,1,21,8,4,84,...,69,62,32,44,9,9,0,0,14,13
5431,2024-10-27,Heidenheim,Hoffenheim,D,0,0,21,12,11,89,...,74,92,32,42,18,13,1,1,9,7


In [12]:
#seasons dont fit yet, 2 month definition not really working at some points apparently

### Correlation Tests

In [32]:
correlation_matrix = rolling_stats.select_dtypes("number").corr()

filtered_correlation = correlation_matrix.applymap(lambda x: x if abs(x) > 0.25 else None)
filtered_correlation

  filtered_correlation = correlation_matrix.applymap(lambda x: x if abs(x) > 0.25 else None)


Unnamed: 0,FTHG,FTAG,p_home_goals_scored_last_7,p_home_goals_conceded_last_7,p_home_shots_taken_last_7,p_home_shots_conceded_last_7,p_home_shots_on_target_taken_last_7,p_home_shots_on_target_conceded_last_7,p_home_fouls_fouls_last_7,p_home_fouls_fouled_last_7,...,p_away_fouls_fouls_last_7,p_away_fouls_fouled_last_7,p_away_corners_taken_last_7,p_away_corners_conceded_last_7,p_away_yellow_cards_received_last_7,p_away_yellow_cards_provoked_last_7,p_away_red_cards_received_last_7,p_away_red_cards_provoked_last_7,p_home_points_last_7,p_away_points_last_7
FTHG,1.0,,,,,,,,,,...,,,,,,,,,,
FTAG,,1.0,,,,,,,,,...,,,,,,,,,,
p_home_goals_scored_last_7,,,1.0,,0.565705,,0.653538,,,,...,,,,,,,,,0.778772,
p_home_goals_conceded_last_7,,,,1.0,,0.501412,,0.564792,,,...,,,,,,,,,-0.577746,
p_home_shots_taken_last_7,,,0.565705,,1.0,,0.775212,,,,...,,,,,,,,,0.481141,
p_home_shots_conceded_last_7,,,,0.501412,,1.0,,0.740387,0.27314,,...,,,,,,,,,-0.298402,
p_home_shots_on_target_taken_last_7,,,0.653538,,0.775212,,1.0,,,,...,,,,,,,,,0.555587,
p_home_shots_on_target_conceded_last_7,,,,0.564792,,0.740387,,1.0,0.292585,,...,,,,,,,,,-0.368976,
p_home_fouls_fouls_last_7,,,,,,0.27314,,0.292585,1.0,0.710276,...,0.534836,0.547929,,,,,,,,
p_home_fouls_fouled_last_7,,,,,,,,,0.710276,1.0,...,0.54799,0.574253,,,,,,,,


### Engineering "FTR" feature

In [179]:
rolling_stats["FTR_num"] = rolling_stats["FTR"].apply(lambda x: 1 if x == "D" else (2 if x == "H" else 3))

### Train / Test Split

In [314]:
features = rolling_stats.select_dtypes("number").drop(columns=["FTHG", "FTAG"])
target = rolling_stats["FTHG"]

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 0)

### Scaling

In [316]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Descision Tree - Regressor - Home Goals

In [318]:
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train_scaled, y_train)

In [320]:
pred = tree.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_scaled, y_test))

MAE 0.7764994976122089
RMSE 0.989011514556089
R2 score 0.44869297807114694




In [322]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_train_scaled.head()

Unnamed: 0,p_home_goals_scored_last_7,p_home_goals_conceded_last_7,p_home_shots_taken_last_7,p_home_shots_conceded_last_7,p_home_shots_on_target_taken_last_7,p_home_shots_on_target_conceded_last_7,p_home_fouls_fouls_last_7,p_home_fouls_fouled_last_7,p_home_corners_taken_last_7,p_home_corners_conceded_last_7,...,p_away_fouls_fouled_last_7,p_away_corners_taken_last_7,p_away_corners_conceded_last_7,p_away_yellow_cards_received_last_7,p_away_yellow_cards_provoked_last_7,p_away_red_cards_received_last_7,p_away_red_cards_provoked_last_7,p_home_points_last_7,p_away_points_last_7,FTR_num
0,0.103448,0.703704,0.448864,0.723926,0.317073,0.559524,0.505747,0.511111,0.492958,0.8,...,0.413043,0.416667,0.603175,0.40625,0.740741,0.25,0.25,0.0,0.619048,0.0
1,0.448276,0.407407,0.6875,0.570552,0.670732,0.452381,0.574713,0.444444,0.929577,0.338462,...,0.5,0.472222,0.809524,0.3125,0.481481,0.5,0.0,0.619048,0.380952,0.5
2,0.275862,0.555556,0.545455,0.674847,0.353659,0.571429,0.586207,0.388889,0.577465,0.569231,...,0.434783,0.305556,0.603175,0.375,0.296296,0.5,0.25,0.428571,0.285714,1.0
3,0.62069,0.407407,0.636364,0.533742,0.585366,0.309524,0.683908,0.65,0.535211,0.569231,...,0.576087,0.430556,0.888889,0.3125,0.222222,0.0,0.0,0.571429,0.380952,1.0
4,0.586207,0.333333,0.664773,0.404908,0.597561,0.297619,0.563218,0.555556,0.760563,0.430769,...,0.554348,0.541667,0.380952,0.3125,0.296296,0.0,0.0,0.571429,0.666667,1.0


In [136]:
tree_importance = {feature : importance for feature, importance in zip(X_train_scaled.columns, tree.feature_importances_)}
tree_importance           

{'p_home_goals_scored_last_7': 0.45718420650269476,
 'p_home_goals_conceded_last_7': 0.0,
 'p_home_shots_taken_last_7': 0.16927663520882538,
 'p_home_shots_conceded_last_7': 0.0,
 'p_home_shots_on_target_taken_last_7': 0.0,
 'p_home_shots_on_target_conceded_last_7': 0.0,
 'p_home_fouls_fouls_last_7': 0.12083341223941044,
 'p_home_fouls_fouled_last_7': 0.0,
 'p_home_corners_taken_last_7': 0.0,
 'p_home_corners_conceded_last_7': 0.0,
 'p_home_yellow_cards_received_last_7': 0.0,
 'p_home_yellow_cards_provoked_last_7': 0.0,
 'p_home_red_cards_received_last_7': 0.0,
 'p_home_red_cards_provoked_last_7': 0.0,
 'p_away_goals_scored_last_7': 0.04221065883506092,
 'p_away_goals_conceded_last_7': 0.0,
 'p_away_shots_taken_last_7': 0.0,
 'p_away_shots_conceded_last_7': 0.14522020847840136,
 'p_away_shots_on_target_taken_last_7': 0.0,
 'p_away_shots_on_target_conceded_last_7': 0.0,
 'p_away_fouls_fouls_last_7': 0.0,
 'p_away_fouls_fouled_last_7': 0.0,
 'p_away_corners_taken_last_7': 0.0,
 'p_away_c

## Updating Features

In [382]:
# Assuming X_train and X_test are the original DataFrames with column names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Now you can select the specific columns for each
X_train_scaled_new = X_train_scaled[[
    "p_home_goals_scored_last_7", "p_home_shots_taken_last_7", 
    "p_home_fouls_fouls_last_7", "p_away_goals_scored_last_7", 
    "p_away_shots_conceded_last_7", "p_away_points_last_7"
]]

X_test_scaled_new = X_test_scaled[[
    "p_home_goals_scored_last_7", "p_home_shots_taken_last_7", 
    "p_home_fouls_fouls_last_7", "p_away_goals_scored_last_7", 
    "p_away_shots_conceded_last_7", "p_away_points_last_7"
]]

In [384]:
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train_scaled_new, y_train)

In [386]:
pred = tree.predict(X_test_scaled_new)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_scaled_new, y_test))

MAE 0.9183002912282585
RMSE 1.1779747570492076
R2 score 0.0332635622320796




# Descision Tree - Regressor - Away Goals

In [324]:
features = rolling_stats.select_dtypes("number").drop(columns=["FTHG", "FTAG"])
target = rolling_stats["FTAG"]

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 0)

In [326]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [352]:
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train_scaled, y_train)

In [354]:
pred = tree.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_scaled, y_test))

MAE 0.709696468624136
RMSE 0.9044023511929057
R2 score 0.4301514001232617




# KNN - Regressor

In [356]:
knn = KNeighborsRegressor(n_neighbors = 2)
knn.fit(X_train_scaled, y_train)

In [358]:
knn.score(X_test_scaled, y_test)

0.18095877728794896

In [None]:
# Test with updated features

In [378]:
knn = KNeighborsRegressor(n_neighbors = 250)
knn.fit(X_train_scaled_new, y_train)

In [376]:
knn.score(X_test_scaled_new, y_test)

-0.3634039865877434

# KNN - Classifier

In [392]:
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train_cl_scaled, y_train)

In [396]:
knn.score(X_test_cl_scaled, y_test_cl)

0.14623655913978495

# Decision Tree - Classifier

In [194]:
# Train / Test Split

In [204]:
features_cl = rolling_stats.select_dtypes("number").drop(columns=["FTHG", "FTAG", "FTR_num"])
target_cl = rolling_stats["FTR_num"]

X_train_cl, X_test_cl, y_train_cl, y_test_cl = train_test_split(features_cl, target_cl, random_state = 0)

In [244]:
# Scaling

scaler = MinMaxScaler()
scaler.fit(X_train_cl)
X_train_cl_scaled = scaler.transform(X_train_cl)
X_test_cl_scaled = scaler.transform(X_test_cl)


In [268]:
tree = DecisionTreeClassifier(max_depth=71)

In [270]:
tree.fit(X_train_cl_scaled, y_train_cl)

In [280]:
pred = tree.predict(X_test_cl)

print("Accuracy:", accuracy_score(y_test_cl, pred))
print(classification_report(y_test_cl, pred))

Accuracy: 0.45161290322580644
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       358
           2       0.45      1.00      0.62       633
           3       0.00      0.00      0.00       404

    accuracy                           0.45      1395
   macro avg       0.15      0.33      0.21      1395
weighted avg       0.21      0.45      0.28      1395





# Logistic Regression

In [283]:
features_cl = rolling_stats.select_dtypes("number").drop(columns=["FTHG", "FTAG", "FTR_num"])
target_cl = rolling_stats["FTR_num"]

X_train_cl, X_test_cl, y_train_cl, y_test_cl = train_test_split(features_cl, target_cl, random_state = 0)

In [285]:
# Scaling

scaler = MinMaxScaler()
scaler.fit(X_train_cl)
X_train_cl_scaled = scaler.transform(X_train_cl)
X_test_cl_scaled = scaler.transform(X_test_cl)

In [289]:
log_reg = LogisticRegression()

In [291]:
log_reg.fit(X_train_cl_scaled, y_train_cl)

In [293]:
pred = log_reg.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

Accuracy: 0.21577060931899641
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       266
           1       0.40      0.01      0.03       446
           2       0.27      0.75      0.40       352
           3       0.08      0.16      0.11       195
           4       0.00      0.00      0.00        86
           5       0.00      0.00      0.00        37
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         3

    accuracy                           0.22      1395
   macro avg       0.09      0.12      0.07      1395
weighted avg       0.21      0.22      0.12      1395



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
