# Engineering features

The purpose of this notebook is to generate derived information from the *Matches* dataset :
- we are adding derived features to the current dataset.
- we are creating a *Competitions* (or divisions) dataset.
- we are creating a *Teams* (or divisions) dataset.


In [1]:
import pandas as pd

## Derived features for *Matches* dataset

In [3]:
df_matches = pd.read_csv("../DATASETS/Matches.csv", dtype={2: str}, parse_dates=["MatchDate"])
df_matches.head(5)

Unnamed: 0,Division,MatchDate,MatchTime,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,...,MaxHome,MaxDraw,MaxAway,Over25,Under25,MaxOver25,MaxUnder25,HandiSize,HandiHome,HandiAway
0,F1,2000-07-28,,Marseille,Troyes,1686.34,1586.57,0.0,0.0,0.0,...,,,,,,,,,,
1,F1,2000-07-28,,Paris SG,Strasbourg,1714.89,1642.51,0.0,0.0,0.0,...,,,,,,,,,,
2,F2,2000-07-28,,Wasquehal,Nancy,1465.08,1633.8,0.0,0.0,0.0,...,,,,,,,,,,
3,F2,2000-07-29,,Ajaccio,Le Mans,1470.87,1477.89,0.0,0.0,0.0,...,,,,,,,,,,
4,F2,2000-07-29,,Beauvais,Montpellier,1422.21,1606.0,0.0,0.0,0.0,...,,,,,,,,,,


In [4]:
df_matches["Division"].unique()

array(['F1', 'F2', 'D1', 'D2', 'T1', 'B1', 'E2', 'E1', 'N1', 'P1', 'E0',
       'I2', 'SP2', 'SP1', 'I1', 'E3', 'SC0', 'SC1', 'SC2', 'SC3', 'G1',
       'EC', 'USA', 'SWE', 'NOR', 'IRL', 'BRA', 'ARG', 'MEX', 'JAP',
       'RUS', 'POL', 'DEN', 'ROM', 'AUT', 'SUI', 'FIN', 'CHN'],
      dtype=object)

In [5]:
df_league_names = pd.read_csv("./leagues_mapping.csv", sep=";")
df_league_names

Unnamed: 0,Source name,Real name,Countries
0,F1,Ligue 1,France
1,D1,Bundesliga,Germany
2,T1,Super Lig,Turkey
3,B1,Jupiler Pro League,Belgium
4,E0,Premier League,England
5,N1,Eredivisie,Netherlands
6,P1,Liga Portugal,Portugal
7,I1,Serie A,Italia
8,SP1,Liga,Spain
9,SC0,Premiership,Scotland


In [6]:
# Updating league name
mapping_dict = dict(zip(df_league_names["Source name"], df_league_names["Real name"]))

df_matches["Division"] = df_matches["Division"].replace(mapping_dict)
df_matches

Unnamed: 0,Division,MatchDate,MatchTime,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,...,MaxHome,MaxDraw,MaxAway,Over25,Under25,MaxOver25,MaxUnder25,HandiSize,HandiHome,HandiAway
0,Ligue 1,2000-07-28,,Marseille,Troyes,1686.34,1586.57,0.0,0.0,0.0,...,,,,,,,,,,
1,Ligue 1,2000-07-28,,Paris SG,Strasbourg,1714.89,1642.51,0.0,0.0,0.0,...,,,,,,,,,,
2,F2,2000-07-28,,Wasquehal,Nancy,1465.08,1633.80,0.0,0.0,0.0,...,,,,,,,,,,
3,F2,2000-07-29,,Ajaccio,Le Mans,1470.87,1477.89,0.0,0.0,0.0,...,,,,,,,,,,
4,F2,2000-07-29,,Beauvais,Montpellier,1422.21,1606.00,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228372,Premier League,2025-02-26,19:30:00,Brentford,Everton,1738.05,1731.52,6.0,9.0,5.0,...,2.06,3.82,3.67,1.85,1.99,1.89,2.04,-0.5,2.04,1.86
228373,Premier League,2025-02-26,19:30:00,Man United,Ipswich,1757.62,1584.51,1.0,4.0,1.0,...,1.56,4.75,6.50,1.64,2.29,1.69,2.34,-1.0,1.88,2.02
228374,Premier League,2025-02-26,19:30:00,Nott'm Forest,Arsenal,1788.28,1999.49,3.0,6.0,6.0,...,4.10,3.60,2.02,2.17,1.71,2.24,1.75,0.5,1.90,2.00
228375,Premier League,2025-02-26,19:30:00,Tottenham,Man City,1785.53,1926.48,9.0,9.0,3.0,...,3.35,4.25,2.10,1.34,3.29,1.37,3.40,0.5,1.84,2.06


In [None]:
# Adding countries
mapping_dict_2 = dict(zip(df_league_names["Real name"], df_league_names["Countries"]))

df_matches["Country"] = df_matches["Division"].replace(mapping_dict_2)
df_matches

Unnamed: 0,Division,MatchDate,MatchTime,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,...,MaxDraw,MaxAway,Over25,Under25,MaxOver25,MaxUnder25,HandiSize,HandiHome,HandiAway,Country
0,Ligue 1,2000-07-28,,Marseille,Troyes,1686.34,1586.57,0.0,0.0,0.0,...,,,,,,,,,,France
1,Ligue 1,2000-07-28,,Paris SG,Strasbourg,1714.89,1642.51,0.0,0.0,0.0,...,,,,,,,,,,France
2,F2,2000-07-28,,Wasquehal,Nancy,1465.08,1633.80,0.0,0.0,0.0,...,,,,,,,,,,F2
3,F2,2000-07-29,,Ajaccio,Le Mans,1470.87,1477.89,0.0,0.0,0.0,...,,,,,,,,,,F2
4,F2,2000-07-29,,Beauvais,Montpellier,1422.21,1606.00,0.0,0.0,0.0,...,,,,,,,,,,F2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228372,Premier League,2025-02-26,19:30:00,Brentford,Everton,1738.05,1731.52,6.0,9.0,5.0,...,3.82,3.67,1.85,1.99,1.89,2.04,-0.5,2.04,1.86,England
228373,Premier League,2025-02-26,19:30:00,Man United,Ipswich,1757.62,1584.51,1.0,4.0,1.0,...,4.75,6.50,1.64,2.29,1.69,2.34,-1.0,1.88,2.02,England
228374,Premier League,2025-02-26,19:30:00,Nott'm Forest,Arsenal,1788.28,1999.49,3.0,6.0,6.0,...,3.60,2.02,2.17,1.71,2.24,1.75,0.5,1.90,2.00,England
228375,Premier League,2025-02-26,19:30:00,Tottenham,Man City,1785.53,1926.48,9.0,9.0,3.0,...,4.25,2.10,1.34,3.29,1.37,3.40,0.5,1.84,2.06,England


In [7]:
# Saving the updated dataset
df_matches.to_csv("../DATASETS/Matches_derived.csv")

## Creation of the *Competitions* and *Teams* datasets

The main puropose is to extract key information about divisions, such as :
- the competitiveness
- the average level
- the average evolution of the teams

In [8]:
from datetime import datetime

### Ranking

In [9]:
def create_ranking(matches: pd.DataFrame, at_home: bool, from_date: str, to_date: str) -> pd.DataFrame:
    # Creating a ranking from matches that happened between two dates
    # NB : Win = 3 points, Draw = 1 point, Loss = 0 point

    col = "HomeTeam" if at_home else "AwayTeam"
    result_mapping = {
        "H": "Win" if at_home else "Loss",
        "D": "Draw",
        "A": "Loss" if at_home else "Win",
    }

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")
    current_matches = matches[matches["MatchDate"].between(fd, td)]
    current_matches = current_matches.dropna(subset="FTResult").reset_index()
    results = current_matches.groupby([col, "FTResult"])["MatchDate"].count().reset_index()
    results = results.pivot(index=col, columns="FTResult", values="MatchDate")
    results = results.rename(columns=result_mapping)
    results = results.fillna(0)
    results = results[["Win", "Draw", "Loss"]]
    results["Points"] = results["Win"] * 3 + results["Draw"] * 1
    return results

In [10]:
def create_overall_ranking(matches: pd.DataFrame, from_date: str, to_date: str) -> pd.DataFrame:
    home_ranking = create_ranking(matches, True, from_date, to_date)
    away_ranking = create_ranking(matches, False, from_date, to_date)
    combined = pd.concat([home_ranking, away_ranking], axis="index")
    return combined.groupby(combined.index).sum()

In [11]:
create_ranking(df_matches, True, "2024-09-01", "2025-03-09")

FTResult,Win,Draw,Loss,Points
HomeTeam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A. Klagenfurt,0.0,3.0,1.0,3.0
A. Lustenau,2.0,0.0,0.0,6.0
AC Oulu,3.0,2.0,0.0,11.0
AEK,5.0,1.0,0.0,16.0
AFC Wimbledon,9.0,3.0,2.0,30.0
...,...,...,...,...
Zaragoza,4.0,3.0,6.0,15.0
Zenit,2.0,0.0,1.0,6.0
Zhejiang Professional,0.0,2.0,1.0,2.0
Zurich,3.0,0.0,0.0,9.0


In [12]:
ranking = create_overall_ranking(df_matches, "2024-09-01", "2025-03-09")
ranking

FTResult,Win,Draw,Loss,Points
A. Klagenfurt,1.0,4.0,1.0,7.0
A. Lustenau,3.0,0.0,0.0,9.0
AC Oulu,3.0,3.0,1.0,12.0
AEK,10.0,1.0,2.0,31.0
AFC Wimbledon,13.0,9.0,6.0,48.0
...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0
Zenit,4.0,0.0,2.0,12.0
Zhejiang Professional,1.0,2.0,4.0,5.0
Zurich,5.0,0.0,2.0,15.0


In [39]:
ranking.index.name = "Teams"

### Team goals

In [13]:
from typing import Literal

In [33]:
# Add an indicator for form consistency
def form_consistency_indicator(results: list[Literal["H", "D", "A"]]) -> float:
    if len(results) <= 1:
        return None
    # Count how much result change between the last match and a previous one
    variation = 0
    prev_res = results[0]
    for i in range(1, len(results)):
        if results[i] != prev_res:
            variation += 1
        prev_res = results[i]
    # Normalize with the total number of possible switches (number of matches played - 1)
    possible_switches = len(results) - 1
    return 1-(variation/possible_switches)

def team_side_consistency(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str, at_home: bool = True):
    col_team = "HomeTeam" if at_home else "AwayTeam"

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")
    team_matches = matches[matches[col_team] == team_name]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    team_matches = team_matches.sort_values(by="MatchDate", ascending=True)
    return form_consistency_indicator(list(team_matches["FTResult"]))

def team_all_consistency(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str):
    def team_result(team_name: str, home_team: str, ftresult: str):
        inverted_result = {
            'H': 'A',
            'D': 'D',
            'A': 'H',
        }
        if team_name != home_team:
            return inverted_result[ftresult]
        return ftresult

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")
    team_matches = matches[(matches["HomeTeam"] == team_name) | (matches["AwayTeam"] == team_name)]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    team_matches = team_matches.sort_values(by="MatchDate", ascending=True)
    team_matches["TeamResult"] = team_matches.apply(lambda row: team_result(team_name, row["HomeTeam"], row["FTResult"]), axis=1)
    return form_consistency_indicator(list(team_matches["TeamResult"]))

def team_consistency(
        matches: pd.DataFrame, team_name: str,
        from_date: str, to_date: str,
        side: Literal['home', 'away'] = None):
    if side == 'home':
        return team_side_consistency(matches, team_name, from_date, to_date, at_home=True)
    elif side == 'away':
        return team_side_consistency(matches, team_name, from_date, to_date, at_home=False)
    return team_all_consistency(matches, team_name, from_date, to_date)


In [41]:
ranking["Consistency - Home"] = ranking.index.map(lambda name: team_consistency(df_matches, name, "2024-09-01", "2025-03-09", side="home"))

In [35]:
ranking["Consistency - Away"] = ranking.index.map(lambda name: team_consistency(df_matches, name, "2024-09-01", "2025-03-09", side="away"))

In [36]:
ranking["Consistency - All"] = ranking.index.map(lambda name: team_consistency(df_matches, name, "2024-09-01", "2025-03-09"))

In [42]:
ranking

FTResult,Win,Draw,Loss,Points,Consistency,Consistency - Home,Consistency - Away,Consistency - All
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A. Klagenfurt,1.0,4.0,1.0,7.0,0.666667,0.666667,0.000000,0.400000
A. Lustenau,3.0,0.0,0.0,9.0,1.000000,1.000000,,1.000000
AC Oulu,3.0,3.0,1.0,12.0,0.750000,0.750000,0.000000,0.500000
AEK,10.0,1.0,2.0,31.0,0.600000,0.600000,0.500000,0.583333
AFC Wimbledon,13.0,9.0,6.0,48.0,0.461538,0.461538,0.615385,0.259259
...,...,...,...,...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0,0.416667,0.416667,0.416667,0.200000
Zenit,4.0,0.0,2.0,12.0,0.500000,0.500000,0.500000,0.400000
Zhejiang Professional,1.0,2.0,4.0,5.0,0.000000,0.000000,0.666667,0.333333
Zurich,5.0,0.0,2.0,15.0,1.000000,1.000000,0.000000,0.500000


In [53]:
# Scored goals
def team_side_goals(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str, at_home: bool = True) -> int:
    col_team = "HomeTeam" if at_home else "AwayTeam"

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")

    team_matches = matches[matches[col_team] == team_name]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    col_goals = "FTHome" if at_home else "FTAway"
    return int(team_matches[col_goals].sum())

In [44]:
ranking["Goals - Home"] = ranking.index.map(lambda name: team_side_goals(df_matches, name, "2024-09-01", "2025-03-09", at_home=True))

In [45]:
ranking["Goals - Away"] = ranking.index.map(lambda name: team_side_goals(df_matches, name, "2024-09-01", "2025-03-09", at_home=False))

In [49]:
ranking["Goals - All"] = ranking.apply(lambda row: row["Goals - Home"] + row["Goals - Away"], axis=1)

In [51]:
ranking

FTResult,Win,Draw,Loss,Points,Consistency,Consistency - Home,Consistency - Away,Consistency - All,Goals - Home,Goals - Away,Goals - All
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A. Klagenfurt,1.0,4.0,1.0,7.0,0.666667,0.666667,0.000000,0.400000,4,3,7.0
A. Lustenau,3.0,0.0,0.0,9.0,1.000000,1.000000,,1.000000,3,2,5.0
AC Oulu,3.0,3.0,1.0,12.0,0.750000,0.750000,0.000000,0.500000,8,3,11.0
AEK,10.0,1.0,2.0,31.0,0.600000,0.600000,0.500000,0.583333,15,13,28.0
AFC Wimbledon,13.0,9.0,6.0,48.0,0.461538,0.461538,0.615385,0.259259,24,14,38.0
...,...,...,...,...,...,...,...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0,0.416667,0.416667,0.416667,0.200000,16,16,32.0
Zenit,4.0,0.0,2.0,12.0,0.500000,0.500000,0.500000,0.400000,4,5,9.0
Zhejiang Professional,1.0,2.0,4.0,5.0,0.000000,0.000000,0.666667,0.333333,5,5,10.0
Zurich,5.0,0.0,2.0,15.0,1.000000,1.000000,0.000000,0.500000,4,8,12.0


In [60]:
# Conceded goals
def team_side_conceded(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str, at_home: bool = True) -> int:
    col_team = "HomeTeam" if at_home else "AwayTeam"

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")

    team_matches = matches[matches[col_team] == team_name]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    col_goals = "FTAway" if at_home else "FTHome"
    return int(team_matches[col_goals].sum())

In [55]:
ranking["Conceded - Home"] = ranking.index.map(lambda name: team_side_conceded(df_matches, name, "2024-09-01", "2025-03-09", at_home=True))

In [56]:
ranking["Conceded - Away"] = ranking.index.map(lambda name: team_side_conceded(df_matches, name, "2024-09-01", "2025-03-09", at_home=False))

In [57]:
ranking["Conceded - All"] = ranking.apply(lambda row: row["Conceded - Home"] + row["Conceded - Away"], axis=1)

In [58]:
ranking

FTResult,Win,Draw,Loss,Points,Consistency,Consistency - Home,Consistency - Away,Consistency - All,Goals - Home,Goals - Away,Goals - All,Conceded - Home,Conceded - Away,Conceded - All
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A. Klagenfurt,1.0,4.0,1.0,7.0,0.666667,0.666667,0.000000,0.400000,4,3,7.0,5,2,7.0
A. Lustenau,3.0,0.0,0.0,9.0,1.000000,1.000000,,1.000000,3,2,5.0,0,0,0.0
AC Oulu,3.0,3.0,1.0,12.0,0.750000,0.750000,0.000000,0.500000,8,3,11.0,3,4,7.0
AEK,10.0,1.0,2.0,31.0,0.600000,0.600000,0.500000,0.583333,15,13,28.0,2,9,11.0
AFC Wimbledon,13.0,9.0,6.0,48.0,0.461538,0.461538,0.615385,0.259259,24,14,38.0,7,10,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0,0.416667,0.416667,0.416667,0.200000,16,16,32.0,16,18,34.0
Zenit,4.0,0.0,2.0,12.0,0.500000,0.500000,0.500000,0.400000,4,5,9.0,2,3,5.0
Zhejiang Professional,1.0,2.0,4.0,5.0,0.000000,0.000000,0.666667,0.333333,5,5,10.0,6,9,15.0
Zurich,5.0,0.0,2.0,15.0,1.000000,1.000000,0.000000,0.500000,4,8,12.0,0,8,8.0


### Team shots

In [73]:
# Shots made
def team_side_shots(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str, at_home: bool = True) -> int:
    col_team = "HomeTeam" if at_home else "AwayTeam"

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")

    team_matches = matches[matches[col_team] == team_name]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    col_shots = "HomeShots" if at_home else "AwayShots"
    return int(team_matches[col_shots].sum())

In [64]:
ranking["Shots - Home"] = ranking.index.map(lambda name: team_side_shots(df_matches, name, "2024-09-01", "2025-03-09", at_home=True))

In [65]:
ranking["Shots - Away"] = ranking.index.map(lambda name: team_side_shots(df_matches, name, "2024-09-01", "2025-03-09", at_home=False))

In [66]:
ranking["Shots - All"] = ranking.apply(lambda row: row["Shots - Home"] + row["Shots - Away"], axis=1)

In [67]:
ranking

FTResult,Win,Draw,Loss,Points,Consistency,Consistency - Home,Consistency - Away,Consistency - All,Goals - Home,Goals - Away,Goals - All,Conceded - Home,Conceded - Away,Conceded - All,Shots - Home,Shots - Away,Shots - All
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A. Klagenfurt,1.0,4.0,1.0,7.0,0.666667,0.666667,0.000000,0.400000,4,3,7.0,5,2,7.0,0,0,0.0
A. Lustenau,3.0,0.0,0.0,9.0,1.000000,1.000000,,1.000000,3,2,5.0,0,0,0.0,0,0,0.0
AC Oulu,3.0,3.0,1.0,12.0,0.750000,0.750000,0.000000,0.500000,8,3,11.0,3,4,7.0,0,0,0.0
AEK,10.0,1.0,2.0,31.0,0.600000,0.600000,0.500000,0.583333,15,13,28.0,2,9,11.0,135,82,217.0
AFC Wimbledon,13.0,9.0,6.0,48.0,0.461538,0.461538,0.615385,0.259259,24,14,38.0,7,10,17.0,173,163,336.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0,0.416667,0.416667,0.416667,0.200000,16,16,32.0,16,18,34.0,197,156,353.0
Zenit,4.0,0.0,2.0,12.0,0.500000,0.500000,0.500000,0.400000,4,5,9.0,2,3,5.0,0,0,0.0
Zhejiang Professional,1.0,2.0,4.0,5.0,0.000000,0.000000,0.666667,0.333333,5,5,10.0,6,9,15.0,0,0,0.0
Zurich,5.0,0.0,2.0,15.0,1.000000,1.000000,0.000000,0.500000,4,8,12.0,0,8,8.0,0,0,0.0


In [None]:
# Shots conceded
def team_side_shots_conceded(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str, at_home: bool = True) -> int:
    col_team = "HomeTeam" if at_home else "AwayTeam"

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")

    team_matches = matches[matches[col_team] == team_name]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    col_shots = "AwayShots" if at_home else "HomeShots"
    return int(team_matches[col_shots].sum())

In [69]:
ranking["Shots conceded - Home"] = ranking.index.map(lambda name: team_side_shots_conceded(df_matches, name, "2024-09-01", "2025-03-09", at_home=True))

In [70]:
ranking["Shots conceded - Away"] = ranking.index.map(lambda name: team_side_shots_conceded(df_matches, name, "2024-09-01", "2025-03-09", at_home=False))

In [71]:
ranking["Shots conceded - All"] = ranking.apply(lambda row: row["Shots conceded - Home"] + row["Shots conceded - Away"], axis=1)

In [72]:
ranking

FTResult,Win,Draw,Loss,Points,Consistency,Consistency - Home,Consistency - Away,Consistency - All,Goals - Home,Goals - Away,Goals - All,Conceded - Home,Conceded - Away,Conceded - All,Shots - Home,Shots - Away,Shots - All,Shots conceded - Home,Shots conceded - Away,Shots conceded - All
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A. Klagenfurt,1.0,4.0,1.0,7.0,0.666667,0.666667,0.000000,0.400000,4,3,7.0,5,2,7.0,0,0,0.0,0,0,0.0
A. Lustenau,3.0,0.0,0.0,9.0,1.000000,1.000000,,1.000000,3,2,5.0,0,0,0.0,0,0,0.0,0,0,0.0
AC Oulu,3.0,3.0,1.0,12.0,0.750000,0.750000,0.000000,0.500000,8,3,11.0,3,4,7.0,0,0,0.0,0,0,0.0
AEK,10.0,1.0,2.0,31.0,0.600000,0.600000,0.500000,0.583333,15,13,28.0,2,9,11.0,135,82,217.0,35,41,76.0
AFC Wimbledon,13.0,9.0,6.0,48.0,0.461538,0.461538,0.615385,0.259259,24,14,38.0,7,10,17.0,173,163,336.0,100,151,251.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0,0.416667,0.416667,0.416667,0.200000,16,16,32.0,16,18,34.0,197,156,353.0,139,183,322.0
Zenit,4.0,0.0,2.0,12.0,0.500000,0.500000,0.500000,0.400000,4,5,9.0,2,3,5.0,0,0,0.0,0,0,0.0
Zhejiang Professional,1.0,2.0,4.0,5.0,0.000000,0.000000,0.666667,0.333333,5,5,10.0,6,9,15.0,0,0,0.0,0,0,0.0
Zurich,5.0,0.0,2.0,15.0,1.000000,1.000000,0.000000,0.500000,4,8,12.0,0,8,8.0,0,0,0.0,0,0,0.0


In [74]:
# Shots on target made
def team_side_target(matches: pd.DataFrame, team_name: str, from_date: str, to_date: str, at_home: bool = True) -> int:
    col_team = "HomeTeam" if at_home else "AwayTeam"

    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")

    team_matches = matches[matches[col_team] == team_name]
    team_matches = team_matches[team_matches["MatchDate"].between(fd, td)]
    col_target = "HomeTarget" if at_home else "AwayTarget"
    return int(team_matches[col_target].sum())

In [75]:
ranking["Shots on target - Home"] = ranking.index.map(lambda name: team_side_target(df_matches, name, "2024-09-01", "2025-03-09", at_home=True))

In [76]:
ranking["Shots on target - Away"] = ranking.index.map(lambda name: team_side_target(df_matches, name, "2024-09-01", "2025-03-09", at_home=False))

In [77]:
ranking["Shots on target - All"] = ranking.apply(lambda row: row["Shots on target - Home"] + row["Shots on target - Away"], axis=1)

In [78]:
ranking

FTResult,Win,Draw,Loss,Points,Consistency,Consistency - Home,Consistency - Away,Consistency - All,Goals - Home,Goals - Away,...,Conceded - All,Shots - Home,Shots - Away,Shots - All,Shots conceded - Home,Shots conceded - Away,Shots conceded - All,Shots on target - Home,Shots on target - Away,Shots on target - All
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A. Klagenfurt,1.0,4.0,1.0,7.0,0.666667,0.666667,0.000000,0.400000,4,3,...,7.0,0,0,0.0,0,0,0.0,0,0,0.0
A. Lustenau,3.0,0.0,0.0,9.0,1.000000,1.000000,,1.000000,3,2,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
AC Oulu,3.0,3.0,1.0,12.0,0.750000,0.750000,0.000000,0.500000,8,3,...,7.0,0,0,0.0,0,0,0.0,0,0,0.0
AEK,10.0,1.0,2.0,31.0,0.600000,0.600000,0.500000,0.583333,15,13,...,11.0,135,82,217.0,35,41,76.0,38,34,72.0
AFC Wimbledon,13.0,9.0,6.0,48.0,0.461538,0.461538,0.615385,0.259259,24,14,...,17.0,173,163,336.0,100,151,251.0,67,47,114.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zaragoza,7.0,8.0,11.0,29.0,0.416667,0.416667,0.416667,0.200000,16,16,...,34.0,197,156,353.0,139,183,322.0,70,51,121.0
Zenit,4.0,0.0,2.0,12.0,0.500000,0.500000,0.500000,0.400000,4,5,...,5.0,0,0,0.0,0,0,0.0,0,0,0.0
Zhejiang Professional,1.0,2.0,4.0,5.0,0.000000,0.000000,0.666667,0.333333,5,5,...,15.0,0,0,0.0,0,0,0.0,0,0,0.0
Zurich,5.0,0.0,2.0,15.0,1.000000,1.000000,0.000000,0.500000,4,8,...,8.0,0,0,0.0,0,0,0.0,0,0,0.0


### Team elo

In [117]:
df_elo = pd.read_csv(r"C:\Users\thier\OneDrive\Documents\Informatique\Projets\Soccer BI\DATASETS\EloRatings.csv", parse_dates=["date"])
df_elo

Unnamed: 0,date,club,country,elo
0,2000-07-01,Aachen,GER,1453.60
1,2000-07-01,Aalborg,DEN,1482.61
2,2000-07-01,Aalst,BEL,1337.53
3,2000-07-01,Aarhus,DEN,1381.46
4,2000-07-01,Aberdeen,SCO,1360.43
...,...,...,...,...
242586,2025-03-01,Wolfsburg,GER,1681.99
242587,2025-03-01,Wolves,ENG,1677.75
242588,2025-03-01,Zaragoza,ESP,1482.18
242589,2025-03-01,Zenit,RUS,1634.92


In [122]:
def elo(elo_dataset: pd.DataFrame, team_name: str, from_date: str, to_date: str):
    team_elos = elo_dataset[elo_dataset["club"] == team_name]
    fd = datetime.strptime(from_date, "%Y-%m-%d")
    td = datetime.strptime(to_date, "%Y-%m-%d")
    team_elos = team_elos[team_elos["date"].between(fd, td)]
    team_elos = team_elos.sort_values("date", ascending=False)
    
    if len(team_elos) <= 1:
        return None
    res = team_elos["elo"].iloc[0]
    return res

In [123]:
ranking["Elo"] = ranking.index.map(lambda name: elo(df_elo, name, "2024-09-01", "2025-03-09"))

In [125]:
ranking

FTResult,Win,Elo
Teams,Unnamed: 1_level_1,Unnamed: 2_level_1
A. Klagenfurt,1.0,
A. Lustenau,3.0,
AC Oulu,3.0,
AEK,10.0,1585.67
AFC Wimbledon,13.0,
...,...,...
Zaragoza,7.0,1482.18
Zenit,4.0,1634.92
Zhejiang Professional,1.0,
Zurich,5.0,
