# Combining Recent Form into a Single Dataset

In [1]:
import pandas as pd
import time
import random
from fuzzywuzzy import process

# === Step 1: Load Base Data ===
all_teams_df = pd.read_csv("../../../data/teams/cleaned/all_league_teams.csv")
all_teams_df = all_teams_df.rename(columns={"Team Name": "Team"})

# Step 5 writes the merged result back to this same CSV, so a previous run may
# already have added a "Recent Form" column. Drop it so re-running the notebook
# does not pile up suffixed duplicates of that column.
all_teams_df = all_teams_df.loc[:, ~all_teams_df.columns.str.contains("Recent Form", case=False)]

# === Step 2: Load All Recent Form Files ===
form_files = [
    "../../../data/teams/raw/recent_form/bundesliga_recent_form.csv",
    "../../../data/teams/raw/recent_form/la_liga_recent_form.csv",
    "../../../data/teams/raw/recent_form/ligue_1_recent_form.csv",
    "../../../data/teams/raw/recent_form/pl_recent_form.csv",
    "../../../data/teams/raw/recent_form/serie_a_recent_form.csv"
]
recent_form_df = pd.concat([pd.read_csv(file) for file in form_files], ignore_index=True)

# === Step 3: Fuzzy Match Team Names ===
team_names_all = all_teams_df['Team'].unique()

# Matches scoring below this are still used, but are printed for manual review.
# The value is a judgment call; tune it after inspecting the report below.
LOW_CONFIDENCE_SCORE = 80

# extractOne returns (best_choice, score). Keep the score as well as the name:
# the best candidate is returned no matter how poor it is, so the score is the
# only signal that a match may be wrong.
team_matches = [process.extractOne(name, team_names_all) for name in recent_form_df['Team']]
recent_form_df['Matched Team'] = [match[0] for match in team_matches]
recent_form_df['Match Score'] = [match[1] for match in team_matches]

low_confidence = recent_form_df[recent_form_df['Match Score'] < LOW_CONFIDENCE_SCORE]
if not low_confidence.empty:
    print("Low-confidence team matches (verify manually):")
    print(low_confidence[['Team', 'Matched Team', 'Match Score']].to_string(index=False))

# Two recent-form rows can resolve to the same base team; merging them as-is
# would duplicate that team's row in the output. Keep only the highest-scoring
# row per matched team (mergesort is stable, so ties keep file order).
form_by_team = (
    recent_form_df
    .sort_values('Match Score', ascending=False, kind='mergesort')
    .drop_duplicates(subset='Matched Team', keep='first')
)
dropped_rows = len(recent_form_df) - len(form_by_team)
if dropped_rows:
    print(f"Dropped {dropped_rows} recent-form row(s) that matched an already-matched team.")

# === Step 4: Merge DataFrames Using Matched Team Names ===
# validate='many_to_one' makes pandas raise if the right-hand keys are not
# unique, guaranteeing the left join cannot multiply base rows.
merged_df = pd.merge(
    all_teams_df,
    form_by_team[['Matched Team', 'Recent Form']],
    left_on='Team',
    right_on='Matched Team',
    how='left',
    validate='many_to_one'
)

# Clean up: the join key from the right-hand side duplicates 'Team'.
merged_df = merged_df.drop(columns=['Matched Team'])

print(merged_df.head())



           League               Team  MP  GF  GA  GD    xG   xGA   xGD  \
0  Premier League    Manchester City  36  67  43  24  63.6  45.3  18.3   
1  Premier League         Arsenal FC  36  66  33  33  57.0  32.3  24.7   
2  Premier League       Liverpool FC  36  83  37  46  77.8  34.6  43.2   
3  Premier League         Chelsea FC  36  62  43  19  65.9  45.8  20.1   
4  Premier League  Tottenham Hotspur  36  63  59   4  56.2  59.7  -3.5   

   Shots/90  ...  Tackles Won  Challenges Attempted  Aerial Duel Win %  \
0     15.89  ...          287                   430               48.4   
1     14.14  ...          335                   483               50.5   
2     16.86  ...          380                   626               52.8   
3     15.92  ...          346                   516               51.1   
4     13.53  ...          403                   569               49.8   

   Most Common Formation                                           Logo URL  \
0                4-2-3-1  https

In [2]:
# === Step 5: (Optional) Persist the Merged Table ===
# Writes to the same cleaned CSV path that Step 1 reads from; the DataFrame
# index is not written.
merged_df.to_csv(
    path_or_buf="../../../data/teams/cleaned/all_league_teams.csv",
    index=False,
)