In [23]:
import pandas as pd
from pathlib import Path
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Define the base and data directories
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.parents[0] / "transfer_data"
CLUBS_FILE = "clubs.csv"
ELOS_FILE = "club_elo_2024-09-21.csv"
FBREF_FILE = 'standard_stats_big5.csv'
# Load CSV data
fbref_df = pd.read_csv(os.path.join(DATA_DIR, FBREF_FILE), sep=',', encoding='UTF-8')
clubs_df = pd.read_csv(os.path.join(DATA_DIR, CLUBS_FILE), sep=",", encoding="UTF-8")
elos_df = pd.read_csv(os.path.join(DATA_DIR, ELOS_FILE), sep=",", encoding="UTF-8")
elos_df = elos_df.head(632)  # Restrict the ELO data to the first 632 rows

import pandas as pd
from fuzzywuzzy import fuzz, process

def fuzzy_merge(
    df_1, df_2, key1, key2, threshold=90, limit=2, scorer=fuzz.partial_ratio
):
    """
    Fuzzy merge two dataframes based on a similarity score, including all columns
    from both dataframes.
    
    Args:
    - df_1: First dataframe.
    - df_2: Second dataframe.
    - key1: Column name in df_1 to merge on.
    - key2: Column name in df_2 to merge on.
    - threshold: Minimum score for matching.
    - limit: Maximum number of matches to consider.
    - scorer: Scoring function for fuzzy matching.
    
    Returns:
    - Merged dataframe with all columns from both dataframes.
    """
    s = df_2[key2].tolist()
    # Apply fuzzy matching and get the closest matches
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit, scorer=scorer))
    # Filter matches that meet the thresholdx
    df_1["possible_matches"] = m.apply(lambda x: [i for i in x if i[1] >= threshold])
    
    # Extract the top match (if any) from the possible_matches and merge columns
    df_1["matches"] = df_1["possible_matches"].apply(lambda x: x[0][0] if x else None)
    # Merge df_1 with df_2 based on the best match from df_2's key2 column
    merged_df = pd.merge(df_1, df_2, how="left", left_on="matches", right_on=key2)
    
    # Drop unnecessary columns (if you don't want 'matches' and 'possible_matches')
    merged_df.drop(columns=["possible_matches"], inplace=True)
    
    return merged_df



In [24]:


# ClubElos - Transfermrkt
# Initialize the unmatched dataframe
unmatched_df = elos_df.copy()
unmatched_df.columns = unmatched_df.columns.str.lower()
unmatched_df = unmatched_df[unmatched_df['country'] == 'ENG']

# Replace 'Man' with 'Manchester' in the 'club' column (case-insensitive)
unmatched_df['club'] = unmatched_df['club'].str.replace(r'\bMan\b', 'Manchester', case=False, regex=True)

# Filter clubs_df so that it only contains English teams
clubs_df = clubs_df[clubs_df['domestic_competition_id'] == 'GB1']

# Start the threshold at a higher value and keep lowering it until all teams are matched
initial_threshold = 85
threshold_step = 5  # Decrease the threshold by 5 in each iteration
min_threshold = 60  # Stop decreasing if threshold reaches this minimum
all_results = []
counter = 0

while not unmatched_df.empty and initial_threshold >= min_threshold:
    # Perform fuzzy merge with the current threshold
    matched_df = fuzzy_merge(
        unmatched_df.copy(),
        clubs_df,
        "club",
        "name",
        threshold=initial_threshold,
        limit=1,
        scorer=fuzz.partial_ratio,
    )

    # Save the intermediate results for each iteration
    # matched_df.to_csv(f"match_V{counter}.csv", index=True)
    counter += 1

    # Filter out matched rows
    matched_clubs = matched_df[matched_df["matches"] != ""]
    matched_names = matched_clubs["matches"].str.split(", ").explode().unique()

    # Remove matched clubs from clubs_df and unmatched_df
    clubs_df = clubs_df[~clubs_df["name"].isin(matched_names)]
    unmatched_df = unmatched_df[~unmatched_df["club"].isin(matched_clubs["club"])]

    # Append current iteration's matched results to all_results
    all_results.append(matched_clubs)

    # If there are unmatched clubs, reduce the threshold
    if not unmatched_df.empty:
        print(f"Unmatched teams remaining: {len(unmatched_df)}")
        initial_threshold -= threshold_step

# Combine all matched results and save
final_result = pd.concat(all_results, ignore_index=True)
final = final_result.copy()
# final_result.to_csv("final_match.csv", index=False)

# Save any unmatched entries
if not unmatched_df.empty:
    # unmatched_df.to_csv("unmatched.csv", index=False)
    final = pd.concat([final_result, unmatched_df], ignore_index=True)
    # Drop last column cuz it shouldn't be there
    final = final[final.columns.intersection(final_result.columns)]

final.to_csv('clubelos_transfermrkt.csv', index=False)
print("All teams were matched!")

All teams were matched!
