In [1]:
import pandas as pd
from pathlib import Path
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Locate the input folder: "transfer_data" sits next to the current working directory
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.parents[0] / "transfer_data"
CLUBS_FILE = "clubs.csv"
ELOS_FILE = "club_elos.csv"

# Read the club list and the ELO table; only the first 632 ELO rows are kept
clubs_df = pd.read_csv(DATA_DIR / CLUBS_FILE, sep=",", encoding="UTF-8")
elos_df = pd.read_csv(DATA_DIR / ELOS_FILE, sep=",", encoding="UTF-8").head(632)


# Define the fuzzy merge function
def fuzzy_merge(
    df_1, df_2, key1, key2, threshold=90, limit=2, scorer=fuzz.partial_ratio
):
    """
    Annotate ``df_1`` with fuzzy matches looked up in ``df_2``.

    For every value of ``df_1[key1]`` the ``limit`` best-scoring candidates
    from ``df_2[key2]`` are retrieved with ``process.extract``; candidates
    whose score is at least ``threshold`` are kept.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Frame whose ``key1`` column holds the strings to look up.
    df_2 : pandas.DataFrame
        Frame whose ``key2`` column holds the candidate strings.
    key1, key2 : str
        Column names in ``df_1`` and ``df_2`` respectively.
    threshold : int, default 90
        Minimum score (fuzzywuzzy scorers report 0-100) a candidate needs
        to be accepted.
    limit : int, default 2
        Maximum number of candidates retrieved per lookup.
    scorer : callable, default ``fuzz.partial_ratio``
        Similarity function handed to ``process.extract``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_1`` with an additional ``"matches"`` column holding
        the accepted candidates joined by ``", "`` (an empty string when
        no candidate reached ``threshold``). The frame passed in as
        ``df_1`` is left unmodified.
    """
    choices = df_2[key2].tolist()

    def accepted_matches(query):
        # Each hit is (candidate, score) because `choices` is a plain list.
        hits = process.extract(query, choices, limit=limit, scorer=scorer)
        return ", ".join(hit[0] for hit in hits if hit[1] >= threshold)

    # Work on a copy so the caller's frame does not silently gain a column.
    result = df_1.copy()
    result["matches"] = df_1[key1].apply(accepted_matches)
    return result


# Working copy of the ELO table; this is the frame fed to the matcher
unmatched_df = elos_df.copy()

# Candidate fuzzy_merge settings, strictest threshold first
configurations = [
    dict(threshold=cutoff, limit=1, scorer=fuzz.partial_ratio)
    for cutoff in (85, 80, 75)
]

# Accumulator for matched rows and a running number for the output files
all_results = []
counter = 0

# NOTE: only the first configuration is executed here; the remaining
# entries of `configurations` are not iterated over in this cell.
config = configurations[0]
matched_df = fuzzy_merge(unmatched_df.copy(), clubs_df, "Club", "name", **config)

# Persist this pass (index column included) and advance the file counter
matched_df.to_csv(f"match_V{counter}.csv", index=True)
counter += 1





In [2]:
# Show the fuzzy-match table built in the previous cell
matched_df

Unnamed: 0.1,Unnamed: 0,Rank,Club,Country,Level,Elo,From,To,Updated,matches
0,0,1,Liverpool,ENG,1,2088.261963,2020-01-30,2020-02-01,2020-01-31,Liverpool FC
1,1,2,Man City,ENG,1,2002.820190,2020-01-22,2020-02-02,2020-01-31,
2,2,3,Barcelona,ESP,1,1964.985352,2020-01-26,2020-02-02,2020-01-31,Futbol Club Barcelona
3,3,4,Bayern,GER,1,1951.214600,2020-01-26,2020-02-01,2020-01-31,FC Bayern München
4,4,5,Juventus,ITA,1,1908.358643,2020-01-27,2020-02-02,2020-01-31,Juventus Football Club
...,...,...,...,...,...,...,...,...,...,...
627,627,628,Santa Coloma,AND,0,773.999878,2019-08-02,2020-08-08,2020-01-31,
628,628,629,Matecosa Sant Julia,AND,0,754.937744,2019-08-02,2020-07-01,2020-01-31,
629,629,630,Tre Fiori,SMR,0,707.414734,2019-07-31,2020-08-08,2020-01-31,
630,630,631,Tre Penne,SMR,0,676.546082,2019-07-31,2020-08-08,2020-01-31,


In [None]:

# Keep only the rows for which at least one candidate name was accepted
matched_clubs = matched_df.loc[matched_df["matches"].ne("")]

# Flatten the ", "-joined candidate strings into a unique array of club names
matched_names = (
    matched_clubs["matches"]
    .str.split(", ")
    .explode()
    .unique()
)

# Drop everything already paired from both sides of the matching problem
clubs_df = clubs_df.loc[~clubs_df["name"].isin(matched_names)]
unmatched_df = unmatched_df.loc[~unmatched_df["Club"].isin(matched_clubs["Club"])]

# Record this pass's matches
# NOTE(review): re-running this cell appends `matched_clubs` to `all_results`
# again, so the combined file would then contain the same rows twice.
all_results.append(matched_clubs)

# Write the combined matches
final_result = pd.concat(all_results, ignore_index=True)
final_result.to_csv("final_match.csv", index=False)

# Write whatever is still unmatched
unmatched_df.to_csv("unmatched.csv", index=False)