In [1]:
from datetime import datetime
import io
import requests
import pandas as pd
from pathlib import Path
import time
import os
from bs4 import BeautifulSoup

In [2]:
# Ranking table "trank2023", served from the site's "method5" directory.
# An earlier attempt requested method4/trank2012.html instead; the two
# directories apparently correspond to different methods.
# TODO: try the other directory (method4 or method5) as a fallback.
ranking_url = "https://kassiesa.net/uefa/data/method5/trank2023.html"
ranking_tables = pd.read_html(ranking_url)
df = ranking_tables[0]  # only the first table found on the page is used


In [3]:
# Cleaning df: discard the two leading columns, then label the two columns
# that are now first as "name" and "country".
# The result is built as a new frame and rebound to `df` instead of mutating
# the loaded table with inplace=True.
# NOTE: re-running this cell without re-running the load cell removes two
# more columns each time.
trimmed_df = df.drop(columns=df.columns[[0, 1]])
df = trimmed_df.rename(
    columns={
        # Labels are read from the trimmed frame, i.e. after the drop.
        trimmed_df.columns[0]: "name",
        trimmed_df.columns[1]: "country",
    }
)

In [4]:
# File names inside the data directory (ELOS_FILE is not read in this cell).
CLUBS_FILE = "clubs.csv"
ELOS_FILE = "club_elos.csv"

# The data lives in "transfer_data", a sibling of the current working
# directory (parent of cwd / transfer_data).
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.parents[0] / "transfer_data"

# Load CSV data
clubs_csv_path = os.path.join(DATA_DIR, CLUBS_FILE)
clubs_df = pd.read_csv(clubs_csv_path, sep=",", encoding="UTF-8")

In [10]:
from fuzzywuzzy import fuzz, process

# Define the fuzzy merge function
def fuzzy_merge(
    df_1, df_2, key1, key2, threshold=90, limit=2, scorer=fuzz.partial_ratio
):
    """
    Attach fuzzy-match candidates from ``df_2[key2]`` to every row of ``df_1``.

    Each value of ``df_1[key1]`` is looked up with ``process.extract`` against
    the list of values in ``df_2[key2]``. Of the candidates returned, those
    whose score is at least ``threshold`` are kept and stored, joined with
    ", ", in a "matches" column. Rows with no qualifying candidate get an
    empty string.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Frame whose rows are to be matched. It is left unmodified.
    df_2 : pandas.DataFrame
        Frame that supplies the candidate values.
    key1 : str
        Column of ``df_1`` holding the query values.
    key2 : str
        Column of ``df_2`` holding the candidate values.
    threshold : int, default 90
        Minimum score (inclusive) a candidate needs to be kept.
    limit : int, default 2
        Forwarded to ``process.extract`` as the number of candidates to
        return per query.
    scorer : callable, default ``fuzz.partial_ratio``
        Scoring function forwarded to ``process.extract``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_1`` with the "matches" column added (or overwritten).
    """
    choices = df_2[key2].tolist()

    def _accepted_matches(query):
        # process.extract yields (choice, score) tuples for a list of choices.
        candidates = process.extract(query, choices, limit=limit, scorer=scorer)
        return ", ".join([c[0] for c in candidates if c[1] >= threshold])

    # Work on a copy so the caller's frame is never mutated; this also avoids
    # SettingWithCopyWarning when a filtered slice is passed in.
    result = df_1.copy()
    result["matches"] = df_1[key1].apply(_accepted_matches)
    return result



In [11]:

# Rows of the ranking table that still have no match.
unmatched_df = df.copy()

# Clubs that are still available as match candidates. A separate name is used
# so the loaded `clubs_df` is never overwritten: re-running this cell then
# starts again from the full club list instead of the list already depleted
# by a previous run.
remaining_clubs_df = clubs_df

# Matching passes, strictest threshold first. Each pass only sees the rows
# and clubs that the earlier passes left unmatched.
configurations = [
    {"threshold": 85, "limit": 1, "scorer": fuzz.partial_ratio},
    {"threshold": 80, "limit": 1, "scorer": fuzz.partial_ratio},
    {"threshold": 75, "limit": 1, "scorer": fuzz.partial_ratio},
]

# Loop through each configuration and perform fuzzy matching
all_results = []
for counter, config in enumerate(configurations):
    # Perform fuzzy merge on a copy of the still-unmatched rows
    matched_df = fuzzy_merge(
        unmatched_df.copy(),
        remaining_clubs_df,
        "name",
        "name",
        threshold=config["threshold"],
        limit=config["limit"],
        scorer=config["scorer"],
    )

    # Save the intermediate result of this pass (matched and unmatched rows)
    matched_df.to_csv(
        f"match_V{counter}.csv",
        index=True,
    )

    # Rows that received at least one candidate in this pass
    matched_clubs = matched_df[matched_df["matches"] != ""]
    # "matches" holds the accepted candidates as one ", "-joined string, so it
    # is split back into individual club names here.
    matched_names = matched_clubs["matches"].str.split(", ").explode().unique()

    # Remove what was matched from both sides before the next, looser pass
    remaining_clubs_df = remaining_clubs_df[
        ~remaining_clubs_df["name"].isin(matched_names)
    ]
    unmatched_df = unmatched_df[~unmatched_df["name"].isin(matched_clubs["name"])]

    # Append current iteration's matched results to all_results
    all_results.append(matched_clubs)

# Combine all matched results and save
final_result = pd.concat(all_results, ignore_index=True)
final_result.to_csv("final_match.csv", index=False)

# Save unmatched entries
unmatched_df.to_csv("unmatched.csv", index=False)
