In [2]:
import pandas as pd
from unidecode import unidecode

# Read both raw datasets from disk
players_2022 = pd.read_json('assets/FootballPlayerStats2022-2023.json')
players_url = pd.read_json('assets/players_22.json')

top_5_leagues = ["Premier League", "Serie A", "La Liga", "Ligue 1", "Bundesliga"]

# Keep only the rows whose "Comp" value is one of the top 5 leagues,
# renumbering the index from 0 afterwards
in_top_5 = players_2022["Comp"].isin(top_5_leagues)
players_2022 = players_2022.loc[in_top_5].reset_index(drop=True)

# Remove the unwanted 'Unnamed: 0' column; a no-op when it is absent
players_2022 = players_2022.drop(columns=['Unnamed: 0'], errors='ignore')

# Preprocessing function
def preprocess_name(name):
    """Return ``name`` transliterated with ``unidecode`` and lowercased.

    Values that ``pd.isna`` reports as missing are mapped to ``''``.
    """
    return '' if pd.isna(name) else unidecode(name).lower()

# Tokenize function
def tokenize(name, delimiter=' '):
    """Split ``name`` on ``delimiter`` and return the distinct pieces as a set.

    Empty pieces are kept, so ``''`` yields ``{''}`` and consecutive
    delimiters contribute an ``''`` token.
    """
    pieces = name.split(delimiter)
    return {*pieces}

# Create copies of the columns for preprocessing
players_url['processed_player_url'] = players_url['player_url'].apply(preprocess_name)
players_2022['processed_Player'] = players_2022['Player'].apply(preprocess_name)


def _url_slug(url):
    """Return the second-to-last '/'-separated segment of ``url``, or '' if there is none."""
    segments = url.split('/')
    # preprocess_name turns a missing URL into '', which splits into a single
    # segment; indexing [-2] on that would raise IndexError.
    return segments[-2] if len(segments) >= 2 else ''


players_url['url_name'] = players_url['processed_player_url'].apply(_url_slug)

players_2022['Player_tokens'] = players_2022['processed_Player'].apply(tokenize)
# An empty slug gets an empty token set: tokenize('') would give {''}, which a
# player with a missing name (tokens {''}) would spuriously match.
players_url['url_name_tokens'] = players_url['url_name'].apply(
    lambda slug: tokenize(slug, delimiter='-') if slug else set())

# Export tokens to CSV for debugging
# players_2022[['Player', 'Player_tokens']].to_csv('player_name_tokens.csv', index=False)
# players_url[['url_name', 'url_name_tokens']].to_csv('url_name_tokens.csv', index=False)

# Function to match names using the tokenized names
def subset_match(player_tokens, url_name_tokens):
    """Return True when every token in ``player_tokens`` is also in ``url_name_tokens``."""
    return player_tokens.issubset(url_name_tokens)

# Create a column for matched url names in players_2022.
# Build the (url_name, tokens) candidates once instead of re-zipping the two
# Series for every player row.
url_candidates = list(zip(players_url['url_name'], players_url['url_name_tokens']))


def _first_matching_url_name(player_tokens):
    """Return the first url_name whose tokens contain all of ``player_tokens``, else None."""
    return next(
        (url_name for url_name, tokens in url_candidates
         if subset_match(player_tokens, tokens)),
        None,
    )


players_2022['matched_url_name'] = players_2022['Player_tokens'].apply(_first_matching_url_name)

# Merge datasets based on the matched url names.
# The matcher above selects the FIRST players_url row for a given url_name, so
# only that row is kept on the right-hand side. Without this, every
# players_url row sharing the same url_name would be attached and the player
# row would be duplicated in the result.
unique_players_url = players_url.drop_duplicates(subset='url_name', keep='first')
merged_df = pd.merge(
    players_2022,
    unique_players_url,
    left_on='matched_url_name',
    right_on='url_name',
    how='left',
    validate='many_to_one',  # each player row may match at most one url row
)

# Label players for which no url name matched
merged_df['matched_url_name'] = merged_df['matched_url_name'].fillna('No match found')

# Display the merged DataFrame
print("\nMerged DataFrame:")
print(merged_df[['Player', 'matched_url_name']])

# Print rows with no match found
# print("\nRows with no match found:")
# print(merged_df[merged_df['matched_url_name'] == 'No match found'][['Player', 'matched_url_name']])

# Remove rows with no match found
merged_df = merged_df[merged_df['matched_url_name'] != 'No match found']

# Export the merged DataFrame to CSV
merged_df.to_csv('assets/merged_players_data.csv', index=False)



Merged DataFrame:
                 Player   matched_url_name
0      Brenden Aaronson   brenden-aaronson
1      Yunis Abdelhamid   yunis-abdelhamid
2         Himad Abdelli      himad-abdelli
3     Salis Abdul Samed  salis-abdul-samed
4       Laurent Abergel    laurent-abergel
...                 ...                ...
2700   Szymon Żurkowski   szymon-zurkowski
2701   Szymon Żurkowski   szymon-zurkowski
2702    Martin Ødegaard    martin-odegaard
2703        Milan Đurić        milan-duric
2704      Filip Đuričić      filip-duricic

[2705 rows x 2 columns]
