In [59]:
import pandas as pd
import difflib  
import numpy as np

In [60]:
def convert_string(string):
    # removes symbols "€" and replace "," with "."
    clean_string = string.replace("€", "").strip().replace(",", ".")

    # check if string contains mln or mila and remove them 
    if "mln" in clean_string:
        # remove "mln" and convert the value to millions
        return float(clean_string.replace("mln", "").strip()) * 1000000
    elif "mila" in clean_string:
        # remove "mila" and convert the value to thousans
        return float(clean_string.replace("mila", "").strip()) * 1000
    elif "-" in clean_string:
        return 0
    else:
        return float(clean_string)

In [61]:
def cleaning_player_data(df):
    # leggi il file 

    
    for index, row in df.iterrows():
        name = row['name'].strip().split()
        role = row["role"].strip().split()

        if len(name) + 1 == len(role):
            role_clean = " ".join(role[-1:])
            df.at[index,"role"] = role_clean
        elif len(name) + 2 == len(role):
            role_clean = " ".join(role[-2:])
            df.at[index,"role"] = role_clean
        else:
            role_clean = " ".join(role[-3:])
            df.at[index,"role"] = role_clean
            
    #  apply the function to the whole column
    df['market_value'] = df["market_value"].apply(convert_string)


    return df


In [62]:
def lower_string(word):
    # converts everything to lower case
    word = word.lower()
    return word

In [63]:
def distance_between_string(string1 , string2):
    # converts string to lower case
    lower_string1 = lower_string(string1)
    lower_string2 = lower_string(string2)

    # calculate the difference between string
    d = difflib.Differ()
    diff = list(d.compare(lower_string1,lower_string2))

    # count the number of insertion, deletion or replacement operations
    #distance_count = sum(1 for sim in diff if sim.startswith('-') or sim.startswith('+'))
    distance_count = 0
    # calculate max length between the string
    max_length = max(len(lower_string1), len(lower_string2))

    for i, sim in enumerate(diff):
        if sim.startswith('-') or sim.startswith('+'):  # Modifica, inserzione o cancellazione
            weight = 1.5 if i < 5 else 1  # Pesa di più i primi 3 caratteri
            distance_count += weight  

    # calculate EOD normalized
    normalized_eod = distance_count/max_length 

    return normalized_eod

In [64]:
# find closest string 
def find_closest_string(name, target_string_list):
    if not target_string_list:
        return None
    
    min_normalized = float('inf')
    closest_string = None  

    for name_target in target_string_list:
        normalized_eod = distance_between_string(name,name_target)

        if normalized_eod < min_normalized:
            min_normalized = normalized_eod
            closest_string = name_target

    return closest_string

In [65]:
def find_and_replace_name(team_target, diz):
    team = diz[team_target] 

    return team

In [66]:
def create_diz(df, name_column, target_name):
    diz = {}

    for index,row in df.iterrows():
        home_team = row[name_column]
        team = find_closest_string(home_team, target_name)
        if home_team not in diz:
            diz[home_team] = team
    
    return diz

In [67]:
def create_diz_player(df, name_column, target_name):
    years = [2021,2022,2023,2024]
    diz = {}
    for year in years:
        diz[year] = {}
        for index, row in df[df['season'] == year].iterrows():
            player_name = row[name_column]
            name = find_closest_string(player_name, target_name[target_name['season']==year]['name'].tolist())
            if name not in diz[year]:
                diz[year][player_name] = name 
    return diz

In [68]:
file_player_data = 'dataset/player-team.csv'
file_matches = 'dataset/matches.csv'
file_matches_history = 'dataset/matches_history.csv'
file_lista_team = 'dataset/list-team.csv'
file_odds_per_match = 'dataset/odds_per_match.csv'
file_matches_goal = 'dataset/matches_goal.csv'

df_lista_team = pd.read_csv(file_lista_team)
df_player = pd.read_csv(file_player_data)
odds_per_match = pd.read_csv(file_odds_per_match, sep = ";")
matches  = pd.read_csv(file_matches)
matches_history = pd.read_csv(file_matches_history)
matches_goal = pd.read_csv(file_matches_goal, sep = ";")


In [69]:
# we derive the list of team names taken from the odds data
nomi_squadre = odds_per_match['home_team'].unique().tolist()

# we create a dictionary for each dataframe derived from web scraping where for each team we associate the respective API name, so as to standardize the names 
diz_squadre_player = create_diz(df_player, name_column='team', target_name=nomi_squadre)

diz_lista_squadre = create_diz(df_lista_team, name_column='team', target_name=nomi_squadre)

diz_matches = create_diz(matches,name_column='home_team',target_name=nomi_squadre)

diz_matches_history = create_diz(matches_history, name_column="home_team", target_name=nomi_squadre)

diz_matches_goal = create_diz(matches_goal, name_column='home_team', target_name=nomi_squadre)

In [70]:
target_name_player = df_player[['season','name']].drop_duplicates()
diz_player = create_diz_player(matches_goal,name_column='scorer', target_name=target_name_player)

In [71]:
df_player_data = cleaning_player_data(df_player)
df_player_data['team'] = df_player_data['team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_squadre_player))
df_player_data['age'] = df_player_data['age'].str.extract(r'\((\d+)\)').astype(int)
df_player_data.to_csv('dataset/clean dataset/clean_player_team.csv')

In [72]:
df_lista_team['team'] = df_lista_team['team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_lista_squadre))
df_lista_team['team_value'] = df_lista_team['team_value'].apply(convert_string)
df_lista_team = df_lista_team.drop('link', axis=1)
df_lista_team.to_csv('dataset/clean dataset/clean_list_team.csv')

In [73]:
matches['home_team'] = matches['home_team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_matches))
matches['away_team'] = matches['away_team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_matches))
matches.dropna(subset=['winner'], inplace = True)

In [74]:
matches_history['home_team'] = matches_history['home_team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_matches_history))
matches_history['away_team'] = matches_history['away_team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_matches_history))
matches_history[["round","matchweek"]] = matches_history["matchweek"].str.split("-", expand=True)
matches_history["home_winner"] = matches_history["home_winner"].fillna(False) 
matches_history["away_winner"] = matches_history["away_winner"].fillna(False) 

# Definizione delle condizioni
conditions = [
    (matches_history["home_winner"] == True) & (matches_history["away_winner"] == False),  # Home vince
    (matches_history["home_winner"] == False) & (matches_history["away_winner"] == True),  # Away vince
    (matches_history["home_winner"] == False) & (matches_history["away_winner"] == False)  # Pareggio
]

# Valori da assegnare per ogni condizione
values = ["HOME_WINNER", "AWAY_WINNER", "DRAW"]

# Creazione della nuova colonna 'winner'
matches_history["winner"] = np.select(conditions, values, default="UNKNOWN")  # Se ci sono valori mancanti, restituisce NaN
matches_history = matches_history[["matchweek", "season","date","home_team", "away_team", "winner", "home_goals_halftime", "away_goals_halftime", "home_goals", "away_goals"]]



  matches_history["home_winner"] = matches_history["home_winner"].fillna(False)
  matches_history["away_winner"] = matches_history["away_winner"].fillna(False)


In [75]:
matches = matches.rename(columns={"half_time_home_score":"home_goals_halftime", "half_time_away_score": "away_goals_halftime", "home_score": "home_goals", "away_score": "away_goals"})
matches = matches[["matchweek", "season","date","home_team", "away_team", "winner", "home_goals_halftime", "away_goals_halftime", "home_goals", "away_goals"]]

matches_all = pd.concat([matches_history,matches], ignore_index=True)
matches_all['match_key'] = matches_all['season'].astype(str) + '-' + matches_all['home_team'] + '-' + matches_all['away_team']
matches_all.to_csv('dataset/clean dataset/clean_matches.csv')

In [76]:
matches_goal['home_team'] = matches_goal['home_team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_matches_goal))
matches_goal['away_team'] = matches_goal['away_team'].apply(lambda team_target: find_and_replace_name(team_target, diz=diz_matches_goal))
matches_goal['match_key'] = matches_goal['season'].astype(str) + '-' + matches_goal['home_team'] + '-' + matches_goal['away_team']
matches_goal = matches_goal.drop(['home_team', 'away_team'], axis = 1)

years = [2021,2022,2023,2024]
for year in years:
    matches_goal[matches_goal['season'] == year]['scorer'] = matches_goal[matches_goal['season'] == year]['scorer'].apply(lambda player_target: find_and_replace_name(player_target, diz = diz_player[year]))

matches_goal.to_csv('dataset/clean dataset/clean_matches_goal.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches_goal[matches_goal['season'] == year]['scorer'] = matches_goal[matches_goal['season'] == year]['scorer'].apply(lambda player_target: find_and_replace_name(player_target, diz = diz_player[year]))


In [77]:
odds_per_match[['season', 'hours', 'home_team', 'away_team', 'quota_1','quota_x', 'quota_2']]
odds_per_match.to_csv('dataset/clean dataset/clean_odds_per_match.csv')
