In [4]:
import pandas as pd
import difflib  


In [5]:
def lower_string(word):
    """Return ``word`` lower-cased with surrounding whitespace removed.

    Any value that is not a ``str`` is mapped to the empty string.
    """
    return word.lower().strip() if isinstance(word, str) else ""

In [6]:

def distance_between_string(string1, string2):
    """Return a length-normalized, character-level difference between two strings.

    Both inputs are first normalized with ``lower_string`` (lower-cased and
    stripped; non-strings become ""). The two strings are then compared
    character by character with ``difflib.Differ`` and every deleted ('-')
    or inserted ('+') character is counted.

    Parameters:
        string1: First value to compare.
        string2: Second value to compare.

    Returns:
        float: Number of inserted/deleted characters divided by the length of
        the longer normalized string. 0.0 means the normalized strings are
        identical. A substituted character shows up as one deletion plus one
        insertion, so the value can exceed 1.0.
    """
    normalized1 = lower_string(string1)
    normalized2 = lower_string(string2)

    # Two empty strings (e.g. two non-string inputs) are identical; returning
    # early also avoids dividing by a maximum length of zero below.
    longest = max(len(normalized1), len(normalized2))
    if longest == 0:
        return 0.0

    # Differ treats each character as one "line"; deleted entries start with
    # '-' and inserted entries with '+', unchanged ones with a space.
    diff = difflib.Differ().compare(normalized1, normalized2)
    edit_count = sum(1 for entry in diff if entry.startswith(('-', '+')))

    return edit_count / longest


In [7]:
# find closest string 
def find_closest_string(target, string_list):
    """Find the entry of ``string_list`` closest to ``target``.

    Closeness is measured with ``distance_between_string``; on ties the first
    entry with the smallest distance wins.

    Parameters:
        target: The string to match.
        string_list: Candidate strings to compare against.

    Returns:
        tuple: ``(closest_string, distance)``. When ``string_list`` is empty
        the result is ``(None, float('inf'))``, so callers that unpack the
        result into two names work for every input.
    """
    if not string_list:
        # Keep the same two-element shape as the normal return value; a bare
        # None here would break `name, dist = find_closest_string(...)`.
        return None, float('inf')

    scored = [
        (distance_between_string(target, candidate), candidate)
        for candidate in string_list
    ]
    # min() keeps the first of several equally small distances.
    best_distance, best_candidate = min(scored, key=lambda pair: pair[0])

    return best_candidate, best_distance

In [8]:
def unique_id(column):
    """Print whether all values of ``column`` are unique.

    ``column`` must expose an ``is_unique`` attribute (e.g. a pandas Series).
    Nothing is returned; the result is only printed.
    """
    message = "Gli ID sono unici." if column.is_unique else "Gli ID non sono unici."
    print(message)


### count_nulls_per_column
The function counts how many null values (NaN) are present in each column of a DataFrame.

In [9]:
def count_nulls_per_column(df):
    """
    Count the null values in every column of a DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame to analyse.

    Returns:
        pd.DataFrame: One row per column of ``df`` with two columns,
                      'Column' (the column name) and 'Null Count'
                      (how many null values it holds). The summary is
                      also printed before being returned.
    """
    nulls_by_column = df.isna().sum()
    summary = pd.DataFrame(
        {
            'Column': nulls_by_column.index,
            'Null Count': nulls_by_column.to_numpy(),
        }
    )
    print(summary)
    return summary


In [10]:
# Load the cleaned CSV files; each path sits next to the frame read from it.

# Match results
file_matches = 'dataset/clean dataset/clean_matches.csv'
api_matches = pd.read_csv(file_matches)

# Team list
file_lista_team = 'dataset/clean dataset/clean_list_team.csv'
scraping_lista_team = pd.read_csv(file_lista_team)

# Players per team
file_player_team = 'dataset/clean dataset/clean_player_team.csv'
scraping_player_team = pd.read_csv(file_player_team)

# Odds per match
file_odds_team = 'dataset/clean dataset/clean_odds_per_match.csv'
scraping_odds_team = pd.read_csv(file_odds_team)

# Goals per match
file_matches_goal = 'dataset/clean dataset/clean_matches_goal.csv'
matches_goal = pd.read_csv(file_matches_goal)

1. Verify the uniqueness of match keys. Print or return a list of duplicate or non-unique IDs.
2. Find the closest API name for each name obtained from scraping. The function `find_closest_string()` returns the closest API name (`team_api`) and the normalized distance (`dist`), calculated with `distance_between_string()`.


In [11]:
unique_id(api_matches['match_key'])



#### SYNTACTIC ACCURACY
# For each scraped team name, show the closest home-team name in the API matches.
api_teams = api_matches['home_team'].unique().tolist()
scraping_team = scraping_lista_team['team'].unique().tolist()

for team in scraping_team:
    team_api, dist = find_closest_string(team, api_teams)
    # One print, newline-separated: banner, scraped name, API name, distance,
    # each followed by a blank line.
    print(
        "##########################",
        "",
        team,
        "",
        team_api,
        "",
        f"distanza:  {dist}",
        "",
        sep="\n",
    )


Gli ID non sono unici.
##########################

Juventus

Juventus

distanza:  0.0

##########################

Inter

Inter

distanza:  0.0

##########################

Milan

Milan

distanza:  0.0

##########################

Napoli

Napoli

distanza:  0.0

##########################

Atalanta

Atalanta

distanza:  0.0

##########################

Roma

Roma

distanza:  0.0

##########################

Fiorentina

Fiorentina

distanza:  0.0

##########################

Lazio

Lazio

distanza:  0.0

##########################

Sassuolo

Sassuolo

distanza:  0.0

##########################

Torino

Torino

distanza:  0.0

##########################

Bologna

Bologna

distanza:  0.0

##########################

Verona

Verona

distanza:  0.0

##########################

Udinese

Udinese

distanza:  0.0

##########################

Cagliari

Cagliari

distanza:  0.0

##########################

Sampdoria

Sampdoria

distanza:  0.0

##########################

Empoli

Empoli

distanza:

In [12]:
# For each home team in the odds data, show the closest home-team name in the API matches.
odds_team = scraping_odds_team['home_team'].unique().tolist()

for team in odds_team:
    team_api, dist = find_closest_string(team, api_teams)
    # One print, newline-separated: banner, odds-data name, API name, distance,
    # each followed by a blank line.
    print(
        "##########################",
        "",
        team,
        "",
        team_api,
        "",
        f"distanza:  {dist}",
        "",
        sep="\n",
    )


##########################

Atalanta

Atalanta

distanza:  0.0

##########################

Empoli

Empoli

distanza:  0.0

##########################

Lazio

Lazio

distanza:  0.0

##########################

Torino

Torino

distanza:  0.0

##########################

Udinese

Udinese

distanza:  0.0

##########################

Venezia

Venezia

distanza:  0.0

##########################

Milan

Milan

distanza:  0.0

##########################

Bologna

Bologna

distanza:  0.0

##########################

Como

Como

distanza:  0.0

##########################

Napoli

Napoli

distanza:  0.0

##########################

Roma

Roma

distanza:  0.0

##########################

Cagliari

Cagliari

distanza:  0.0

##########################

Fiorentina

Fiorentina

distanza:  0.0

##########################

Inter

Inter

distanza:  0.0

##########################

Juventus

Juventus

distanza:  0.0

##########################

Lecce

Lecce

distanza:  0.0

##########################

Mo

In [13]:
# Null count per column of the player/team table. The function prints the
# summary and also returns it, so the notebook shows it a second time as the
# cell result.
count_nulls_per_column(scraping_player_team)

         Column  Null Count
0    Unnamed: 0           0
1          name           0
2          role           0
3           age           0
4  market_value           0
5          team           0
6        season           0


Unnamed: 0,Column,Null Count
0,Unnamed: 0,0
1,name,0
2,role,0
3,age,0
4,market_value,0
5,team,0
6,season,0


### Check columns with null values


In [17]:

##### COMPLETENESS
# Null counts for every column of the match data.
count_nulls_per_column(api_matches)

# Display the rows whose matchweek is missing.
missing_matchweek = api_matches['matchweek'].isna()
api_matches[missing_matchweek]



                 Column  Null Count
0            Unnamed: 0           0
1             matchweek           1
2                season           0
3                  date           0
4             home_team           0
5             away_team           0
6                winner           0
7   home_goals_halftime           0
8   away_goals_halftime           0
9            home_goals           0
10           away_goals           0
11            match_key           0


Unnamed: 0.1,Unnamed: 0,matchweek,season,date,home_team,away_team,winner,home_goals_halftime,away_goals_halftime,home_goals,away_goals,match_key
760,760,,2022,2023-06-11,Spezia,Verona,AWAY_WINNER,1,3,1,3,2022-Spezia-Verona


We can see one row with a null `matchweek` value. I checked why: this match is a playoff to stay in Serie A. We can delete this row.

In [15]:
# Null count per column of the team-list table. The function prints the
# summary and also returns it, so the notebook shows it a second time as the
# cell result.
count_nulls_per_column(scraping_lista_team)

         Column  Null Count
0    Unnamed: 0           0
1          team           0
2     team_size           0
3       avg_age           0
4  n_foreigners           0
5    team_value           0
6        season           0


Unnamed: 0,Column,Null Count
0,Unnamed: 0,0
1,team,0
2,team_size,0
3,avg_age,0
4,n_foreigners,0
5,team_value,0
6,season,0


In [16]:
# Display the match rows whose matchweek is missing.
api_matches.loc[api_matches['matchweek'].isna()]

Unnamed: 0.1,Unnamed: 0,matchweek,season,date,home_team,away_team,winner,home_goals_halftime,away_goals_halftime,home_goals,away_goals,match_key
760,760,,2022,2023-06-11,Spezia,Verona,AWAY_WINNER,1,3,1,3,2022-Spezia-Verona
