#Esempio di cleaning dei dati tramite sostituzione dei dati nulli con dati standardizzati dentro a un dataset tramite jupyter notebook
##Librerie utilizzate

In [104]:
import os
import pandas as pd
import numpy as np
from dateutil.parser import parse, ParserError

##Importiamo, per esempio, il dataset players.csv

In [105]:
df = pd.read_csv('../DataAnalysis/Datasets/players.csv')
pd.options.display.max_rows = 400
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30302 entries, 0 to 30301
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   player_id                             30302 non-null  int64  
 1   first_name                            28337 non-null  object 
 2   last_name                             30302 non-null  object 
 3   name                                  30302 non-null  object 
 4   last_season                           30302 non-null  int64  
 5   current_club_id                       30302 non-null  int64  
 6   player_code                           30302 non-null  object 
 7   country_of_birth                      27613 non-null  object 
 8   city_of_birth                         28099 non-null  object 
 9   country_of_citizenship                29759 non-null  object 
 10  date_of_birth                         30255 non-null  object 
 11  sub_position   

##Osserviamo adesso quali colonne contengono campi vuoti al loro interno e la quantità di tali campi

In [106]:
colonne_con_campi_vuoti = df.columns[df.isnull().any()]
print("Colonne con campi vuoti:")
print(colonne_con_campi_vuoti)

Colonne con campi vuoti:
Index(['first_name', 'country_of_birth', 'city_of_birth',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'foot',
       'height_in_cm', 'market_value_in_eur', 'highest_market_value_in_eur',
       'contract_expiration_date', 'agent_name'],
      dtype='object')


In [107]:
for colonna in colonne_con_campi_vuoti:
    conteggio_campi_vuoti = df[colonna].isnull().sum()
    print(f"Colonna '{colonna}' ha {conteggio_campi_vuoti} campi vuoti.")

Colonna 'first_name' ha 1965 campi vuoti.
Colonna 'country_of_birth' ha 2689 campi vuoti.
Colonna 'city_of_birth' ha 2203 campi vuoti.
Colonna 'country_of_citizenship' ha 543 campi vuoti.
Colonna 'date_of_birth' ha 47 campi vuoti.
Colonna 'sub_position' ha 172 campi vuoti.
Colonna 'foot' ha 2389 campi vuoti.
Colonna 'height_in_cm' ha 2098 campi vuoti.
Colonna 'market_value_in_eur' ha 10919 campi vuoti.
Colonna 'highest_market_value_in_eur' ha 1321 campi vuoti.
Colonna 'contract_expiration_date' ha 11467 campi vuoti.
Colonna 'agent_name' ha 15361 campi vuoti.


##Cambiamo adesso i valori nulli con i corretti valori di default

In [108]:
df.fillna({"foot": "null"}, inplace=True)
df.fillna({"market_value_in_eur": -1}, inplace=True)
df.fillna({"sub_position": "Missing"}, inplace=True)
df.fillna({"first_name": "Missing"}, inplace=True)
df.fillna({"country_of_birth": "Missing"}, inplace=True)
df.fillna({"city_of_birth": "Missing"}, inplace=True)
df.fillna({"country_of_citizenship": "Missing"}, inplace=True)
df.fillna({"height_in_cm": -1}, inplace=True)
df.fillna({"market_value_in_eur": -1}, inplace=True)
df.fillna({"highest_market_value_in_eur": -1}, inplace=True)
df.fillna({"agent_name": "Missing"}, inplace=True)
print(df)

       player_id first_name    last_name                name  last_season  \
0            598       Timo   Hildebrand     Timo Hildebrand         2014   
1            670     Martin       Petrov       Martin Petrov         2012   
2           1323     Martin      Amedick      Martin Amedick         2012   
3           3195   Jermaine      Pennant    Jermaine Pennant         2013   
4           3259     Damien         Duff         Damien Duff         2013   
...          ...        ...          ...                 ...          ...   
30297     371851       Jaka        Bijol          Jaka Bijol         2023   
30298     537171     Semuel  Pizzignacco  Semuel Pizzignacco         2018   
30299     586756      Festy      Ebosele       Festy Ebosele         2023   
30300     704692     Nicolò      Cocetta      Nicolò Cocetta         2022   
30301     925584       Axel     Guessand       Axel Guessand         2023   

       current_club_id         player_code country_of_birth  \
0           

##Cambiamo il formato della data di "contract_expiration_date" e "date_of_birth" rimpiazzando i valori "null" con la data di default "1900-01-01"

In [109]:
df.fillna({"contract_expiration_date": "1900-01-01 00:00:00"}, inplace=True)
df["contract_expiration_date"] = pd.to_datetime(df["contract_expiration_date"]).dt.normalize()
#utilizzo il formato mixed poichè date of birth presenta valori letterali nel dataset, che impedirebbero la pulitura senza tale specifica dato che sto inserendo solo caratteri numerici
df.fillna({"date_of_birth": "1900-01-01 00:00:00"}, inplace=True)
df["date_of_birth"] = pd.to_datetime(df["date_of_birth"], format = 'mixed').dt.normalize()

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30302 entries, 0 to 30301
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   player_id                             30302 non-null  int64         
 1   first_name                            30302 non-null  object        
 2   last_name                             30302 non-null  object        
 3   name                                  30302 non-null  object        
 4   last_season                           30302 non-null  int64         
 5   current_club_id                       30302 non-null  int64         
 6   player_code                           30302 non-null  object        
 7   country_of_birth                      30302 non-null  object        
 8   city_of_birth                         30302 non-null  object        
 9   country_of_citizenship                30302 non-null  object        
 10

##Convertiamo tipi di oggetti nei relativi tipi corretti

In [110]:
df["market_value_in_eur"] = df["market_value_in_eur"].astype(float)
df["first_name"] = df["first_name"].astype(str)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30302 entries, 0 to 30301
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   player_id                             30302 non-null  int64         
 1   first_name                            30302 non-null  object        
 2   last_name                             30302 non-null  object        
 3   name                                  30302 non-null  object        
 4   last_season                           30302 non-null  int64         
 5   current_club_id                       30302 non-null  int64         
 6   player_code                           30302 non-null  object        
 7   country_of_birth                      30302 non-null  object        
 8   city_of_birth                         30302 non-null  object        
 9   country_of_citizenship                30302 non-null  object        
 10

#Adesso ricapitoliamo il tutto in unica funzione, def save_cleaned_data, che andrà a prendere il nostro dataset, lo pulirà, convertirà i tipi di oggetti nei relativi tipi corretti e lo salverà nella cartella "cleanDataset"

In [111]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, df in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "competitions":
            df = pd.read_csv('../DataAnalysis/Datasets/players.csv')

            # Logica per la pulizia dei dataframes
            #tramite il metodo fillna riempio i valori vuoti con valori generici, -1 in caso di valori numerici e "Missing" per valori testuali
            df.fillna({"foot": "null"}, inplace=True)
            df.fillna({"market_value_in_eur": -1}, inplace=True)
            df.fillna({"sub_position": "Missing"}, inplace=True)
            df.fillna({"first_name": "Missing"}, inplace=True)
            df.fillna({"country_of_birth": "Missing"}, inplace=True)
            df.fillna({"city_of_birth": "Missing"}, inplace=True)
            df.fillna({"country_of_citizenship": "Missing"}, inplace=True)
            df.fillna({"height_in_cm": -1}, inplace=True)
            df.fillna({"market_value_in_eur": -1}, inplace=True)
            df.fillna({"highest_market_value_in_eur": -1}, inplace=True)
            df.fillna({"agent_name": "Missing"}, inplace=True)
            df.fillna({"contract_expiration_date": "1900-01-01 00:00:00"}, inplace=True)
            df["contract_expiration_date"] = pd.to_datetime(df["contract_expiration_date"]).dt.normalize()
            df.fillna({"date_of_birth": "1900-01-01 00:00:00"}, inplace=True)
            df["date_of_birth"] = pd.to_datetime(df["date_of_birth"], format = 'mixed').dt.normalize()
            df["market_value_in_eur"] = df["market_value_in_eur"].astype(float)
            df["first_name"] = df["first_name"].astype(str)

        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        df.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe players pulito
dataframes = {
    "players": df

}

#salvo il dataframe players pulito
save_cleaned_data(dataframes)

##Puliamo adesso il dataset "Competitions"

In [112]:
dfcompetitions = pd.read_csv('../DataAnalysis/Datasets/competitions.csv')
pd.options.display.max_rows = 400
print(dfcompetitions.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   competition_id        43 non-null     object
 1   competition_code      43 non-null     object
 2   name                  43 non-null     object
 3   sub_type              43 non-null     object
 4   type                  43 non-null     object
 5   country_id            43 non-null     int64 
 6   country_name          36 non-null     object
 7   domestic_league_code  36 non-null     object
 8   confederation         43 non-null     object
 9   url                   43 non-null     object
dtypes: int64(1), object(9)
memory usage: 3.5+ KB
None


###Anche qui osserviamo quali colonne contengono valori nulli

In [113]:
# Trovo le colonne con celle vuote
empty_columns_cells = dfcompetitions.columns[dfcompetitions.isnull().any()]
print("Colonne con campi vuoti:")
print(empty_columns_cells)
# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti = dfcompetitions[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti} campi vuoti.")

Colonne con campi vuoti:
Index(['country_name', 'domestic_league_code'], dtype='object')
Colonna 'country_name' ha 7 campi vuoti.
Colonna 'domestic_league_code' ha 7 campi vuoti.


In [114]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfcompetitions in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "competitions":
            dfcompetitions = pd.read_csv('../DataAnalysis/Datasets/competitions.csv')

            # Logica per la pulizia dei dataframes
            #tramite il metodo fillna riempio i valori vuoti con valori generici, -1 in caso di valori numerici e "Missing" per valori testuali
            dfcompetitions.fillna({"country_name": "Missing"}, inplace=True)
            dfcompetitions.fillna({"domestic_league_code": "Missing"}, inplace=True)

        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfcompetitions.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe competitions pulito
dataframes = {
    "competitions": dfcompetitions

}

#salvo il dataframe competitions pulito
save_cleaned_data(dataframes)

##Now let's clean the dataset "clubs", osserviamo le colonne con valori nulli e andiamo a riempirle con dati generici indicanti appunto la mancanza di dati

In [115]:
dfclub = pd.read_csv('../DataAnalysis/Datasets/clubs.csv')
pd.options.display.max_rows = 400
print(dfclub.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club_id                  426 non-null    int64  
 1   club_code                426 non-null    object 
 2   name                     426 non-null    object 
 3   domestic_competition_id  426 non-null    object 
 4   total_market_value       0 non-null      float64
 5   squad_size               426 non-null    int64  
 6   average_age              388 non-null    float64
 7   foreigners_number        426 non-null    int64  
 8   foreigners_percentage    379 non-null    float64
 9   national_team_players    426 non-null    int64  
 10  stadium_name             426 non-null    object 
 11  stadium_seats            426 non-null    int64  
 12  net_transfer_record      426 non-null    object 
 13  coach_name               0 non-null      float64
 14  last_season              4

In [116]:
# Trovo le colonne con celle vuote
empty_columns_cells = dfclub.columns[dfclub.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)  # List of column names

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_club = dfclub[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_club} campi vuoti.")

Colonne con campi vuoti:
Index(['total_market_value', 'average_age', 'foreigners_percentage',
       'coach_name'],
      dtype='object')
Colonna 'total_market_value' ha 426 campi vuoti.
Colonna 'average_age' ha 38 campi vuoti.
Colonna 'foreigners_percentage' ha 47 campi vuoti.
Colonna 'coach_name' ha 426 campi vuoti.


In [117]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfclub in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "clubs":
            dfclub = pd.read_csv('../DataAnalysis/Datasets/clubs.csv')

            # Logica per la pulizia dei dataframes
            #tramite il metodo fillna riempio i valori vuoti con valori generici, -1 in caso di valori numerici e "Missing" per valori testuali
            dfclub.fillna({"total_market_value": -1}, inplace=True)
            dfclub.fillna({"average_age": -1}, inplace=True)
            dfclub.fillna({"foreigners_percentage": -1}, inplace=True)
            dfclub.fillna({"coach_name": "Missing"}, inplace=True)

        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfclub.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe clubs pulito
dataframes = {
    "clubs": dfclub

}

#salvo il dataframe clubs pulito
save_cleaned_data(dataframes)

##puliamo il dataset players_valuations

In [118]:
dfeval = pd.read_csv('../DataAnalysis/Datasets/player_valuations.csv')
pd.options.display.max_rows = 400
print(dfeval.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32939 entries, 0 to 32938
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   player_id                            32939 non-null  int64  
 1   last_season                          32939 non-null  int64  
 2   datetime                             32938 non-null  object 
 3   date                                 32938 non-null  object 
 4   dateweek                             32938 non-null  object 
 5   market_value_in_eur                  32938 non-null  float64
 6   n                                    32938 non-null  float64
 7   current_club_id                      32938 non-null  float64
 8   player_club_domestic_competition_id  32938 non-null  object 
dtypes: float64(3), int64(2), object(4)
memory usage: 2.3+ MB
None


In [119]:
# Trovo le colonne con celle vuote
empty_columns_cells = dfeval.columns[dfeval.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)  # List of column names

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_eval = dfeval[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_eval} campi vuoti.")

Colonne con campi vuoti:
Index(['datetime', 'date', 'dateweek', 'market_value_in_eur', 'n',
       'current_club_id', 'player_club_domestic_competition_id'],
      dtype='object')
Colonna 'datetime' ha 1 campi vuoti.
Colonna 'date' ha 1 campi vuoti.
Colonna 'dateweek' ha 1 campi vuoti.
Colonna 'market_value_in_eur' ha 1 campi vuoti.
Colonna 'n' ha 1 campi vuoti.
Colonna 'current_club_id' ha 1 campi vuoti.
Colonna 'player_club_domestic_competition_id' ha 1 campi vuoti.


In [120]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfeval in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "player_valuations":
            dfeval = pd.read_csv('../DataAnalysis/Datasets/player_valuations.csv')

            # Logica per la pulizia dei dataframes
            #tramite il metodo fillna riempio i valori vuoti con valori generici, -1 in caso di valori numerici e "Missing" per valori testuali
            dfeval.fillna({"n": 0}, inplace=True)
            dfeval.fillna({"market_value_in_eur": -1}, inplace=True)
            dfeval.fillna({"current_club_id": "Missing"}, inplace=True)
            dfeval.fillna({"player_club_domestic_competition_id": "Missing"}, inplace=True)
            dfeval.fillna({"date": "1900-01-01"}, inplace=True)
            dfeval["date"] = pd.to_datetime(dfeval["date"], format = 'mixed').dt.normalize()
            dfeval.fillna({"datetime": "1900-01-01 00:00:00"}, inplace=True)
            dfeval["datetime"] = pd.to_datetime(dfeval["datetime"], format = 'mixed').dt.normalize()
            dfeval.fillna({"dateweek": "1900-01-01"}, inplace=True)
            dfeval["dateweek"] = pd.to_datetime(dfeval["dateweek"], format = 'mixed').dt.normalize()

        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfeval.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe player_valuations pulito
dataframes = {
    "player_valuations": dfeval

}

#salvo il dataframe player_valuations pulito
save_cleaned_data(dataframes)

#puliamo adesso i dataset presenti sul server con driver MongoDB, in ordine appearances, club_games, game_events, game_lineups e games

In [121]:
dfapp = pd.read_csv('../DataAnalysis/Datasets/appearances.csv')
pd.options.display.max_rows = 400
print(dfapp.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29974 entries, 0 to 29973
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   appearance_id           29974 non-null  object 
 1   game_id                 29974 non-null  int64  
 2   player_id               29974 non-null  int64  
 3   player_club_id          29974 non-null  int64  
 4   player_current_club_id  29974 non-null  int64  
 5   date                    29974 non-null  object 
 6   player_name             29974 non-null  object 
 7   competition_id          29974 non-null  object 
 8   yellow_cards            29974 non-null  int64  
 9   red_cards               29974 non-null  int64  
 10  goals                   29974 non-null  int64  
 11  assists                 29974 non-null  int64  
 12  minutes_played          29973 non-null  float64
dtypes: float64(1), int64(8), object(4)
memory usage: 3.0+ MB
None


In [122]:
# Trovo le colonne con celle vuote
empty_columns_cells = dfapp.columns[dfapp.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)  # List of column names

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_app = dfapp[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_app} campi vuoti.")

Colonne con campi vuoti:
Index(['minutes_played'], dtype='object')
Colonna 'minutes_played' ha 1 campi vuoti.


In [123]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfapp in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "appearances":
            dfapp = pd.read_csv('../DataAnalysis/Datasets/appearances.csv')

            # Logica per la pulizia dei dataframes
            dfapp.fillna({"minutes_played": "Missing"}, inplace=True)

        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfapp.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe appearances pulito
dataframes = {
    "appearances": dfapp

}

#salvo il dataframe appearances pulito
save_cleaned_data(dataframes)

##puliamo il dataset club_games

In [124]:
dfcluga = pd.read_csv('../DataAnalysis/Datasets/club_games.csv')
pd.options.display.max_rows = 400
print(dfcluga.info())

# Trovo le colonne con celle vuote
empty_columns_cells = dfcluga.columns[dfcluga.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)  # List of column names

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_cluga = dfcluga[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_cluga} campi vuoti.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130432 entries, 0 to 130431
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   game_id                130432 non-null  int64  
 1   club_id                130432 non-null  int64  
 2   own_goals              130432 non-null  int64  
 3   own_position           91524 non-null   float64
 4   own_manager_name       128948 non-null  object 
 5   opponent_id            130432 non-null  int64  
 6   opponent_goals         130432 non-null  int64  
 7   opponent_position      91524 non-null   float64
 8   opponent_manager_name  128948 non-null  object 
 9   hosting                130432 non-null  object 
 10  is_win                 130432 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 10.9+ MB
None
Colonne con campi vuoti:
Index(['own_position', 'own_manager_name', 'opponent_position',
       'opponent_manager_name'],
      dtyp

In [125]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfcluga in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "club_games":
            dfcluga = pd.read_csv('../DataAnalysis/Datasets/club_games.csv')

        # Logica per la pulizia dei dataframes
            dfcluga.fillna({"own_position": "Missing"}, inplace=True)
            dfcluga.fillna({"own_manager_name": "Missing"}, inplace=True)
            dfcluga.fillna({"opponent_position": "Missing"}, inplace=True)
            dfcluga.fillna({"opponent_manager_name": "Missing"}, inplace=True)

        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfcluga.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe club_games pulito
dataframes = {
    "club_games": dfcluga

}

#salvo il dataframe club_games pulito
save_cleaned_data(dataframes)

#puliamo il dataset game_events

In [126]:
dfevents = pd.read_csv('../DataAnalysis/Datasets/game_events.csv')
pd.options.display.max_rows = 400
print(dfevents.info())

# Trovo le colonne con celle vuote
empty_columns_cells = dfevents.columns[dfevents.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)  # List of column names

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_events = dfevents[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_events} campi vuoti.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21018 entries, 0 to 21017
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   game_event_id     21018 non-null  object 
 1   date              21018 non-null  object 
 2   game_id           21018 non-null  int64  
 3   minute            21018 non-null  int64  
 4   type              21018 non-null  object 
 5   club_id           21018 non-null  int64  
 6   player_id         21017 non-null  float64
 7   description       21017 non-null  object 
 8   player_in_id      9386 non-null   float64
 9   player_assist_id  3980 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 1.6+ MB
None
Colonne con campi vuoti:
Index(['player_id', 'description', 'player_in_id', 'player_assist_id'], dtype='object')
Colonna 'player_id' ha 1 campi vuoti.
Colonna 'description' ha 1 campi vuoti.
Colonna 'player_in_id' ha 11632 campi vuoti.
Colonna 'player_assist_i

In [127]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfevents in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "game_events":
            dfevents = pd.read_csv('../DataAnalysis/Datasets/game_events.csv')

        # Logica per la pulizia dei dataframes
        dfevents.fillna({"player_id": "Missing"}, inplace=True)
        dfevents.fillna({"description": "Missing"}, inplace=True)
        dfevents.fillna({"player_in_id": "Missing"}, inplace=True)
        dfevents.fillna({"player_assist_id": "Missing"}, inplace=True)
        
        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfevents.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe game_events pulito
dataframes = {
    "game_events": dfevents

}

#salvo il dataframe game_events pulito
save_cleaned_data(dataframes)

##puliamo il dataset game_lineups

In [128]:
dflineup = pd.read_csv('../DataAnalysis/Datasets/game_lineups.csv')
pd.options.display.max_rows = 400
print(dfcluga.info())

# Trovo le colonne con celle vuote
empty_columns_cells = dflineup.columns[dflineup.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)  # List of column names

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_lineup = dflineup[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_lineup} campi vuoti.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130432 entries, 0 to 130431
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   game_id                130432 non-null  int64  
 1   club_id                130432 non-null  int64  
 2   own_goals              130432 non-null  int64  
 3   own_position           91524 non-null   float64
 4   own_manager_name       128948 non-null  object 
 5   opponent_id            130432 non-null  int64  
 6   opponent_goals         130432 non-null  int64  
 7   opponent_position      91524 non-null   float64
 8   opponent_manager_name  128948 non-null  object 
 9   hosting                130432 non-null  object 
 10  is_win                 130432 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 10.9+ MB
None
Colonne con campi vuoti:
Index([], dtype='object')


##In questo singolare caso, il datset game_lineups non ha alcun campo vuoto. Possiamo procedere con la pulizia dell'ultimo dataset in esame, games

In [129]:
dfgames = pd.read_csv('../DataAnalysis/Datasets/games.csv')
pd.options.display.max_rows = 400
print(dfgames.info())

# Trovo le colonne con celle vuote
empty_columns_cells = dfgames.columns[dfgames.isnull().any()]

print("Colonne con campi vuoti:")
print(empty_columns_cells)

# Conto le celle vuote per ogni colonna
for col in empty_columns_cells:
    conteggio_campi_vuoti_games = dfgames[col].isnull().sum()
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_games} campi vuoti.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65216 entries, 0 to 65215
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   game_id                 65216 non-null  int64  
 1   competition_id          65216 non-null  object 
 2   season                  65216 non-null  int64  
 3   round                   65216 non-null  object 
 4   date                    65216 non-null  object 
 5   home_club_id            65216 non-null  int64  
 6   away_club_id            65216 non-null  int64  
 7   home_club_goals         65216 non-null  int64  
 8   away_club_goals         65216 non-null  int64  
 9   home_club_position      45762 non-null  float64
 10  away_club_position      45762 non-null  float64
 11  home_club_manager_name  64474 non-null  object 
 12  away_club_manager_name  64474 non-null  object 
 13  stadium                 65005 non-null  object 
 14  attendance              55704 non-null

##Andiamo quindi a pulire il dataset e a salvarlo nella cartella cleanDatasets

In [130]:
def save_cleaned_data(dataframes, output_folder="cleanDatasets"):
    """
    Saves cleaned DataFrames to separate CSV files in the specified output folder.

    Args:
        dataframes (dict): A dictionary mapping DataFrame names (keys) to DataFrames (values).
        output_folder (str, optional): The path to the output folder. Defaults to "cleanDatasets".

    Raises:
        OSError: If there's an issue creating the output directory (if it doesn't exist).
    """

    # Creo la directory di output se non esiste
    output_dir = os.path.join(os.getcwd(), output_folder)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            raise OSError(f"Could not create output directory: {output_dir}")

    # pulisco il dataframe
    for name, dfgames in dataframes.items():
        # Leggi l'originale ( assumendo che sia un file .csv
        if name == "games":
            dfgames = pd.read_csv('../DataAnalysis/Datasets/games.csv')

        # Logica per pulire il dataframe "games"
        dfgames.fillna({"home_club_position": "Missing"}, inplace=True)
        dfgames.fillna({"away_club_position": "Missing"}, inplace=True)
        dfgames.fillna({"home_club_manager_name": "Missing"}, inplace=True)
        dfgames.fillna({"away_club_manager_name": "Missing"}, inplace=True)
        dfgames.fillna({"stadium": "Missing"}, inplace=True)
        dfgames.fillna({"attendance": -1}, inplace=True)
        dfgames.fillna({"referee": "Missing"}, inplace=True)
        dfgames.fillna({"home_club_formation": "Missing"}, inplace=True)
        dfgames.fillna({"away_club_formation": "Missing"}, inplace=True)
        dfgames.fillna({"home_club_name": "Missing"}, inplace=True)
        dfgames.fillna({"away_club_name": "Missing"}, inplace=True)


        # Salvo il dataframe pulito
        output_file = os.path.join(output_dir, f"{name}.csv")
        dfgames.to_csv(output_file, index=False)  # Salva senza indice

#definisco il dataframe games pulito
dataframes = {
    "games": dfgames

}

#definisco il dataframe games pulito
save_cleaned_data(dataframes)

Adesso possiamo fare l'analisi dei dati, precedentemente fatta utilizzando dataset con celle mancanti andando a sopperire a tali mancanze usando direttamente i metodi di fillna e dropna nella stessa, con i dataset puliti. 