# Esempio di pulizia (cleaning) dei dati di un dataset tramite Jupyter Notebook
## Librerie utilizzate

In [36]:
import os
import pandas as pd
import numpy as np
from dateutil.parser import parse, ParserError

## Importiamo, per esempio, il dataset `players.csv`

In [37]:
# Allow up to 400 rows to be shown before pandas truncates printed output.
pd.set_option("display.max_rows", 400)
# Load the players dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../DataAnalysis/Datasets/players.csv")
# info() writes the column summary to stdout and returns None, which print() then echoes.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30302 entries, 0 to 30301
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   player_id                             30302 non-null  int64  
 1   first_name                            28337 non-null  object 
 2   last_name                             30302 non-null  object 
 3   name                                  30302 non-null  object 
 4   last_season                           30302 non-null  int64  
 5   current_club_id                       30302 non-null  int64  
 6   player_code                           30302 non-null  object 
 7   country_of_birth                      27613 non-null  object 
 8   city_of_birth                         28099 non-null  object 
 9   country_of_citizenship                29759 non-null  object 
 10  date_of_birth                         30255 non-null  object 
 11  sub_position   

## Osserviamo adesso quali colonne contengono campi vuoti e quanti sono tali campi

In [38]:
# Keep only the column labels for which at least one row is missing a value.
colonne_con_campi_vuoti = df.columns[df.isna().any(axis=0)]
# Header line followed by the Index of affected columns, one per print line.
print("Colonne con campi vuoti:", colonne_con_campi_vuoti, sep="\n")

Colonne con campi vuoti:
Index(['first_name', 'country_of_birth', 'city_of_birth',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'foot',
       'height_in_cm', 'market_value_in_eur', 'highest_market_value_in_eur',
       'contract_expiration_date', 'agent_name'],
      dtype='object')


In [39]:
# Count the missing cells of every affected column in one vectorised pass,
# then report them in the same order as colonne_con_campi_vuoti.
null_counts = df[colonne_con_campi_vuoti].isna().sum()
for colonna, conteggio_campi_vuoti in null_counts.items():
    print(f"Colonna '{colonna}' ha {conteggio_campi_vuoti} campi vuoti.")

Colonna 'first_name' ha 1965 campi vuoti.
Colonna 'country_of_birth' ha 2689 campi vuoti.
Colonna 'city_of_birth' ha 2203 campi vuoti.
Colonna 'country_of_citizenship' ha 543 campi vuoti.
Colonna 'date_of_birth' ha 47 campi vuoti.
Colonna 'sub_position' ha 172 campi vuoti.
Colonna 'foot' ha 2389 campi vuoti.
Colonna 'height_in_cm' ha 2098 campi vuoti.
Colonna 'market_value_in_eur' ha 10919 campi vuoti.
Colonna 'highest_market_value_in_eur' ha 1321 campi vuoti.
Colonna 'contract_expiration_date' ha 11467 campi vuoti.
Colonna 'agent_name' ha 15361 campi vuoti.


## Sostituiamo adesso i valori nulli con opportuni valori di default

In [40]:
# Replace missing values with per-column sentinels in a single pass:
# -1 for numeric columns and a text marker for string columns.
# The original cell filled "market_value_in_eur" twice; the redundant call is
# dropped and all columns are handled by one mapping.
# NOTE(review): "foot" uses the marker "null" while every other text column
# uses "Missing"; the value is kept as-is so the resulting data is unchanged —
# confirm which marker is intended.
df.fillna(
    {
        "foot": "null",
        "market_value_in_eur": -1,
        "sub_position": "Missing",
        "first_name": "Missing",
        "country_of_birth": "Missing",
        "city_of_birth": "Missing",
        "country_of_citizenship": "Missing",
        "height_in_cm": -1,
        "highest_market_value_in_eur": -1,
        "agent_name": "Missing",
    },
    inplace=True,
)
print(df)

       player_id first_name    last_name                name  last_season  \
0            598       Timo   Hildebrand     Timo Hildebrand         2014   
1            670     Martin       Petrov       Martin Petrov         2012   
2           1323     Martin      Amedick      Martin Amedick         2012   
3           3195   Jermaine      Pennant    Jermaine Pennant         2013   
4           3259     Damien         Duff         Damien Duff         2013   
...          ...        ...          ...                 ...          ...   
30297     371851       Jaka        Bijol          Jaka Bijol         2023   
30298     537171     Semuel  Pizzignacco  Semuel Pizzignacco         2018   
30299     586756      Festy      Ebosele       Festy Ebosele         2023   
30300     704692     Nicolò      Cocetta      Nicolò Cocetta         2022   
30301     925584       Axel     Guessand       Axel Guessand         2023   

       current_club_id         player_code country_of_birth  \
0           

## Convertiamo "contract_expiration_date" e "date_of_birth" in formato data, rimpiazzando i valori nulli con la data di default "1900-01-01"

In [41]:
# Missing dates receive the sentinel "1900-01-01 00:00:00" before parsing;
# normalize() then resets the time-of-day component to midnight.
df["contract_expiration_date"] = pd.to_datetime(
    df["contract_expiration_date"].fillna("1900-01-01 00:00:00")
).dt.normalize()
# format='mixed' makes pandas infer the format of each value individually.
# Per the original author's note, date_of_birth contains values written with
# letters, which would otherwise fail to parse alongside the all-numeric sentinel.
df["date_of_birth"] = pd.to_datetime(
    df["date_of_birth"].fillna("1900-01-01 00:00:00"), format="mixed"
).dt.normalize()

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30302 entries, 0 to 30301
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   player_id                             30302 non-null  int64         
 1   first_name                            30302 non-null  object        
 2   last_name                             30302 non-null  object        
 3   name                                  30302 non-null  object        
 4   last_season                           30302 non-null  int64         
 5   current_club_id                       30302 non-null  int64         
 6   player_code                           30302 non-null  object        
 7   country_of_birth                      30302 non-null  object        
 8   city_of_birth                         30302 non-null  object        
 9   country_of_citizenship                30302 non-null  object        
 10

## Convertiamo le colonne nei tipi di dato corretti

In [42]:
# Cast each listed column to its target type, one column at a time.
for column_name, target_type in (("market_value_in_eur", float), ("first_name", str)):
    df[column_name] = df[column_name].astype(target_type)
# Show the resulting dtypes (info() prints the summary and returns None).
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30302 entries, 0 to 30301
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   player_id                             30302 non-null  int64         
 1   first_name                            30302 non-null  object        
 2   last_name                             30302 non-null  object        
 3   name                                  30302 non-null  object        
 4   last_season                           30302 non-null  int64         
 5   current_club_id                       30302 non-null  int64         
 6   player_code                           30302 non-null  object        
 7   country_of_birth                      30302 non-null  object        
 8   city_of_birth                         30302 non-null  object        
 9   country_of_citizenship                30302 non-null  object        
 10

## Puliamo adesso il dataset "Competitions"

In [43]:
# Allow up to 400 rows to be shown before pandas truncates printed output.
pd.set_option("display.max_rows", 400)
# Load the competitions dataset; the path is relative to the notebook's working directory.
dfcompetitions = pd.read_csv("../DataAnalysis/Datasets/competitions.csv")
# info() writes the column summary to stdout and returns None, which print() then echoes.
print(dfcompetitions.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   competition_id        43 non-null     object
 1   competition_code      43 non-null     object
 2   name                  43 non-null     object
 3   sub_type              43 non-null     object
 4   type                  43 non-null     object
 5   country_id            43 non-null     int64 
 6   country_name          36 non-null     object
 7   domestic_league_code  36 non-null     object
 8   confederation         43 non-null     object
 9   url                   43 non-null     object
dtypes: int64(1), object(9)
memory usage: 3.5+ KB
None


### Anche qui osserviamo quali colonne contengono valori nulli

In [44]:
# Find the columns that contain at least one empty cell.
empty_columns_cells = dfcompetitions.columns[dfcompetitions.isna().any(axis=0)]
print("Colonne con campi vuoti:", empty_columns_cells, sep="\n")
# Count the empty cells of each of those columns and report them in order.
competition_null_counts = dfcompetitions[empty_columns_cells].isna().sum()
for col, conteggio_campi_vuoti in competition_null_counts.items():
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti} campi vuoti.")

Colonne con campi vuoti:
Index(['country_name', 'domestic_league_code'], dtype='object')
Colonna 'country_name' ha 7 campi vuoti.
Colonna 'domestic_league_code' ha 7 campi vuoti.


In [45]:
# Both text columns with gaps receive the same "Missing" marker, in one call.
dfcompetitions.fillna(
    {"country_name": "Missing", "domestic_league_code": "Missing"},
    inplace=True,
)
# Show the updated non-null counts, then the frame itself.
print(dfcompetitions.info())
print(dfcompetitions)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   competition_id        43 non-null     object
 1   competition_code      43 non-null     object
 2   name                  43 non-null     object
 3   sub_type              43 non-null     object
 4   type                  43 non-null     object
 5   country_id            43 non-null     int64 
 6   country_name          43 non-null     object
 7   domestic_league_code  43 non-null     object
 8   confederation         43 non-null     object
 9   url                   43 non-null     object
dtypes: int64(1), object(9)
memory usage: 3.5+ KB
None
   competition_id                             competition_code  \
0             CIT                                    italy-cup   
1            NLSC                         johan-cruijff-schaal   
2             GRP                           

## Puliamo adesso il dataset "clubs": osserviamo le colonne con valori nulli e riempiamole con valori generici che indicano la mancanza del dato

In [46]:
# Allow up to 400 rows to be shown before pandas truncates printed output.
pd.set_option("display.max_rows", 400)
# Load the clubs dataset; the path is relative to the notebook's working directory.
dfclub = pd.read_csv("../DataAnalysis/Datasets/clubs.csv")
# info() writes the column summary to stdout and returns None, which print() then echoes.
print(dfclub.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club_id                  426 non-null    int64  
 1   club_code                426 non-null    object 
 2   name                     426 non-null    object 
 3   domestic_competition_id  426 non-null    object 
 4   total_market_value       0 non-null      float64
 5   squad_size               426 non-null    int64  
 6   average_age              388 non-null    float64
 7   foreigners_number        426 non-null    int64  
 8   foreigners_percentage    379 non-null    float64
 9   national_team_players    426 non-null    int64  
 10  stadium_name             426 non-null    object 
 11  stadium_seats            426 non-null    int64  
 12  net_transfer_record      426 non-null    object 
 13  coach_name               0 non-null      float64
 14  last_season              4

In [47]:
# Find the columns that contain at least one empty cell.
empty_columns_cells = dfclub.columns[dfclub.isna().any(axis=0)]

# Header line followed by the Index holding the affected column names.
print("Colonne con campi vuoti:", empty_columns_cells, sep="\n")

# Count the empty cells of each of those columns and report them in order.
club_null_counts = dfclub[empty_columns_cells].isna().sum()
for col, conteggio_campi_vuoti_club in club_null_counts.items():
    print(f"Colonna '{col}' ha {conteggio_campi_vuoti_club} campi vuoti.")

Colonne con campi vuoti:
Index(['total_market_value', 'average_age', 'foreigners_percentage',
       'coach_name'],
      dtype='object')
Colonna 'total_market_value' ha 426 campi vuoti.
Colonna 'average_age' ha 38 campi vuoti.
Colonna 'foreigners_percentage' ha 47 campi vuoti.
Colonna 'coach_name' ha 426 campi vuoti.


In [48]:
# Fill the empty cells with generic sentinels: -1 for numeric columns and
# "Missing" for text columns.
dfclub.fillna(
    {"total_market_value": -1, "average_age": -1, "foreigners_percentage": -1},
    inplace=True,
)
# coach_name is loaded as float64 because every one of its values is missing.
# Writing a string into a float64 column in place relies on implicit dtype
# upcasting, which pandas has deprecated (PDEP-6); cast to object explicitly
# first so the text marker can be stored on any pandas version.
dfclub["coach_name"] = dfclub["coach_name"].astype(object).fillna("Missing")
# Show the updated non-null counts and dtypes, then the frame itself.
print(dfclub.info())
print(dfclub)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club_id                  426 non-null    int64  
 1   club_code                426 non-null    object 
 2   name                     426 non-null    object 
 3   domestic_competition_id  426 non-null    object 
 4   total_market_value       426 non-null    float64
 5   squad_size               426 non-null    int64  
 6   average_age              426 non-null    float64
 7   foreigners_number        426 non-null    int64  
 8   foreigners_percentage    426 non-null    float64
 9   national_team_players    426 non-null    int64  
 10  stadium_name             426 non-null    object 
 11  stadium_seats            426 non-null    int64  
 12  net_transfer_record      426 non-null    object 
 13  coach_name               426 non-null    object 
 14  last_season              4