In [1]:
import pandas as pd
import numpy as np
import os

# Read the raw players table from the project's raw_data folder
df_players = pd.read_csv(filepath_or_buffer="raw_data/players.csv")

# Preview the first five records
df_players.head(5)

Unnamed: 0,player_id,first_name,last_name,name,last_season,current_club_id,player_code,country_of_birth,city_of_birth,country_of_citizenship,...,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur,highest_market_value_in_eur
0,10,Miroslav,Klose,Miroslav Klose,2015,398,miroslav-klose,Poland,Opole,Germany,...,right,184.0,,ASBW Sport Marketing,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/miroslav-klose...,IT1,Società Sportiva Lazio S.p.A.,1000000.0,30000000.0
1,26,Roman,Weidenfeller,Roman Weidenfeller,2017,16,roman-weidenfeller,Germany,Diez,Germany,...,left,190.0,,Neubauer 13 GmbH,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/roman-weidenfe...,L1,Borussia Dortmund,750000.0,8000000.0
2,65,Dimitar,Berbatov,Dimitar Berbatov,2015,1091,dimitar-berbatov,Bulgaria,Blagoevgrad,Bulgaria,...,,,,CSKA-AS-23 Ltd.,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/dimitar-berbat...,GR1,Panthessalonikios Athlitikos Omilos Konstantin...,1000000.0,34500000.0
3,77,,Lúcio,Lúcio,2012,506,lucio,Brazil,Brasília,Brazil,...,,,,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/lucio/profil/s...,IT1,Juventus Football Club,200000.0,24500000.0
4,80,Tom,Starke,Tom Starke,2017,27,tom-starke,East Germany (GDR),Freital,Germany,...,right,194.0,,IFM,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/tom-starke/pro...,L1,FC Bayern München,100000.0,3000000.0


In [3]:
# Quick structural overview of the freshly loaded frame.
# A notebook cell only renders its *last* bare expression, so the intermediate
# results (shape, missing-value counts) are printed explicitly; otherwise they
# would be computed and silently discarded.

# shape of the DataFrame as (rows, columns)
print("Shape:", df_players.shape)

# data types and non-null counts (info() writes to stdout by itself)
df_players.info()

# number of missing values per column
print("Missing values per column:\n", df_players.isna().sum())

# basic statistics for every column; as the final expression of the cell
# this is rendered as a rich table
df_players.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32601 entries, 0 to 32600
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   player_id                             32601 non-null  int64  
 1   first_name                            30539 non-null  object 
 2   last_name                             32601 non-null  object 
 3   name                                  32601 non-null  object 
 4   last_season                           32601 non-null  int64  
 5   current_club_id                       32601 non-null  int64  
 6   player_code                           32601 non-null  object 
 7   country_of_birth                      29802 non-null  object 
 8   city_of_birth                         30146 non-null  object 
 9   country_of_citizenship                32218 non-null  object 
 10  date_of_birth                         32554 non-null  object 
 11  sub_position   

Unnamed: 0,player_id,first_name,last_name,name,last_season,current_club_id,player_code,country_of_birth,city_of_birth,country_of_citizenship,...,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur,highest_market_value_in_eur
count,32601.0,30539,32601,32601,32601.0,32601.0,32601,29802,30146,32218,...,30065,30345.0,20510,16582,32601,32601,32601,32601,31078.0,31078.0
unique,,7030,23795,31892,,,31852,185,8578,183,...,3,,119,2897,26854,32601,14,437,,
top,,David,García,Paulinho,,,paulinho,France,London,Spain,...,right,,2023-06-30 00:00:00,Wasserman,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/miroslav-klose...,TR1,Kilmarnock Football Club,,
freq,,215,68,13,,,13,2337,479,1965,...,21149,,4502,504,5748,1,3220,185,,
mean,348312.6,,,,2019.357259,4815.495813,,,,,...,,182.291943,,,,,,,1639685.0,3789347.0
std,284508.9,,,,3.961564,11583.707786,,,,,...,,7.03703,,,,,,,6407293.0,9887241.0
min,10.0,,,,2012.0,3.0,,,,,...,,17.0,,,,,,,10000.0,10000.0
25%,107973.0,,,,2016.0,403.0,,,,,...,,178.0,,,,,,,100000.0,275000.0
50%,283917.0,,,,2020.0,1063.0,,,,,...,,183.0,,,,,,,250000.0,800000.0
75%,530223.0,,,,2023.0,3060.0,,,,,...,,187.0,,,,,,,700000.0,3000000.0


In [4]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
n_duplicates = df_players.duplicated().sum()
print("Duplicates:", n_duplicates)

# Remove them, keeping the first occurrence. The result is rebound instead of
# using inplace=True so the cell does not mutate an object that earlier cells
# already displayed, and so it can be chained or re-run safely.
df_players = df_players.drop_duplicates()

Duplicates: 0


In [5]:
# Null count per column, ordered from most to least affected
null_counts = df_players.isna().sum()
missing = null_counts.sort_values(ascending=False)

# Only report the columns that actually contain gaps
print("Missing values:\n", missing[missing > 0])

Missing values:
 agent_name                     16019
contract_expiration_date       12091
country_of_birth                2799
foot                            2536
city_of_birth                   2455
height_in_cm                    2256
first_name                      2062
highest_market_value_in_eur     1523
market_value_in_eur             1523
country_of_citizenship           383
sub_position                     180
date_of_birth                     47
dtype: int64


In [6]:
# Handle missing values column by column.

# Drop only rows without a date of birth. `first_name` is deliberately NOT a
# drop criterion: the raw data contains players known by a single name
# (e.g. "Lúcio") whose first_name is empty while last_name and name are
# populated, so dropping on it would discard valid players.
# .copy() makes the result an independent frame before the column assignments
# below (guards against pandas' setting-on-a-copy warnings).
df_players = df_players.dropna(subset=["date_of_birth"]).copy()

# single-name players: represent the absent first name as an empty string
df_players["first_name"] = df_players["first_name"].fillna("")

# filling missing height with median
df_players["height_in_cm"] = df_players["height_in_cm"].fillna(df_players["height_in_cm"].median())

# filling missing 'foot' (e.g., right/left) with mode
df_players["foot"] = df_players["foot"].fillna(df_players["foot"].mode()[0])

# filling missing country / city / agent / sub-position info with "Unknown"
for unknown_col in [
    "country_of_birth",
    "city_of_birth",
    "country_of_citizenship",
    "agent_name",
    "sub_position",
]:
    df_players[unknown_col] = df_players[unknown_col].fillna("Unknown")

# filling market values with 0 if missing
# NOTE(review): 0 is a sentinel here, not a real valuation; it will pull down
# means/medians computed on these columns downstream -- confirm this is intended.
df_players["market_value_in_eur"] = df_players["market_value_in_eur"].fillna(0)
df_players["highest_market_value_in_eur"] = df_players["highest_market_value_in_eur"].fillna(0)

# Parse contract expiration as datetime; missing or unparseable values become
# NaT (errors="coerce") and are then replaced by a far-future placeholder date.
df_players["contract_expiration_date"] = pd.to_datetime(
    df_players["contract_expiration_date"],
    errors="coerce"
).fillna(pd.Timestamp("2099-12-31"))

# and then verifying after cleaning: every column should report 0 missing values
df_players.isna().sum().sort_values(ascending=False)

player_id                               0
position                                0
market_value_in_eur                     0
current_club_name                       0
current_club_domestic_competition_id    0
url                                     0
image_url                               0
agent_name                              0
contract_expiration_date                0
height_in_cm                            0
foot                                    0
sub_position                            0
first_name                              0
date_of_birth                           0
country_of_citizenship                  0
city_of_birth                           0
country_of_birth                        0
player_code                             0
current_club_id                         0
last_season                             0
name                                    0
last_name                               0
highest_market_value_in_eur             0
dtype: int64

In [7]:
# Normalise the headers step by step:
#   1. trim surrounding whitespace and lowercase
#   2. turn spaces into underscores
#   3. strip every character that is not a word character
normalized_columns = df_players.columns.str.strip().str.lower()
normalized_columns = normalized_columns.str.replace(' ', '_')
normalized_columns = normalized_columns.str.replace(r'[^\w_]', '', regex=True)
df_players.columns = normalized_columns

df_players.columns  # Check new column names

Index(['player_id', 'first_name', 'last_name', 'name', 'last_season',
       'current_club_id', 'player_code', 'country_of_birth', 'city_of_birth',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'position',
       'foot', 'height_in_cm', 'contract_expiration_date', 'agent_name',
       'image_url', 'url', 'current_club_domestic_competition_id',
       'current_club_name', 'market_value_in_eur',
       'highest_market_value_in_eur'],
      dtype='object')

In [8]:
# Text columns whose values get surrounding whitespace trimmed
text_columns = [
    'first_name', 'last_name', 'name', 'player_code',
    'country_of_birth', 'city_of_birth', 'country_of_citizenship',
    'position', 'sub_position', 'foot', 'current_club_name',
    'agent_name'
]

# Only short categorical labels are title-cased (e.g. foot: "right" -> "Right").
# Names, places, clubs, agents and the player_code slug are left as provided:
# str.title() would corrupt them, e.g. "FC Bayern München" -> "Fc Bayern München",
# "S.p.A." -> "S.P.A.", "East Germany (GDR)" -> "East Germany (Gdr)",
# and the slug "miroslav-klose" -> "Miroslav-Klose".
title_case_columns = ['position', 'sub_position', 'foot']

# and cleaning them
for col in text_columns:
    original_values = df_players[col]
    cleaned = original_values.astype(str).str.strip()
    if col in title_case_columns:
        cleaned = cleaned.str.title()
    # astype(str) turns missing values into the literal text "nan";
    # restore them as real missing values instead.
    df_players[col] = cleaned.where(original_values.notna())

In [9]:
# Parse both date columns as datetimes; values that cannot be parsed
# become NaT because of errors='coerce'
for date_col in ('date_of_birth', 'contract_expiration_date'):
    df_players[date_col] = pd.to_datetime(df_players[date_col], errors='coerce')

# Inspect the resulting dtypes to confirm the conversion
df_players.dtypes

player_id                                        int64
first_name                                      object
last_name                                       object
name                                            object
last_season                                      int64
current_club_id                                  int64
player_code                                     object
country_of_birth                                object
city_of_birth                                   object
country_of_citizenship                          object
date_of_birth                           datetime64[ns]
sub_position                                    object
position                                        object
foot                                            object
height_in_cm                                   float64
contract_expiration_date                datetime64[ns]
agent_name                                      object
image_url                                       object
url       

In [10]:
# Keep only rows whose height lies strictly between 130 cm and 220 cm
height = df_players['height_in_cm']
plausible_height = height.gt(130) & height.lt(220)
df_players = df_players[plausible_height]

In [12]:
import os

# Make sure the output folder exists; exist_ok=True makes this a no-op
# when the folder is already present
os.makedirs(name="clean_data", exist_ok=True)

In [13]:
# Persist the cleaned players table as CSV, omitting the DataFrame index
clean_players_path = "clean_data/clean_players.csv"
df_players.to_csv(clean_players_path, index=False)