In [2]:
import pandas as pd
import numpy as np
import os

# Load the dataset
df_transfers = pd.read_csv("raw_data/transfers.csv")

# Show first rows
df_transfers.head()

Unnamed: 0,player_id,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name
0,16136,2026-07-01,26/27,417,123,OGC Nice,Retired,,500000.0,Dante
1,1138758,2026-07-01,26/27,336,631,Sporting CP,Chelsea,52140000.0,45000000.0,Geovany Quenda
2,195778,2026-06-30,25/26,79,27,VfB Stuttgart,Bayern Munich,0.0,12000000.0,Alexander Nübel
3,569033,2026-06-30,25/26,39,27,1.FSV Mainz 05,Bayern Munich,0.0,4000000.0,Armindo Sieb
4,626913,2026-06-30,25/26,398,380,Lazio,Salernitana,0.0,15000000.0,Boulaye Dia


In [3]:
# shape of the DataFrame
df_transfers.shape

# data types and missing values
df_transfers.info()

# counting missing values
df_transfers.isna().sum()

# basic statistics
df_transfers.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79646 entries, 0 to 79645
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   player_id            79646 non-null  int64  
 1   transfer_date        79646 non-null  object 
 2   transfer_season      79646 non-null  object 
 3   from_club_id         79646 non-null  int64  
 4   to_club_id           79646 non-null  int64  
 5   from_club_name       79646 non-null  object 
 6   to_club_name         79646 non-null  object 
 7   transfer_fee         51931 non-null  float64
 8   market_value_in_eur  49330 non-null  float64
 9   player_name          79646 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 6.1+ MB


Unnamed: 0,player_id,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name
count,79646.0,79646,79646,79646.0,79646.0,79646,79646,51931.0,49330.0,79646
unique,,4186,34,,,10123,7777,,,10359
top,,2022-07-01,23/24,,,Without Club,Without Club,,,Paulinho
freq,,2385,8516,,,964,1456,,,42
mean,423242.6,,,17158.68822,12902.337556,,,1115650.0,2488055.0,
std,269454.3,,,23567.526887,20482.855861,,,5258424.0,5901403.0,
min,3333.0,,,1.0,1.0,,,0.0,10000.0,
25%,207302.0,,,853.0,589.0,,,0.0,225000.0,
50%,371197.5,,,6646.0,2976.0,,,0.0,600000.0,
75%,597153.0,,,24226.0,14685.0,,,0.0,2000000.0,


In [4]:
# checking for duplicates
print("Duplicates:", df_transfers.duplicated().sum())

# dropping duplicate rows
df_transfers.drop_duplicates(inplace=True)

Duplicates: 0


In [5]:
# checking missing values
missing = df_transfers.isna().sum().sort_values(ascending=False)
print("Missing values:\n", missing[missing > 0])

Missing values:
 market_value_in_eur    30316
transfer_fee           27715
dtype: int64


In [6]:
# filling market values with 0 if missing
df_transfers["market_value_in_eur"] = df_transfers["market_value_in_eur"].fillna(0)
df_transfers["transfer_fee"] = df_transfers["transfer_fee"].fillna(0)

# and then verifying after cleaning
df_transfers.isna().sum().sort_values(ascending=False)

player_id              0
transfer_date          0
transfer_season        0
from_club_id           0
to_club_id             0
from_club_name         0
to_club_name           0
transfer_fee           0
market_value_in_eur    0
player_name            0
dtype: int64

In [7]:
# cleaning column names: lowercase, replace spaces with underscores
df_transfers.columns = (
    df_transfers.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace(r'[^\w_]', '', regex=True)
)

df_transfers.columns  # Check new column names

Index(['player_id', 'transfer_date', 'transfer_season', 'from_club_id',
       'to_club_id', 'from_club_name', 'to_club_name', 'transfer_fee',
       'market_value_in_eur', 'player_name'],
      dtype='object')

In [8]:
# defining which text columns to clean
text_columns = [
    'from_club_name', 'to_club_name', 'player_name'
]

# and cleaning them
for col in text_columns:
    df_transfers[col] = df_transfers[col].astype(str).str.strip().str.title()

In [9]:
# converting dates to datetime
df_transfers['transfer_date'] = pd.to_datetime(df_transfers['transfer_date'], errors='coerce')

# and just to confirm changes
df_transfers.dtypes

player_id                       int64
transfer_date          datetime64[ns]
transfer_season                object
from_club_id                    int64
to_club_id                      int64
from_club_name                 object
to_club_name                   object
transfer_fee                  float64
market_value_in_eur           float64
player_name                    object
dtype: object

In [10]:
import os

# create the directory
os.makedirs("clean_data", exist_ok=True)

In [11]:
df_transfers.to_csv("clean_data/clean_transfers.csv", index=False)

In [12]:
df_transfers.head()

Unnamed: 0,player_id,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name
0,16136,2026-07-01,26/27,417,123,Ogc Nice,Retired,0.0,500000.0,Dante
1,1138758,2026-07-01,26/27,336,631,Sporting Cp,Chelsea,52140000.0,45000000.0,Geovany Quenda
2,195778,2026-06-30,25/26,79,27,Vfb Stuttgart,Bayern Munich,0.0,12000000.0,Alexander Nübel
3,569033,2026-06-30,25/26,39,27,1.Fsv Mainz 05,Bayern Munich,0.0,4000000.0,Armindo Sieb
4,626913,2026-06-30,25/26,398,380,Lazio,Salernitana,0.0,15000000.0,Boulaye Dia
