In [2]:
import pandas as pd
import numpy as np
import os

# Read the raw clubs table from its CSV file
raw_clubs_csv = "raw_data/clubs.csv"
df_clubs = pd.read_csv(raw_clubs_csv)

# Preview the first five rows (the same count head() uses by default)
df_clubs.head(n=5)

Unnamed: 0,club_id,club_code,name,domestic_competition_id,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,last_season,filename,url
0,105,sv-darmstadt-98,SV Darmstadt 98,L1,,27,25.6,13,48.1,1,Merck-Stadion am Böllenfalltor,17810,+€3.05m,,2023,../data/raw/transfermarkt-scraper/2023/clubs.j...,https://www.transfermarkt.co.uk/sv-darmstadt-9...
1,11127,ural-ekaterinburg,Ural Yekaterinburg,RU1,,30,26.5,11,36.7,3,Yekaterinburg Arena,23000,+€880k,,2023,../data/raw/transfermarkt-scraper/2023/clubs.j...,https://www.transfermarkt.co.uk/ural-ekaterinb...
2,114,besiktas-istanbul,Beşiktaş Jimnastik Kulübü,TR1,,30,26.6,15,50.0,8,Beşiktaş Park,42445,€-25.26m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/besiktas-istan...
3,12,as-rom,Associazione Sportiva Roma,IT1,,26,26.3,18,69.2,17,Olimpico di Roma,70634,€-76.90m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/as-rom/startse...
4,148,tottenham-hotspur,Tottenham Hotspur Football Club,GB1,,30,25.5,21,70.0,18,Tottenham Hotspur Stadium,62850,€-120.05m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/tottenham-hots...


In [3]:
# Overview of the raw clubs table.
# A notebook cell only renders its final expression, so the intermediate
# results are printed explicitly; as bare expressions the shape and the
# per-column missing counts were computed and then silently discarded.

# shape of the DataFrame as (rows, columns)
print("Shape:", df_clubs.shape)

# data types and non-null counts (info() prints its own report)
df_clubs.info()

# counting missing values per column
print("Missing values per column:")
print(df_clubs.isna().sum())

# basic statistics for every column, numeric and non-numeric alike;
# being the last expression, this table is rendered as the cell output
df_clubs.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club_id                  439 non-null    int64  
 1   club_code                439 non-null    object 
 2   name                     439 non-null    object 
 3   domestic_competition_id  439 non-null    object 
 4   total_market_value       0 non-null      float64
 5   squad_size               439 non-null    int64  
 6   average_age              401 non-null    float64
 7   foreigners_number        439 non-null    int64  
 8   foreigners_percentage    390 non-null    float64
 9   national_team_players    439 non-null    int64  
 10  stadium_name             439 non-null    object 
 11  stadium_seats            439 non-null    int64  
 12  net_transfer_record      439 non-null    object 
 13  coach_name               0 non-null      float64
 14  last_season              4

Unnamed: 0,club_id,club_code,name,domestic_competition_id,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,last_season,filename,url
count,439.0,439,439,439,0.0,439.0,401.0,439.0,390.0,439.0,439,439.0,439,0.0,439.0,439,439
unique,,439,439,14,,,,,,,420,,305,,,13,439
top,,sv-darmstadt-98,SV Darmstadt 98,TR1,,,,,,,Valeriy Lobanovsky Stadion,,+-0,,,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/sv-darmstadt-9...
freq,,1,1,42,,,,,,,2,,100,,,237,1
mean,6283.476082,,,,,24.18451,25.616459,11.186788,46.582564,4.751708,,24204.091116,,,2021.526196,,
std,14588.760049,,,,,8.692365,2.059485,6.807645,20.274257,4.896634,,17230.554384,,,3.563978,,
min,3.0,,,,,0.0,0.0,0.0,2.4,0.0,,0.0,,,2012.0,,
25%,435.5,,,,,24.0,24.6,6.0,30.8,1.0,,10815.0,,,2020.0,,
50%,1158.0,,,,,26.0,25.8,12.0,48.05,3.0,,20087.0,,,2024.0,,
75%,3575.0,,,,,29.0,26.8,16.0,62.4,7.0,,32392.0,,,2024.0,,


In [4]:
# Report how many rows are exact duplicates of an earlier row
print("Duplicates:", df_clubs.duplicated().sum())

# Drop the duplicate rows by rebinding the name instead of using inplace=True:
# the frame object that earlier cells displayed is left unmodified, and the
# statement stays chainable.
df_clubs = df_clubs.drop_duplicates()

Duplicates: 0


In [5]:
# Tally NaNs per column, order the tally from most to least incomplete,
# and report only the columns that actually have gaps
nan_totals = df_clubs.isna().sum()
missing = nan_totals.sort_values(ascending=False)
incomplete = missing[missing > 0]
print("Missing values:\n", incomplete)

Missing values:
 total_market_value       439
coach_name               439
foreigners_percentage     49
average_age               38
dtype: int64


In [6]:
# Keep only the relevant columns.
# .copy() makes the result an independent DataFrame instead of a selection
# derived from the wide frame, so assigning to its columns afterwards does not
# raise pandas' SettingWithCopyWarning.
df_clubs = df_clubs[
    ["club_id", "club_code", "name", "domestic_competition_id", "last_season"]
].copy()

In [7]:
# Normalise the column labels step by step:
#   1. trim surrounding whitespace and lowercase
#   2. turn spaces into underscores
#   3. drop every character that is not a word character or underscore
tidy_labels = df_clubs.columns.str.strip().str.lower()
tidy_labels = tidy_labels.str.replace(' ', '_')
tidy_labels = tidy_labels.str.replace(r'[^\w_]', '', regex=True)
df_clubs.columns = tidy_labels

df_clubs.columns  # Display the resulting column labels

Index(['club_id', 'club_code', 'name', 'domestic_competition_id',
       'last_season'],
      dtype='object')

In [8]:
# defining which text columns to clean
text_columns = [
    'name'
]

# Trim surrounding whitespace and title-case each listed column.
# NOTE(review): str.title() also lowercases the rest of every word, so
# acronyms are altered (the notebook output shows "SV Darmstadt 98" becoming
# "Sv Darmstadt 98") - confirm this is the intended normalisation.
for col in text_columns:
    cleaned = df_clubs[col].astype(str).str.strip().str.title()
    # astype(str) turns missing values into the literal text "nan" (then
    # title-cased to "Nan"); put the original missing values back.
    cleaned = cleaned.where(df_clubs[col].notna())
    # assign() returns a new frame with the column replaced in its existing
    # position, avoiding a SettingWithCopyWarning when df_clubs was produced
    # by selecting columns from another frame.
    df_clubs = df_clubs.assign(**{col: cleaned})

In [9]:
# Confirm the column dtypes after the cleaning steps; as the last expression
# in the cell, the dtypes Series is rendered as the cell output.
df_clubs.dtypes

club_id                     int64
club_code                  object
name                       object
domestic_competition_id    object
last_season                 int64
dtype: object

In [10]:
import os

# Create the output directory. exist_ok=True means no error is raised when
# "clean_data" already exists, so the cell can be re-run safely.
os.makedirs("clean_data", exist_ok=True)

In [None]:
# Write the cleaned clubs table to CSV; index=False leaves the row index out
# of the file so only the selected columns are stored.
df_clubs.to_csv("clean_data/j_clean_clubs.csv", index=False)

In [12]:
# Final look at the first rows of the cleaned table
df_clubs.head()

Unnamed: 0,club_id,club_code,name,domestic_competition_id,last_season
0,105,sv-darmstadt-98,Sv Darmstadt 98,L1,2023
1,11127,ural-ekaterinburg,Ural Yekaterinburg,RU1,2023
2,114,besiktas-istanbul,Beşiktaş Jimnastik Kulübü,TR1,2024
3,12,as-rom,Associazione Sportiva Roma,IT1,2024
4,148,tottenham-hotspur,Tottenham Hotspur Football Club,GB1,2024
