In [2]:
import pandas as pd
import numpy as np
import os

# Read the raw appearances table from disk into a DataFrame
source_csv = "raw_data/appearances.csv"
df_appearances = pd.read_csv(source_csv)

# Preview the first five rows (left as the last expression so the notebook renders it)
df_appearances.head(n=5)

Unnamed: 0,appearance_id,game_id,player_id,player_club_id,player_current_club_id,date,player_name,competition_id,yellow_cards,red_cards,goals,assists,minutes_played
0,2231978_38004,2231978,38004,853,235,2012-07-03,Aurélien Joachim,CLQ,0,0,2,0,90
1,2233748_79232,2233748,79232,8841,2698,2012-07-05,Ruslan Abyshov,ELQ,0,0,0,0,90
2,2234413_42792,2234413,42792,6251,465,2012-07-05,Sander Puri,ELQ,0,0,0,0,45
3,2234418_73333,2234418,73333,1274,6646,2012-07-05,Vegar Hedenstad,ELQ,0,0,0,0,90
4,2234421_122011,2234421,122011,195,3008,2012-07-05,Markus Henriksen,ELQ,0,0,0,1,90


In [3]:
# Quick structural overview of the appearances table.
# Only the final expression of a notebook cell is rendered automatically, so the
# intermediate results are printed explicitly; as bare expressions they would be
# computed and then silently discarded.

# (rows, columns)
print("Shape:", df_appearances.shape)

# column dtypes and memory footprint (info() prints its report directly)
df_appearances.info()

# number of missing values per column
print("Missing values per column:")
print(df_appearances.isna().sum())

# summary statistics for all columns (include='all' covers non-numeric ones too);
# kept as the last expression so the notebook renders it as a table
df_appearances.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1706806 entries, 0 to 1706805
Data columns (total 13 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   appearance_id           object
 1   game_id                 int64 
 2   player_id               int64 
 3   player_club_id          int64 
 4   player_current_club_id  int64 
 5   date                    object
 6   player_name             object
 7   competition_id          object
 8   yellow_cards            int64 
 9   red_cards               int64 
 10  goals                   int64 
 11  assists                 int64 
 12  minutes_played          int64 
dtypes: int64(9), object(4)
memory usage: 169.3+ MB


Unnamed: 0,appearance_id,game_id,player_id,player_club_id,player_current_club_id,date,player_name,competition_id,yellow_cards,red_cards,goals,assists,minutes_played
count,1706806,1706806.0,1706806.0,1706806.0,1706806.0,1706806,1706800,1706806,1706806.0,1706806.0,1706806.0,1706806.0,1706806.0
unique,1706806,,,,,3738,25141,43,,,,,
top,2231978_38004,,,,,2020-10-04,Danilo,IT1,,,,,
freq,1,,,,,1795,1115,140729,,,,,
mean,,3166096.0,209974.5,3138.29,3991.235,,,,0.1472329,0.00378309,0.09589432,0.0754954,69.06125
std,,665119.6,194400.8,8315.815,10847.06,,,,0.3653799,0.06139039,0.3309349,0.2855614,29.99313
min,,2211607.0,10.0,1.0,-1.0,,,,0.0,0.0,0.0,0.0,1.0
25%,,2589007.0,58402.0,289.0,331.0,,,,0.0,0.0,0.0,0.0,45.0
50%,,3080787.0,149577.0,826.0,903.0,,,,0.0,0.0,0.0,0.0,90.0
75%,,3621597.0,306668.0,2441.0,2696.0,,,,0.0,0.0,0.0,0.0,90.0


In [4]:
# Count rows that are exact repeats of an earlier row
n_duplicates = df_appearances.duplicated().sum()
print("Duplicates:", n_duplicates)

# Remove the repeats, keeping the first occurrence; the frame is modified in place
df_appearances.drop_duplicates(keep="first", inplace=True)

Duplicates: 0


In [5]:
# Per-column count of missing entries, largest first
missing = df_appearances.isnull().sum().sort_values(ascending=False)

# Report only the columns that actually contain gaps
has_gaps = missing.gt(0)
print("Missing values:\n", missing.loc[has_gaps])

Missing values:
 player_name    6
dtype: int64


In [6]:
# Normalise column labels step by step:
# trim surrounding whitespace and lowercase
normalized_columns = df_appearances.columns.str.strip().str.lower()
# spaces become underscores
normalized_columns = normalized_columns.str.replace(' ', '_')
# drop any character that is not a word character or underscore
normalized_columns = normalized_columns.str.replace(r'[^\w_]', '', regex=True)

df_appearances.columns = normalized_columns

# Show the resulting labels
df_appearances.columns

Index(['appearance_id', 'game_id', 'player_id', 'player_club_id',
       'player_current_club_id', 'date', 'player_name', 'competition_id',
       'yellow_cards', 'red_cards', 'goals', 'assists', 'minutes_played'],
      dtype='object')

In [7]:
# Text columns whose values should be whitespace-trimmed and title-cased
text_columns = [
    'player_name'
]

# Normalise each text column while keeping missing entries missing.
# astype(str) turns NaN into the literal string 'nan' (which title() would then turn
# into 'Nan'), so the original missing-value mask is re-applied after cleaning.
# NOTE(review): str.title() also lower-cases letters inside a word (e.g. 'McX' -> 'Mcx');
# confirm that is acceptable for player names.
for col in text_columns:
    is_present = df_appearances[col].notna()
    cleaned = df_appearances[col].astype(str).str.strip().str.title()
    df_appearances[col] = cleaned.where(is_present)

In [8]:
# Convert the 'date' column to datetime values.
# errors='coerce' replaces unparseable entries with NaT instead of raising, so the
# number of values that were present before parsing but are missing afterwards is
# reported; otherwise bad dates would be dropped silently.
date_was_present = df_appearances['date'].notna()
df_appearances['date'] = pd.to_datetime(df_appearances['date'], errors='coerce')
n_unparsed = int((date_was_present & df_appearances['date'].isna()).sum())
print("Dates coerced to NaT:", n_unparsed)

# confirm the resulting column dtypes
df_appearances.dtypes

appearance_id                     object
game_id                            int64
player_id                          int64
player_club_id                     int64
player_current_club_id             int64
date                      datetime64[ns]
player_name                       object
competition_id                    object
yellow_cards                       int64
red_cards                          int64
goals                              int64
assists                            int64
minutes_played                     int64
dtype: object

In [9]:
import os  # repeated import; kept so this cell can run on its own

# Make sure the output folder is present; exist_ok=True avoids an error if it already exists
output_dir = "clean_data"
os.makedirs(output_dir, exist_ok=True)

In [10]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file
clean_csv = "clean_data/clean_appearances.csv"
df_appearances.to_csv(clean_csv, index=False)