### Importations et configuration du chemin racine du projet

In [156]:

import pandas as pd  # pour la manipulation de DataFrames
import os             # pour gérer les chemins et interactions système
import yaml           # pour lire et écrire des fichiers YAML
from typing import Dict   # pour typer les dictionnaires dans les fonctions
from pathlib import Path  # pour manipuler les chemins de fichiers de manière portable

# Définir le chemin racine du projet
# Ici on prend le dossier parent du répertoire courant
PROJECT_ROOT = Path.cwd().parent

# Afficher le chemin racine pour vérification
PROJECT_ROOT


WindowsPath('c:/Users/Admin/Documents/Ashley/A Simplon/Week 4 (15 dec - 19 dec)/Projet ETL Football/Brief-2-ETL-de-donn-es-footballistiques-Wickets-Sprinters-1')

### Fonctions utilisées pour l’extraction de données

In [157]:
def fct_load_config(config_filename: str = "config.yaml") -> dict:
    """
    Goal:
        Function to load configuration parameters from a YAML file.
    Parameters:
        config_filename (str): Relative or absolute path to the YAML file.
    Returns:
        dict: A dictionary containing the configuration parameters.
    """

    config_path = Path(config_filename)

    # Si le chemin est relatif, on le résout depuis la racine du projet
    if not config_path.is_absolute():
        project_root = Path(__file__).resolve().parents[1]
        config_path = project_root / config_path

    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    return config

def fct_read_csv(root_file: str) -> pd.DataFrame:
    """
    Goal:
        Function to read a CSV file and return a pandas DataFrame.
    Parameters:
        root_file (str): Relative or absolute path to the CSV file.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV file.
    """
    seps = [',', ';', '|', '\t']

    file_path = Path(root_file)

    if not file_path.is_absolute():
        try:
            # Cas script Python
            project_root = Path(__file__).resolve().parents[1]
        except NameError:
            # Cas notebook Jupyter
            project_root = Path.cwd().parent

        file_path = project_root / file_path

    if not file_path.exists():
        print(f"Erreur : fichier {file_path} introuvable")
        return pd.DataFrame()

    for sep in seps:
        try:
            df = pd.read_csv(
                file_path,
                sep=sep,
                encoding="utf-8",
                skipinitialspace=True
            )

            # Si plus d'une colonne → bon séparateur
            if df.shape[1] > 1:
                return df

        except Exception:
            continue

    print(f"Aucun séparateur valide trouvé pour {file_path}")
    return pd.DataFrame()

def fct_add_prefix_to_df(df:pd.DataFrame, prefix:str) -> pd.DataFrame:
    """
    Goal:
        Function to add a prefix to all column names in a DataFrame.
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        prefix (str): The prefix to add to each column name.
    Returns:
        pd.DataFrame: The DataFrame with updated column names.
    """
    for col in df.columns:
        df.rename(columns={col: f"{prefix}_{col}"}, inplace=True)
    return df


In [158]:
#Récupération de la configuration
config_path = os.path.join(Path.cwd().parent, 'config.yaml')
config = fct_load_config(config_path)

# Extract

### Extraction à partir du fichier : "matches_19302010.csv"

In [159]:
root_csv_2010 = config['root_csv_2010']

# Lecture des données du fichier csv dans des DataFrames
df_2010 = fct_read_csv(root_csv_2010)

In [160]:
df_2010.shape

(7299, 8)

### Visualisation des dataframes

In [161]:
# Affichage des premières lignes du DataFrame
df_2010.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year
0,1930-URUGUAY,GROUP_STAGE,4-1 (3-0),France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930
1,1930-URUGUAY,GROUP_STAGE,3-0 (2-0),USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930
2,1930-URUGUAY,GROUP_STAGE,2-1 (2-0),Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930
3,1930-URUGUAY,GROUP_STAGE,3-1 (1-0),Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930
4,1930-URUGUAY,GROUP_STAGE,1-0 (0-0),Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930


### Fonction finale d'extraction

In [162]:

#utiliser la fonction fct_read_csv pour lire le fichier CSV et obtenir le DataFrame
# df_2010 = fct_read_csv(root_csv_2010)

# Transform

### Information sur les données 

In [163]:
df_infos = df_2010.copy()
df_infos['year'] = pd.to_datetime(df_infos['year'], errors='coerce', utc=False).dt.year
# Calculer date début et date fin
date_debut = df_infos['year'].min()
date_fin = df_infos['year'].max()

print(f"Date début des matchs : {date_debut}")
print(f"Date fin des matchs : {date_fin}")

Date début des matchs : 1970
Date fin des matchs : 1970


### Début des transformations

In [164]:
#Visualiser le DataFrame
df_2010.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year
0,1930-URUGUAY,GROUP_STAGE,4-1 (3-0),France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930
1,1930-URUGUAY,GROUP_STAGE,3-0 (2-0),USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930
2,1930-URUGUAY,GROUP_STAGE,2-1 (2-0),Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930
3,1930-URUGUAY,GROUP_STAGE,3-1 (1-0),Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930
4,1930-URUGUAY,GROUP_STAGE,1-0 (0-0),Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930


In [165]:
#valeurs nulles 
print(f"\nValeurs nulles dans df_2010:\n{df_2010.isnull().sum()}")


Valeurs nulles dans df_2010:
edition    0
round      0
score      0
team1      0
team2      0
url        0
venue      0
year       0
dtype: int64


In [166]:
# Nombre des valeurs dupliquées
number_duplicates = df_2010.duplicated().sum()

print(f"\nNombre des valeurs dupliquées: {number_duplicates}\n")
print(df_2010[df_2010.duplicated(keep = False)])

# supprimer les doublons
df_2010 = df_2010.drop_duplicates()

#verifier la suppression
print(f"\nNombre des valeurs dupliquées après suppression: {df_2010.duplicated().sum()}")


Nombre des valeurs dupliquées: 0

Empty DataFrame
Columns: [edition, round, score, team1, team2, url, venue, year]
Index: []

Nombre des valeurs dupliquées après suppression: 0


In [167]:
# Vérifier les types de colonnes : 
print("\nTypes de colonnes dans df_2010:")
print(df_2010.dtypes)


Types de colonnes dans df_2010:
edition    object
round      object
score      object
team1      object
team2      object
url        object
venue      object
year        int64
dtype: object


In [168]:
#creer colonnes 'home_result' et 'away_result'
df_2010['score'] = df_2010['score'].str.slice(0, 3)
df_2010[['home_result','away_result']] = df_2010['score'].str.split('-', expand=True)
df_2010.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year,home_result,away_result
0,1930-URUGUAY,GROUP_STAGE,4-1,France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,4,1
1,1930-URUGUAY,GROUP_STAGE,3-0,USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,3,0
2,1930-URUGUAY,GROUP_STAGE,2-1,Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,2,1
3,1930-URUGUAY,GROUP_STAGE,3-1,Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,3,1
4,1930-URUGUAY,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,1,0


In [169]:
#convertir les colonnes 'home_result' et 'away_result' en type Int64
df_2010['home_result'] = (
    df_2010['home_result']
    .astype("string")
    .str.extract(r"(\d+)", expand=False)
    .astype("Int64").fillna(0)
)
df_2010['away_result'] = (
    df_2010['away_result']
    .astype("string")
    .str.extract(r"(\d+)", expand=False)
    .astype("Int64").fillna(0)
)
df_2010.dtypes

edition        object
round          object
score          object
team1          object
team2          object
url            object
venue          object
year            int64
home_result     Int64
away_result     Int64
dtype: object

In [170]:
# verifier les valeurs uniques dans la colonne 'home_result'
print(df_2010['home_result'].unique().tolist())
print(df_2010['away_result'].unique().tolist())

[4, 3, 2, 1, 6, 0, 9, 5, 7, 11, 8, 12, 10, 13, 19, 16, 31]
[1, 0, 3, 2, 6, 4, 9, 5, 8, 7]


In [171]:
df_2010.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year,home_result,away_result
0,1930-URUGUAY,GROUP_STAGE,4-1,France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,4,1
1,1930-URUGUAY,GROUP_STAGE,3-0,USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,3,0
2,1930-URUGUAY,GROUP_STAGE,2-1,Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,2,1
3,1930-URUGUAY,GROUP_STAGE,3-1,Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,3,1
4,1930-URUGUAY,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,1,0


In [172]:
#rename columns pour etre homogène avec les autres datasets
dict_columns_2010 = config['dict_columns_2010']
df_2010 = df_2010.rename(columns=dict_columns_2010)
df_2010.head()

Unnamed: 0,edition,stage,score,home_team,away_team,url,city,date,home_result,away_result
0,1930-URUGUAY,GROUP_STAGE,4-1,France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,4,1
1,1930-URUGUAY,GROUP_STAGE,3-0,USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,3,0
2,1930-URUGUAY,GROUP_STAGE,2-1,Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,2,1
3,1930-URUGUAY,GROUP_STAGE,3-1,Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,3,1
4,1930-URUGUAY,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,1,0


In [None]:
#home_team et away_team, garder que le nom de pays en Anglais
df_2010[['home_team','home_team_lanorig']] = df_2010['home_team'].str.split('(', expand=True)
df_2010[['away_team','away_team_lanorig']] = df_2010['away_team'].str.split('(', expand=True)
df_2010['home_team'] = df_2010['home_team'].replace('FRG ', 'Germany ')
df_2010['away_team'] = df_2010['away_team'].replace('FRG ', 'Germany ')
df_2010['home_team'] = df_2010['home_team'].replace('GDR ', 'Germany ')
df_2010['away_team'] = df_2010['away_team'].replace('GDR ', 'Germany ')
df_2010['away_team'].unique()

array(['Mexico ', 'Belgium ', 'Brazil ', 'Peru ', 'France', 'Bolivia',
       'Paraguay', 'Romania ', 'Chile', 'USA', 'Yugoslavia ', 'Argentina',
       'Estonia ', 'Sweden ', 'Switzerland ', 'Czechoslovakia ', 'Cuba',
       'Portugal', 'Germany ', 'Palestine ', 'Spain ', 'Greece ',
       'Hungary ', 'Egypt ', 'Irish Free State ', 'Bulgaria ',
       'Netherlands ', 'Austria ', 'Finland ', 'Lithuania ', 'Latvia ',
       'Norway ', 'Luxembourg ', 'Poland ', 'Dutch East Indies ',
       'Ireland ', 'Israel ', 'Scotland', 'England', 'Wales ',
       'Northern Ireland ', 'Syria ', 'Italy ', 'Saarland', 'Haiti ',
       'Germany', 'Turkey ', 'South Korea ', 'Japan ', 'Uruguay',
       'Denmark ', 'Costa Rica', 'Guatemala', 'Curaçao', 'China ',
       'Sudan ', 'Iceland ', 'Indonesia', 'Colombia', 'Canada',
       'Soviet Union ', 'Nigeria', 'Ghana', 'Honduras', 'Dutch Antilles ',
       'Tunisia ', 'Morocco ', 'Cyprus ', 'Dutch Guyana ', 'Ecuador',
       'Ethiopia ', 'Albania ', 'Jamaic

In [174]:
# Convertir la colonne 'date' en format YYYY en string 
df_2010["date"] = df_2010["date"].astype("string")

#garder que l'année pour 'edition'
df_2010['edition'] =df_2010['date'].astype(int)

df_2010.head()


Unnamed: 0,edition,stage,score,home_team,away_team,url,city,date,home_result,away_result,home_team_lanorig,away_team_lanorig
0,1930,GROUP_STAGE,4-1,France,Mexico,1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,4,1,,México)
1,1930,GROUP_STAGE,3-0,USA,Belgium,1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,3,0,,België)
2,1930,GROUP_STAGE,2-1,Yugoslavia,Brazil,1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,2,1,Југославија),Brasil)
3,1930,GROUP_STAGE,3-1,Romania,Peru,1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,3,1,România),Perú)
4,1930,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,1,0,,


In [175]:
df_2010['date'].unique()

<StringArray>
['1930', '1933', '1934', '1937', '1938', '1949', '1950', '1953', '1954',
 '1956', '1957', '1958', '1960', '1961', '1962', '1964', '1965', '1966',
 '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1976', '1977',
 '1978', '1980', '1981', '1982', '1984', '1985', '1986', '1988', '1989',
 '1990', '1992', '1993', '1994', '1996', '1997', '1998', '2000', '2001',
 '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
 '2011', '2012', '2013', '2014']
Length: 58, dtype: string

In [176]:
df_2010.dtypes

edition                       int64
stage                        object
score                        object
home_team                    object
away_team                    object
url                          object
city                         object
date                 string[python]
home_result                   Int64
away_result                   Int64
home_team_lanorig            object
away_team_lanorig            object
dtype: object

In [177]:
#supprimer '.' dans colonne 'city'
df_2010['city'] = df_2010['city'].str.replace('.', '', regex=False)
df_2010.head()

Unnamed: 0,edition,stage,score,home_team,away_team,url,city,date,home_result,away_result,home_team_lanorig,away_team_lanorig
0,1930,GROUP_STAGE,4-1,France,Mexico,1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo,1930,4,1,,México)
1,1930,GROUP_STAGE,3-0,USA,Belgium,1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo,1930,3,0,,België)
2,1930,GROUP_STAGE,2-1,Yugoslavia,Brazil,1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo,1930,2,1,Југославија),Brasil)
3,1930,GROUP_STAGE,3-1,Romania,Peru,1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo,1930,3,1,România),Perú)
4,1930,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo,1930,1,0,,


In [178]:
# verfier les valeurs uniques dans la colonne 'stage'
df_2010["stage"].unique()


array(['GROUP_STAGE', '1/2_FINAL', '_FINAL', 'PRELIMINARY-Europe',
       'PRELIMINARY-N/C.America', 'PRELIMINARY-N.E.', 'FIRST',
       '1/4_FINAL', 'PLACES_3&4', 'PRELIMINARY-Eur./N.E.',
       'PRELIMINARY-S.America', 'FINAL_ROUND', 'PRELIMINARY-Eu./Afr.',
       'PRELIMINARY-Asia', 'PRELIMINARY-Afr./As.', 'PRELIMINARY-Euro/As.',
       'PRELIMINARY-E./Afr./As.', 'PRELIMINARY-Af./As./O.',
       'PRELIMINARY-Africa', 'PRELIMINARY-As./O.', 'SEMIFINAL_STAGE',
       'QUARTERFINAL_STAGE', 'PRELIMINARY-O./As.', '1/8_FINAL',
       'PRELIMINARY-Oceania'], dtype=object)

In [179]:
# Harminser la colonne 'stage' avec les valeurs définies dans le fichier config.yaml
stage_net = config['stage_mapping_2010']
df_2010["stage"] = df_2010["stage"].replace(stage_net)

In [180]:
# verfier les valeurs uniques dans la colonne 'stage'
df_2010["stage"].unique()

array(['group_stage', 'semi_finals', 'final', 'preliminary', 'first',
       'quarter_finals', 'third_place', 'round_of_16'], dtype=object)

In [181]:
#garder que des colonnes nescessaires
columns_to_keep = config['columns_to_keep_2010']

df_2010 = df_2010[columns_to_keep]
print(df_2010.dtypes)
df_2010.head()


date           string[python]
home_team              object
away_team              object
home_result             Int64
away_result             Int64
stage                  object
edition                 int64
city                   object
dtype: object


Unnamed: 0,date,home_team,away_team,home_result,away_result,stage,edition,city
0,1930,France,Mexico,4,1,group_stage,1930,Montevideo
1,1930,USA,Belgium,3,0,group_stage,1930,Montevideo
2,1930,Yugoslavia,Brazil,2,1,group_stage,1930,Montevideo
3,1930,Romania,Peru,3,1,group_stage,1930,Montevideo
4,1930,Argentina,France,1,0,group_stage,1930,Montevideo


### fonction finale de transformations

In [182]:
def fct_transform_2010(df : pd.DataFrame , config : Dict) -> pd.DataFrame:
    """
    Goal:
        Fonction qui transforme le DataFrame du dataset 2010 selon les étapes définies.
    Parameters:
        df (pd.DataFrame): The input DataFrame for the 2010 dataset.
        config (str): Path to the configuration YAML file.
    Returns:
        pd.DataFrame: The transformed DataFrame.
    """
    
    # supprimer les doublons
    df = df.drop_duplicates()
    
    #creer colonnes 'home_result' et 'away_result'
    df['score'] = df['score'].str.slice(0, 3)
    df[['home_result','away_result']] = df['score'].str.split('-', expand=True)

    #convertir les colonnes 'home_result' et 'away_result' en type Int64
    df['home_result'] = (
        df['home_result']
        .astype("string")
        .str.extract(r"(\d+)", expand=False)
        .astype("Int64").fillna(0)
    )
    df['away_result'] = (
        df['away_result']
        .astype("string")
        .str.extract(r"(\d+)", expand=False)
        .astype("Int64").fillna(0)
    )
    
    #rename columns pour etre homogène avec les autres datasets
    dict_columns_2010 = config['dict_columns_2010']
    df = df.rename(columns=dict_columns_2010)
    
    #home_team et away_team, garder que le nom de pays en Anglais
    df[['home_team','home_team_lanorig']] = df['home_team'].str.split('(', expand=True)
    df[['away_team','away_team_lanorig']] = df['away_team'].str.split('(', expand=True)
    
    # Convertir la colonne 'date' en format YYYYMMDDhhmmss
    df["date"] = df["date"].astype(str)+"9999999999"
    
    #garder que l'année pour 'edition'
    df['edition'] =(df['edition'].str.slice(0, 4)).astype(int)
    df["date"] = df["date"].astype(int)
    
    #supprimer '.' dans colonne 'city'
    df['city'] = df['city'].str.replace('.', '', regex=False)
    
    # Harminser la colonne 'stage' avec les valeurs définies dans le fichier config.yaml
    stage_net = config['stage_mapping_2010']
    df["stage"] = df["stage"].replace(stage_net)
    
    #garder que des colonnes nescessaires
    columns_to_keep = config['columns_to_keep_2010']
    df = df[columns_to_keep]
    
    return df
    