In [37]:
import pandas as pd 
import json
import os
import yaml
from pathlib import Path

In [38]:
def fct_load_config(config_filename: str = "config.yaml") -> dict:
    """
    Goal:
        Function to load configuration parameters from a YAML file.
    Parameters:
        config_filename (str): Relative or absolute path to the YAML file.
            A relative path is resolved against the project root.
    Returns:
        dict: A dictionary containing the configuration parameters
            (whatever ``yaml.safe_load`` produces for the file).
    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """

    config_path = Path(config_filename)

    # A relative path is resolved from the project root.
    if not config_path.is_absolute():
        try:
            # Plain Python script/module: root is the parent of this file's folder.
            project_root = Path(__file__).resolve().parents[1]
        except NameError:
            # Jupyter notebook: __file__ is undefined, so use the parent of the
            # working directory (same fallback as fct_read_csv).
            project_root = Path.cwd().parent
        config_path = project_root / config_path

    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    return config

def fct_read_csv(root_file: str) -> pd.DataFrame:
    """
    Goal:
        Read a CSV file whose delimiter is not known in advance.
        The candidate delimiters ',', ';', '|' and tab are tried in that order;
        the first one producing more than one column wins.
    Parameters:
        root_file (str): Relative or absolute path to the CSV file. A relative
            path is resolved against the project root.
    Returns:
        pd.DataFrame: The parsed data, or an empty DataFrame when the file is
            missing or no candidate delimiter yields more than one column.
    """
    candidate_delimiters = (',', ';', '|', '\t')

    csv_path = Path(root_file)
    if not csv_path.is_absolute():
        try:
            # Running as a Python script: root is the parent of this file's folder.
            base_dir = Path(__file__).resolve().parents[1]
        except NameError:
            # Running in a Jupyter notebook, where __file__ is undefined.
            base_dir = Path.cwd().parent
        csv_path = base_dir / csv_path

    if not csv_path.exists():
        print(f"Erreur : fichier {csv_path} introuvable")
        return pd.DataFrame()

    for delimiter in candidate_delimiters:
        try:
            frame = pd.read_csv(
                csv_path,
                sep=delimiter,
                encoding="utf-8",
                skipinitialspace=True,
            )
        except Exception:
            # Best effort: a delimiter that fails to parse is simply skipped.
            continue
        else:
            # More than one column means the delimiter actually split the rows.
            if frame.shape[1] > 1:
                return frame

    print(f"Aucun séparateur valide trouvé pour {csv_path}")
    return pd.DataFrame()


def fct_read_json_nested(root_file: str) -> dict:
    """
    Goal:
        Read a nested JSON file and flatten it into several DataFrames.
    Parameters:
        root_file (str): The path to the JSON file.
    Returns:
        dict: A dictionary of DataFrames with the keys
            'stadiums', 'tvchannels', 'teams' (built from the top-level lists of
            the same name, empty when absent) and 'matches' (every match found
            under 'groups' and 'knockout', each with two extra fields:
            'group' = group key or None, 'stage' = 'group' or the knockout
            round key).
    """
    with open(root_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Dimension tables: one DataFrame per top-level list.
    frames = {
        name: pd.DataFrame(payload.get(name, []))
        for name in ('stadiums', 'tvchannels', 'teams')
    }

    # Group-stage matches, tagged with their group key.
    group_rows = [
        {**game, 'group': group_name, 'stage': 'group'}
        for group_name, group_info in payload.get('groups', {}).items()
        for game in group_info.get('matches', [])
    ]

    # Knockout matches, tagged with their round key and no group.
    knockout_rows = [
        {**game, 'group': None, 'stage': round_name}
        for round_name, round_info in payload.get('knockout', {}).items()
        for game in round_info.get('matches', [])
    ]

    frames['matches'] = pd.DataFrame(group_rows + knockout_rows)

    return frames



def fct_add_prefix_to_df(df:pd.DataFrame, prefix:str) -> pd.DataFrame:
    """
    Goal:
        Function to add a prefix to all column names in a DataFrame.
        Every column becomes "<prefix>_<original name>".
    Parameters:
        df (pd.DataFrame): The input DataFrame. It is modified in place.
        prefix (str): The prefix to add to each column name.
    Returns:
        pd.DataFrame: The same DataFrame object, with updated column names.
    """
    # Rename all columns in one pass. Renaming one column at a time is wrong
    # when a freshly prefixed name equals a not-yet-processed column
    # (e.g. "a" and "x_a" with prefix "x"): both would be renamed again.
    df.columns = [f"{prefix}_{col}" for col in df.columns]
    return df



def fct_extract_data(
    root_csv_2010: str,
    root_csv_2014: str,
    root_csv_2022: str,
    root_json_2018: str
) -> None:
    """
    Goal:
        Load the three CSV sources and the JSON source, then print a preview
        (first rows) of each one. The sources are not consolidated yet.
    Parameters:
        root_csv_2010 (str): The path to the first CSV file.
        root_csv_2014 (str): The path to the second CSV file.
        root_csv_2022 (str): The path to the third CSV file.
        root_json_2018 (str): The path to the JSON file.
    Returns:
        None
    """
    # Load everything first, then print, so the previews come out together.
    csv_frames = [
        fct_read_csv(csv_path)
        for csv_path in (root_csv_2010, root_csv_2014, root_csv_2022)
    ]
    json_frames = fct_read_json_nested(root_json_2018)

    for frame in csv_frames:
        print(frame.head())
    print(json_frames['matches'].head())

    # TODO: consolidate the four sources into one DataFrame (not implemented).
    return None


In [39]:
# Read the project settings from config.yaml, located one level above the
# notebook's working directory.
config_path = str(Path.cwd().parent / 'config.yaml')
config = fct_load_config(config_path)

# Location of the 2022 CSV, as declared in the configuration.
root_csv_2022 = config['root_csv_2022']

# Load the raw 2022 matches into a DataFrame.
df_2022 = fct_read_csv(root_csv_2022)


In [40]:
# Columns of the raw 2022 dataset that are needed downstream.
list_wanted_columns = [
    'team1', 'team2',
    'number of goals team1', 'number of goals team2',
    'date', 'hour', 'category',
]
# Work on an independent copy restricted to those columns.
df_2022_filtered = df_2022.loc[:, list_wanted_columns].copy()
df_2022_filtered.head()
    

Unnamed: 0,team1,team2,number of goals team1,number of goals team2,date,hour,category
0,QATAR,ECUADOR,0,2,20 NOV 2022,17 : 00,Group A
1,ENGLAND,IRAN,6,2,21 NOV 2022,14 : 00,Group B
2,SENEGAL,NETHERLANDS,0,2,21 NOV 2022,17 : 00,Group A
3,UNITED STATES,WALES,1,1,21 NOV 2022,20 : 00,Group B
4,ARGENTINA,SAUDI ARABIA,1,2,22 NOV 2022,11 : 00,Group C


In [41]:
# Copy the filtered data and switch to the standard column names.
df = df_2022_filtered.copy().rename(
    columns={
        "team1": "home_team",
        "team2": "away_team",
        "number of goals team1": "home_result",
        "number of goals team2": "away_result",
        "category": "stage",
    }
)

In [42]:
# nettoyer l'heure "17 : 00" -> "17:00"
df["hour"] = df["hour"].astype("string").str.replace(" ", "", regex=False)

# parse date + hour (mois en anglais)
dt = pd.to_datetime(
    df["date"].astype("string").str.strip() + " " + df["hour"].astype("string"),
    errors="coerce"
)

df["date"] = dt.dt.strftime("%Y%m%d%H%M%S")
df = df.drop(columns=["hour"])

  dt = pd.to_datetime(


In [43]:
# Team names in Title Case (first letter of each word capitalised), trimmed.
for team_col in ("home_team", "away_team"):
    df[team_col] = df[team_col].astype("string").str.strip().str.lower().str.title()

In [44]:
# Scores as nullable integers; values that are not numeric become <NA>.
for score_col in ("home_result", "away_result"):
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce").astype("Int64")

In [45]:
# Display the cleaned DataFrame as the cell output.
df

Unnamed: 0,home_team,away_team,home_result,away_result,date,stage
0,Qatar,Ecuador,0,2,20221120170000,Group A
1,England,Iran,6,2,20221121140000,Group B
2,Senegal,Netherlands,0,2,20221121170000,Group A
3,United States,Wales,1,1,20221121200000,Group B
4,Argentina,Saudi Arabia,1,2,20221122110000,Group C
...,...,...,...,...,...,...
59,England,France,1,2,20221210200000,Quarter-final
60,Argentina,Croatia,3,0,20221213200000,Semi-final
61,France,Morocco,2,0,20221214200000,Semi-final
62,Croatia,Morocco,2,1,20221217160000,Play-off for third place


In [None]:
def transform_2022_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the raw 2022 match data into a cleaned, standardised format.

    The following operations are performed:
    - Selection and renaming of the useful columns
    - Cleaning of the hour format ("17 : 00" -> "17:00")
    - Merge of date and hour into a single timestamp string (YYYYMMDDHHMMSS)
    - Normalisation of team names (Title Case)
    - Conversion of scores to integers (nullable Int64)

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw match data containing at least the following columns:
        - 'team1'
        - 'team2'
        - 'number of goals team1'
        - 'number of goals team2'
        - 'date'
        - 'hour'
        - 'category'

    Returns
    -------
    pandas.DataFrame
        Transformed DataFrame with the columns:
        - home_team (str)
        - away_team (str)
        - home_result (Int64)
        - away_result (Int64)
        - date (str, format YYYYMMDDHHMMSS)
        - stage (str)

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.

    Notes
    -----
    - Dates are expected as "DD MON YYYY" with an English month abbreviation
      (e.g. "20 NOV 2022"). If any row does not match, the whole column is
      parsed with pandas' flexible parser instead.
    - Invalid or malformed dates become NaT, then a missing value.
    - Non-numeric scores are converted to missing values (pd.NA).
    """

    list_wanted_columns = [
        'team1',
        'team2',
        'number of goals team1',
        'number of goals team2',
        'date',
        'hour',
        'category',
    ]

    # Keep only the useful columns, on a copy so the caller's data is untouched.
    df_filtered = df[list_wanted_columns].copy()

    df_filtered = df_filtered.rename(columns={
        "team1": "home_team",
        "team2": "away_team",
        "number of goals team1": "home_result",
        "number of goals team2": "away_result",
        "category": "stage",
    })

    # Clean the hour: "17 : 00" -> "17:00"
    df_filtered["hour"] = df_filtered["hour"].astype("string").str.replace(" ", "", regex=False)

    # Combine date and hour into one string, e.g. "20 NOV 2022 17:00".
    date_hour = (
        df_filtered["date"].astype("string").str.strip()
        + " "
        + df_filtered["hour"].astype("string")
    )

    # Parse with the explicit format first: this avoids pandas' format-inference
    # warning and the slow element-by-element parsing.
    dt = pd.to_datetime(date_hour, format="%d %b %Y %H:%M", errors="coerce")
    if (dt.isna() & date_hour.notna()).any():
        # Some rows do not follow the expected format: fall back to the flexible
        # parser so anything that could be parsed before still is.
        dt = pd.to_datetime(date_hour, errors="coerce")

    df_filtered["date"] = dt.dt.strftime("%Y%m%d%H%M%S")
    df_filtered = df_filtered.drop(columns=["hour"])

    # Team names in Title Case, trimmed.
    df_filtered["home_team"] = df_filtered["home_team"].astype("string").str.strip().str.lower().str.title()
    df_filtered["away_team"] = df_filtered["away_team"].astype("string").str.strip().str.lower().str.title()

    # Scores as nullable integers; non-numeric values become <NA>.
    df_filtered["home_result"] = pd.to_numeric(df_filtered["home_result"], errors="coerce").astype("Int64")
    df_filtered["away_result"] = pd.to_numeric(df_filtered["away_result"], errors="coerce").astype("Int64")

    return df_filtered