In [814]:
import pandas as pd 
import json
import os
import yaml
from pathlib import Path

In [815]:
def fct_load_config(config_filename: str = "config.yaml") -> dict:
    """
    Goal:
        Function to load configuration parameters from a YAML file.
    Parameters:
        config_filename (str): Relative or absolute path to the YAML file.
    Returns:
        dict: A dictionary containing the configuration parameters.
    """

    config_path = Path(config_filename)

    # Si le chemin est relatif, on le résout depuis la racine du projet
    if not config_path.is_absolute():
        project_root = Path(__file__).resolve().parents[1]
        config_path = project_root / config_path

    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    return config

def fct_read_csv(root_file: str) -> pd.DataFrame:
    """
    Goal:
        Function to read a CSV file and return a pandas DataFrame.
    Parameters:
        root_file (str): Relative or absolute path to the CSV file.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV file.
    """
    seps = [',', ';', '|', '\t']

    file_path = Path(root_file)

    if not file_path.is_absolute():
        try:
            # Cas script Python
            project_root = Path(__file__).resolve().parents[1]
        except NameError:
            # Cas notebook Jupyter
            project_root = Path.cwd().parent

        file_path = project_root / file_path

    if not file_path.exists():
        print(f"Erreur : fichier {file_path} introuvable")
        return pd.DataFrame()

    for sep in seps:
        try:
            df = pd.read_csv(
                file_path,
                sep=sep,
                encoding="utf-8",
                skipinitialspace=True
            )

            # Si plus d'une colonne → bon séparateur
            if df.shape[1] > 1:
                return df

        except Exception:
            continue

    print(f"Aucun séparateur valide trouvé pour {file_path}")
    return pd.DataFrame()


def fct_read_json_nested(root_file: str) -> dict:
    """
    Goal
        Function to read a JSON file and return a pandas DataFrame.
    Parameters:
        root_file (str): The path to the JSON file.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the JSON file.
    """
    with open(root_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    dfs = {}

    # --------------------
    # DIMENSIONS
    # --------------------
    dfs['stadiums'] = pd.DataFrame(data.get('stadiums', []))
    dfs['tvchannels'] = pd.DataFrame(data.get('tvchannels', []))
    dfs['teams'] = pd.DataFrame(data.get('teams', []))

    # --------------------
    # MATCHES - GROUP STAGE
    # --------------------
    matches = []

    for group_key, group_data in data.get('groups', {}).items():
        for match in group_data.get('matches', []):
            match_flat = match.copy()
            match_flat['group'] = group_key
            match_flat['stage'] = 'group'
            matches.append(match_flat)

    # --------------------
    # MATCHES - KNOCKOUT
    # --------------------
    for round_key, round_data in data.get('knockout', {}).items():
        for match in round_data.get('matches', []):
            match_flat = match.copy()
            match_flat['group'] = None
            match_flat['stage'] = round_key
            matches.append(match_flat)

    dfs['matches'] = pd.DataFrame(matches)

    return dfs



def fct_add_prefix_to_df(df:pd.DataFrame, prefix:str) -> pd.DataFrame:
    """
    Goal:
        Function to add a prefix to all column names in a DataFrame.
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        prefix (str): The prefix to add to each column name.
    Returns:
        pd.DataFrame: The DataFrame with updated column names.
    """
    for col in df.columns:
        df.rename(columns={col: f"{prefix}_{col}"}, inplace=True)
    return df



def fct_extract_data(
    root_csv_2010: str,
    root_csv_2014: str,
    root_csv_2022: str,
    root_json_2018: str
) -> None:
    """
    Goal:
        Function to extract data from a CSV files and JSON file to a consolidated DataFrame df.
    Parameters:
        root_csv_2010 (str): The path to the first CSV file.
        root_csv_2014 (str): The path to the second CSV file.
        root_csv_2022 (str): The path to the third CSV file.
        root_json_2018 (str): The path to the JSON file.
    Returns:
        pd.DataFrame: The consolidated DataFrame containing data from all files.
    """
    df_2010 = fct_read_csv(root_csv_2010)
    df_2014 = fct_read_csv(root_csv_2014)
    df_2022 = fct_read_csv(root_csv_2022)
    df_2018 = fct_read_json_nested(root_json_2018)
    
    print(df_2010.head())
    print(df_2014.head())
    print(df_2022.head())
    print(df_2018['matches'].head())
    # df = pd.concat([df_2010, df_2014, df_2022, df_2018], ignore_index=True)
    return None


In [816]:
# Load configuration parameters from config.yaml
config_path = os.path.join(Path.cwd().parent, 'config.yaml')
config = fct_load_config(config_path)

root_csv_2010 = config['root_csv_2010']
root_csv_2014 = config['root_csv_2014']
root_csv_2022 = config['root_csv_2022']
root_json_2018 = config['root_json_2018']       

# read data in dfs
df_2010 = fct_read_csv(root_csv_2010)
df_2014 = fct_read_csv(root_csv_2014)
df_2022 = fct_read_csv(root_csv_2022)


In [817]:
df_2010.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year
0,1930-URUGUAY,GROUP_STAGE,4-1 (3-0),France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930
1,1930-URUGUAY,GROUP_STAGE,3-0 (2-0),USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930
2,1930-URUGUAY,GROUP_STAGE,2-1 (2-0),Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930
3,1930-URUGUAY,GROUP_STAGE,3-1 (1-0),Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930
4,1930-URUGUAY,GROUP_STAGE,1-0 (0-0),Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930


In [818]:
#remplir des valeurs vides 
df_2010 = df_2010.fillna(0)

In [819]:
#creer colonnes 'home_result' et 'away_result'
df_2010['score'] = df_2010['score'].str.slice(0, 3)
df_2010[['home_result','away_result']] = df_2010['score'].str.split('-', expand=True)
df_2010.head()

Unnamed: 0,edition,round,score,team1,team2,url,venue,year,home_result,away_result
0,1930-URUGUAY,GROUP_STAGE,4-1,France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,4,1
1,1930-URUGUAY,GROUP_STAGE,3-0,USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,3,0
2,1930-URUGUAY,GROUP_STAGE,2-1,Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,2,1
3,1930-URUGUAY,GROUP_STAGE,3-1,Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,3,1
4,1930-URUGUAY,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,1,0


In [820]:
#rename columns pour etre homogène avec les autres datasets
df_2010 = df_2010.rename(columns={'year': 'date', 'team1': 'home_team', 'team2' : 'away_team', 'venue' : 'city', 'round': 'stage'})

In [821]:
#home_team et away_team, garder que le nom de pays en Anglais
df_2010[['home_team','home_team_lanorig']] = df_2010['home_team'].str.split('(', expand=True)
df_2010[['away_team','away_team_lanorig']] = df_2010['away_team'].str.split('(', expand=True)
df_2010.head()

Unnamed: 0,edition,stage,score,home_team,away_team,url,city,date,home_result,away_result,home_team_lanorig,away_team_lanorig
0,1930-URUGUAY,GROUP_STAGE,4-1,France,Mexico,1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930,4,1,,México)
1,1930-URUGUAY,GROUP_STAGE,3-0,USA,Belgium,1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930,3,0,,België)
2,1930-URUGUAY,GROUP_STAGE,2-1,Yugoslavia,Brazil,1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930,2,1,Југославија),Brasil)
3,1930-URUGUAY,GROUP_STAGE,3-1,Romania,Peru,1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930,3,1,România),Perú)
4,1930-URUGUAY,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930,1,0,,


In [822]:
#garder que l'année pour 'edition'
df_2010['edition'] = df_2010['edition'].str.slice(0, 4)

In [823]:
#supprimer '.' dans colonne 'city'
df_2010['city'] = df_2010['city'].str.replace('.', '', regex=False)
df_2010.head()

Unnamed: 0,edition,stage,score,home_team,away_team,url,city,date,home_result,away_result,home_team_lanorig,away_team_lanorig
0,1930,GROUP_STAGE,4-1,France,Mexico,1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo,1930,4,1,,México)
1,1930,GROUP_STAGE,3-0,USA,Belgium,1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo,1930,3,0,,België)
2,1930,GROUP_STAGE,2-1,Yugoslavia,Brazil,1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo,1930,2,1,Југославија),Brasil)
3,1930,GROUP_STAGE,3-1,Romania,Peru,1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo,1930,3,1,România),Perú)
4,1930,GROUP_STAGE,1-0,Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo,1930,1,0,,


In [824]:
df_2010["stage"].unique()


array(['GROUP_STAGE', '1/2_FINAL', '_FINAL', 'PRELIMINARY-Europe',
       'PRELIMINARY-N/C.America', 'PRELIMINARY-N.E.', 'FIRST',
       '1/4_FINAL', 'PLACES_3&4', 'PRELIMINARY-Eur./N.E.',
       'PRELIMINARY-S.America', 'FINAL_ROUND', 'PRELIMINARY-Eu./Afr.',
       'PRELIMINARY-Asia', 'PRELIMINARY-Afr./As.', 'PRELIMINARY-Euro/As.',
       'PRELIMINARY-E./Afr./As.', 'PRELIMINARY-Af./As./O.',
       'PRELIMINARY-Africa', 'PRELIMINARY-As./O.', 'SEMIFINAL_STAGE',
       'QUARTERFINAL_STAGE', 'PRELIMINARY-O./As.', '1/8_FINAL',
       'PRELIMINARY-Oceania'], dtype=object)

In [825]:
stage_net = {
    'GROUP_STAGE':'group_stage', 
    '1/2_FINAL':'semi_finals', 
    '_FINAL':'final', 
    'PRELIMINARY-Europe':'preliminary',
    'PRELIMINARY-N/C.America':'preliminary', 
    'PRELIMINARY-N.E.':'preliminary', 
    'FIRST':'first',
    '1/4_FINAL':'quarter_finals', 
    'PLACES_3&4':'third place', 
    'PRELIMINARY-Eur./N.E.':'preliminary',
    'PRELIMINARY-S.America':'preliminary', 
    'FINAL_ROUND':'final', 
    'PRELIMINARY-Eu./Afr.':'preliminary',
    'PRELIMINARY-Asia':'preliminary', 
    'PRELIMINARY-Afr./As.':'preliminary', 
    'PRELIMINARY-Euro/As.':'preliminary',
    'PRELIMINARY-E./Afr./As.':'preliminary', 
    'PRELIMINARY-Af./As./O.':'preliminary',
    'PRELIMINARY-Africa':'preliminary', 
    'PRELIMINARY-As./O.':'preliminary', 
    'SEMIFINAL_STAGE':'semi_finals',
    'QUARTERFINAL_STAGE':'quarter_finals', 
    'PRELIMINARY-O./As.':'preliminary', 
    '1/8_FINAL':'round_of_16',
    'PRELIMINARY-Oceania':'preliminary'
}
df_2010["stage"] = df_2010["stage"].replace(stage_net)

In [826]:
#garder que des colonnes nescessaires
df_2010[['date', 'home_team', 'away_team', 'home_result', 'away_result', 'stage', 'edition', 'city']].to_csv('matches_19302010.csv', index=False)
df_2010 = pd.read_csv('matches_19302010.csv',usecols=['date', 'home_team', 'away_team', 'home_result', 'away_result', 'stage', 'edition', 'city']) 
print(df_2010)


      date    home_team away_team home_result  away_result           stage  \
0     1930       France   Mexico            4          1.0     group_stage   
1     1930          USA  Belgium            3          0.0     group_stage   
2     1930  Yugoslavia    Brazil            2          1.0     group_stage   
3     1930     Romania      Peru            3          1.0     group_stage   
4     1930    Argentina    France           1          0.0     group_stage   
...    ...          ...       ...         ...          ...             ...   
7294  2014            3         4         (C)          NaN  quarter_finals   
7295  2014            A         B         (X)          NaN     semi_finals   
7296  2014            C         D         (Y)          NaN     semi_finals   
7297  2014      LOSER X   LOSER Y         xxx          NaN     third place   
7298  2014     WINNER X  WINNER Y         xxx          NaN           final   

      edition            city  
0        1930      Montevideo  

In [827]:
df_2010['stage'].unique()

array(['group_stage', 'semi_finals', 'final', 'preliminary', 'first',
       'quarter_finals', 'third place', 'round_of_16'], dtype=object)