In [1]:
import pandas as pd
import ast
import re
from pathlib import Path

# Schéma cible attendu (14 colonnes)
EXPECTED_COLS = [
    'id_match', 'home_team_id', 'away_team_id',
    'home_result', 'away_result', 'result',
    'extra_time', 'penalties', 'replay',
    'date', 'round', 'city', 'id_stadium', 'edition'
]

print(f"Schéma cible : {len(EXPECTED_COLS)} colonnes")

Schéma cible : 14 colonnes


In [2]:
# Chargement des 3 datasets standardisés
# -------------------------------------------

# 2022 World Cup
df_2022 = pd.read_csv('../data/processed/df_matches_final.csv')
print(f"2022 : {df_2022.shape}")
print(f"  Colonnes : {list(df_2022.columns)}")

# Vérification du schéma
missing_2022 = set(EXPECTED_COLS) - set(df_2022.columns)
if missing_2022:
    print(f"  ⚠️ Colonnes manquantes : {missing_2022}")
else:
    print(f"  ✅ Schéma valide")

df_2022.head(3)

2022 : (64, 14)
  Colonnes : ['id_match', 'home_team_id', 'away_team_id', 'home_result', 'away_result', 'result', 'extra_time', 'penalties', 'replay', 'date', 'round', 'city', 'id_stadium', 'edition']
  ✅ Schéma valide


Unnamed: 0,id_match,home_team_id,away_team_id,home_result,away_result,result,extra_time,penalties,replay,date,round,city,id_stadium,edition
0,1,Qatar,Ecuador,0,2,away_team,False,False,False,2022-11-20,Group Stage,Al Khor,Al Bayt Stadium,2022
1,2,England,Iran,6,2,home_team,False,False,False,2022-11-21,Group Stage,Ar-Rayyan,Khalifa International Stadium,2022
2,3,Senegal,Netherlands,0,2,away_team,False,False,False,2022-11-21,Group Stage,Doha,Al Thumama Stadium,2022


In [3]:
# 2018 World Cup
df_2018 = pd.read_csv('../data/processed/matches_2018_clean.csv')
print(f"2018 : {df_2018.shape}")
print(f"  Colonnes : {list(df_2018.columns)}")

# Vérification du schéma
missing_2018 = set(EXPECTED_COLS) - set(df_2018.columns)
if missing_2018:
    print(f"  ⚠️ Colonnes manquantes : {missing_2018}")
else:
    print(f"  ✅ Schéma valide")

df_2018.head(3)

2018 : (64, 14)
  Colonnes : ['id_match', 'home_team_id', 'away_team_id', 'home_result', 'away_result', 'result', 'extra_time', 'penalties', 'replay', 'date', 'round', 'city', 'id_stadium', 'edition']
  ✅ Schéma valide


Unnamed: 0,id_match,home_team_id,away_team_id,home_result,away_result,result,extra_time,penalties,replay,date,round,city,id_stadium,edition
0,1,Russia,Saudi Arabia,5,0,home_team,False,False,False,2018-06-14,Group Stage,Moscow,Luzhniki Stadium,2018
1,2,Egypt,Uruguay,0,1,away_team,False,False,False,2018-06-15,Group Stage,Yekaterinburg,Central Stadium,2018
2,17,Russia,Egypt,3,1,home_team,False,False,False,2018-06-19,Group Stage,Saint Petersburg,Krestovsky Stadium,2018


In [4]:
# 1930-2014 World Cups
df_1930_2014 = pd.read_csv('../data/processed/matches_1930_2014.csv')
print(f"1930-2014 : {df_1930_2014.shape}")
print(f"  Colonnes : {list(df_1930_2014.columns)}")

# Vérification du schéma
missing_1930_2014 = set(EXPECTED_COLS) - set(df_1930_2014.columns)
if missing_1930_2014:
    print(f"  ⚠️ Colonnes manquantes : {missing_1930_2014}")
else:
    print(f"  ✅ Schéma valide")

df_1930_2014.head(3)

1930-2014 : (7299, 14)
  Colonnes : ['id_match', 'home_team_id', 'away_team_id', 'home_result', 'away_result', 'result', 'extra_time', 'penalties', 'replay', 'date', 'round', 'city', 'id_stadium', 'edition']
  ✅ Schéma valide


Unnamed: 0,id_match,home_team_id,away_team_id,home_result,away_result,result,extra_time,penalties,replay,date,round,city,id_stadium,edition
0,1,France,Mexico (México),4,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
1,2,USA,Belgium (België),3,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
2,3,Yugoslavia (Југославија),Brazil (Brasil),2,1,home_team,False,False,False,,Group Stage,Montevideo,,1930


In [5]:
# Concaténation des 3 datasets
# -----------------------------

# 1. Réordonner les colonnes selon le schéma cible avant concaténation
df_2022 = df_2022[EXPECTED_COLS]
df_2018 = df_2018[EXPECTED_COLS]
df_1930_2014 = df_1930_2014[EXPECTED_COLS]

# 2. Concaténer dans l'ordre chronologique
df_all = pd.concat([df_1930_2014, df_2018, df_2022], ignore_index=True)
print(f"Concaténation : {df_all.shape}")

# 3. Supprimer l'id_match existant (sera régénéré)
df_all = df_all.drop(columns=['id_match'])

# 4. Trier par édition et date
df_all = df_all.sort_values(by=['edition', 'date']).reset_index(drop=True)

# 5. Régénérer id_match séquentiel
df_all.insert(0, 'id_match', range(1, len(df_all) + 1))

print(f"\nDataFrame final : {df_all.shape}")
print(f"Colonnes : {list(df_all.columns)}")
print(f"\nRépartition par édition:")
print(df_all['edition'].value_counts().sort_index().tail(10))

df_all.head(10)

Concaténation : (7427, 14)

DataFrame final : (7427, 14)
Colonnes : ['id_match', 'home_team_id', 'away_team_id', 'home_result', 'away_result', 'result', 'extra_time', 'penalties', 'replay', 'date', 'round', 'city', 'id_stadium', 'edition']

Répartition par édition:
edition
1986    364
1990    367
1994    550
1998    708
2002    841
2006    911
2010    917
2014    884
2018     64
2022     64
Name: count, dtype: int64


Unnamed: 0,id_match,home_team_id,away_team_id,home_result,away_result,result,extra_time,penalties,replay,date,round,city,id_stadium,edition
0,1,France,Mexico (México),4,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
1,2,USA,Belgium (België),3,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
2,3,Yugoslavia (Југославија),Brazil (Brasil),2,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
3,4,Romania (România),Peru (Perú),3,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
4,5,Argentina,France,1,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
5,6,Chile,Mexico (México),3,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
6,7,Yugoslavia (Југославија),Bolivia,4,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
7,8,USA,Paraguay,3,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
8,9,Uruguay,Peru (Perú),1,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
9,10,Chile,France,1,0,home_team,False,False,False,,Group Stage,Montevideo,,1930


In [6]:
# Phase 4.5: Fonctions pour corriger les résultats des matchs nuls en knockout
# =============================================================================

# Constantes
FINALS_PENALTY_WINNERS = {1994: 'Brazil (Brasil)', 2006: 'Italy (Italia)'}
GROUP_ROUNDS = ['Preliminary', 'Group Stage', 'Second Group Stage']
WINNER_ROUND_MAP = {
    'Round of 16': 'Quarter-finals',
    'Quarter-finals': 'Semi-finals',
    'Semi-finals': 'Final',
}
LOSER_ROUND_MAP = {'Semi-finals': 'Third Place'}

def find_winner_from_next_round(row, df_full):
    """Trouve le vainqueur en vérifiant qui apparaît au tour suivant"""
    edition = row['edition']
    team1 = row['home_team_id']
    team2 = row['away_team_id']
    current_round = row['round']

    if current_round in WINNER_ROUND_MAP:
        next_round = WINNER_ROUND_MAP[current_round]
        next_matches = df_full[(df_full['edition'] == edition) & (df_full['round'] == next_round)]
        teams_in_next = set(next_matches['home_team_id'].tolist() + next_matches['away_team_id'].tolist())

        if team1 in teams_in_next and team2 not in teams_in_next:
            return 'home_team'
        elif team2 in teams_in_next and team1 not in teams_in_next:
            return 'away_team'

    if current_round in LOSER_ROUND_MAP:
        loser_round = LOSER_ROUND_MAP[current_round]
        loser_matches = df_full[(df_full['edition'] == edition) & (df_full['round'] == loser_round)]
        teams_in_loser = set(loser_matches['home_team_id'].tolist() + loser_matches['away_team_id'].tolist())

        if team1 in teams_in_loser and team2 not in teams_in_loser:
            return 'away_team'
        elif team2 in teams_in_loser and team1 not in teams_in_loser:
            return 'home_team'

    return None

def get_result(row, df_full):
    """Détermine le résultat du match"""
    score1 = row['home_result']
    score2 = row['away_result']
    round_name = row['round']
    edition = row['edition']
    team1 = row['home_team_id']
    team2 = row['away_team_id']

    if score1 > score2:
        return 'home_team'
    elif score1 < score2:
        return 'away_team'

    if round_name in GROUP_ROUNDS:
        return 'draw'

    if row.get('replay', False):
        return 'draw'

    if round_name == 'Final' and edition in FINALS_PENALTY_WINNERS:
        winner = FINALS_PENALTY_WINNERS[edition]
        if team1 == winner:
            return 'home_team'
        elif team2 == winner:
            return 'away_team'

    winner = find_winner_from_next_round(row, df_full)
    if winner:
        return winner

    return 'draw'

print("✅ Fonctions get_result et find_winner_from_next_round définies")

✅ Fonctions get_result et find_winner_from_next_round définies


In [7]:
# Application de get_result au dataset concaténé
# -----------------------------------------------
# IMPORTANT: On ne recalcule le résultat que si:
#   1. Le résultat actuel est "draw" ET
#   2. C'est une phase éliminatoire (où un draw n'est pas possible)
# Sinon, on garde le résultat du CSV parent (qui est déjà correct pour 2018/2022)

print("Application de get_result au dataset concaténé...")
print(f"Avant correction - Répartition par result:")
print(df_all['result'].value_counts())

knockout_rounds = ['Round of 16', 'Quarter-finals', 'Semi-finals', 'Third Place', 'Final']

def apply_get_result(row, df_full):
    """Applique get_result seulement si nécessaire"""
    current_result = row['result']
    round_name = row['round']
    
    # Si le résultat n'est pas "draw", on le garde tel quel
    if current_result != 'draw':
        return current_result
    
    # Si c'est un draw en phase de groupes, c'est normal
    if round_name in GROUP_ROUNDS:
        return 'draw'
    
    # Si c'est un replay, le draw est aussi normal
    if row.get('replay', False):
        return 'draw'
    
    # Sinon, on recalcule pour les matchs éliminatoires avec draw
    return get_result(row, df_full)

# Appliquer la correction conditionnelle
df_all['result'] = df_all.apply(lambda row: apply_get_result(row, df_all), axis=1)

print(f"\nAprès correction - Répartition par result:")
print(df_all['result'].value_counts())

# Vérification: matchs nuls en phase éliminatoire (devrait être 0 sauf replays)
draws_in_knockout = df_all[(df_all['round'].isin(knockout_rounds)) & (df_all['result'] == 'draw') & (~df_all['replay'])]
print(f"\n⚠️ Matchs nuls en phase éliminatoire (hors replays): {len(draws_in_knockout)}")
if len(draws_in_knockout) > 0:
    print(draws_in_knockout[['edition', 'home_team_id', 'away_team_id', 'home_result', 'away_result', 'round', 'replay']].head(10))

Application de get_result au dataset concaténé...
Avant correction - Répartition par result:
result
home_team    3908
away_team    1961
draw         1558
Name: count, dtype: int64

Après correction - Répartition par result:
result
home_team    3911
away_team    1964
draw         1552
Name: count, dtype: int64

⚠️ Matchs nuls en phase éliminatoire (hors replays): 0


In [8]:
# Extraction des équipes uniques et création de teams.csv
# --------------------------------------------------------

# 1. Extraire les équipes uniques
equipes = sorted(pd.concat([df_all['home_team_id'], df_all['away_team_id']]).unique())
print(f"Nombre d'équipes uniques : {len(equipes)}")

# 2. Créer le DataFrame teams
df_teams = pd.DataFrame({
    'id_team': range(1, len(equipes) + 1),
    'nom_standard': equipes,
    'confederation': None,
    'aliases': '[]'
})

print(f"\nDataFrame teams créé : {df_teams.shape}")
print(f"Aperçu:")
print(df_teams.head(10))

# 3. Créer le mapping team_to_id (pour utilisation dans la normalisation)
team_to_id = dict(zip(df_teams['nom_standard'], df_teams['id_team']))
print(f"\nMapping team_to_id créé avec {len(team_to_id)} entrées")

Nombre d'équipes uniques : 264

DataFrame teams créé : (264, 4)
Aperçu:
   id_team             nom_standard confederation aliases
0        1  Afghanistan (افغانستان)          None      []
1        2       Albania (Shqipëri)          None      []
2        3                  Algeria          None      []
3        4        Algeria (الجزائر)          None      []
4        5           American Samoa          None      []
5        6                  Andorra          None      []
6        7                   Angola          None      []
7        8                 Anguilla          None      []
8        9                  Antigua          None      []
9       10      Antigua and Barbuda          None      []

Mapping team_to_id créé avec 264 entrées


In [9]:
# Export des DataFrames finaux
# -----------------------------

Path("../data/processed").mkdir(parents=True, exist_ok=True)

# Export matches.csv (avec IDs numériques)
df_all.to_csv('../data/processed/matches.csv', index=False)
print(f"✅ Export terminé : matches.csv")
print(f"   - {len(df_all)} matchs")
print(f"   - Éditions : {df_all['edition'].min()} - {df_all['edition'].max()}")
print(f"   - home_team_id : type={df_all['home_team_id'].dtype}, min={df_all['home_team_id'].min()}, max={df_all['home_team_id'].max()}")
print(f"   - away_team_id : type={df_all['away_team_id'].dtype}, min={df_all['away_team_id'].min()}, max={df_all['away_team_id'].max()}")
print(f"   - Matchs avec prolongation : {df_all['extra_time'].sum()}")
print(f"   - Matchs avec TAB : {df_all['penalties'].sum()}")

# Export teams.csv
df_teams.to_csv('../data/processed/teams.csv', index=False)
print(f"\n✅ Export terminé : teams.csv")
print(f"   - {len(df_teams)} équipes uniques")

print(f"\nRépartition par result:")
print(df_all['result'].value_counts())

✅ Export terminé : matches.csv
   - 7427 matchs
   - Éditions : 1930 - 2022
   - home_team_id : type=object, min=Afghanistan (افغانستان), max=Zimbabwe
   - away_team_id : type=object, min=Afghanistan (افغانستان), max=Zimbabwe
   - Matchs avec prolongation : 72
   - Matchs avec TAB : 13

✅ Export terminé : teams.csv
   - 264 équipes uniques

Répartition par result:
result
home_team    3911
away_team    1964
draw         1552
Name: count, dtype: int64


In [10]:
def create_team_mapping(teams_df):
    mapping = {}
    
    for _, row in teams_df.iterrows():
        team_id = row['id_team']
        nom_standard = row['nom_standard']
        
        mapping[nom_standard.lower()] = team_id
        mapping[nom_standard.upper()] = team_id
        mapping[nom_standard] = team_id
        
        aliases_str = row['aliases']
        if pd.notna(aliases_str) and aliases_str != '[]':
            try:
                aliases = ast.literal_eval(aliases_str)
                for alias in aliases:
                    mapping[alias.lower()] = team_id
                    mapping[alias.upper()] = team_id
                    mapping[alias] = team_id
            except (ValueError, SyntaxError):
                pass
    
    return mapping

In [11]:
def get_team_id(team_name, mapping):

    if pd.isna(team_name):
        return None
    
    team_name_str = str(team_name).strip()
    
    if team_name_str in mapping:
        return mapping[team_name_str]
    
    if team_name_str.lower() in mapping:
        return mapping[team_name_str.lower()]
    
    if team_name_str.upper() in mapping:
        return mapping[team_name_str.upper()]
    
    if team_name_str.title() in mapping:
        return mapping[team_name_str.title()]
    
    return None

In [12]:
def transform_matches_dataframe(matches_df, teams_df):

    df = matches_df.copy()
    
    team_mapping = create_team_mapping(teams_df)
    
    # Ne pas réinsérer id_match si elle existe déjà
    if 'id_match' not in df.columns:
        df.insert(0, 'id_match', range(1, len(df) + 1))
    
    not_found = set()
    
    def map_home_team(name):
        team_id = get_team_id(name, team_mapping)
        if team_id is None and pd.notna(name):
            not_found.add(str(name))
        return team_id
    
    def map_away_team(name):
        team_id = get_team_id(name, team_mapping)
        if team_id is None and pd.notna(name):
            not_found.add(str(name))
        return team_id
    
    df['home_team_id'] = df['home_team_id'].apply(map_home_team)
    df['away_team_id'] = df['away_team_id'].apply(map_away_team)
    
    return df, list(not_found)

In [13]:
# Application du mapping des équipes vers IDs numériques
# (Optionnel - sera fait dans 02b-normalize-teams.ipynb)
# --------------------------------------------------------

# Cette étape peut être exécutée si vous avez déjà le fichier teams_traitees.csv
# Sinon, elle sera faite par le notebook de normalisation

try:
    teams_df = pd.read_csv('../data/processed/teams_traitees.csv')
    matches_df = pd.read_csv('../data/processed/matches.csv')
    
    transformed_df, not_found_teams = transform_matches_dataframe(matches_df, teams_df)
    
    if not_found_teams:
        print(f"⚠️ {len(not_found_teams)} équipes non trouvées dans le mapping:")
        for team in sorted(not_found_teams)[:20]:
            print(f"   - {team}")
    else:
        print("✅ Toutes les équipes mappées avec succès")
    
    # Export du résultat final
    transformed_df.to_csv('../data/processed/matches_with_ids.csv', index=False)
    print(f"\n✅ Export : matches_with_ids.csv")
    
except FileNotFoundError as e:
    print(f"ℹ️ Fichier teams_traitees.csv non trouvé - le mapping sera fait dans 02b-normalize-teams.ipynb")
    print(f"   Erreur : {e}")

ℹ️ Fichier teams_traitees.csv non trouvé - le mapping sera fait dans 02b-normalize-teams.ipynb
   Erreur : [Errno 2] No such file or directory: '../data/processed/teams_traitees.csv'


In [14]:
transformed_df

NameError: name 'transformed_df' is not defined