Imports et chargement des données (staging → transform)

In [111]:
# 2018 World Cup data transformation.
# Step 1: load the extracted matches from the staging area.

import pandas as pd
from pathlib import Path

# CSV file written by the Extract notebook.
raw_matches_path = "../data/staging/matches_2018_raw.csv"
df = pd.read_csv(raw_matches_path)

# Row / column count as a quick volume check.
df.shape

(64, 10)

In [112]:
# Preview the first rows to check the column names and their contents.
first_rows = df.head(n=5)
display(first_rows)

Unnamed: 0,match_id,date,round,home_team,away_team,home_goals,away_goals,stadium,city,edition
0,1,2018-06-14T18:00:00+03:00,Group Stage,Russia,Saudi Arabia,5,0,Luzhniki Stadium,Moscow,2018
1,2,2018-06-15T17:00:00+05:00,Group Stage,Egypt,Uruguay,0,1,Central Stadium,Yekaterinburg,2018
2,17,2018-06-19T21:00:00+03:00,Group Stage,Russia,Egypt,3,1,Krestovsky Stadium,Saint Petersburg,2018
3,18,2018-06-20T18:00:00+03:00,Group Stage,Uruguay,Saudi Arabia,1,0,Rostov Arena,Rostov-on-Don,2018
4,33,2018-06-25T18:00:00+04:00,Group Stage,Uruguay,Russia,3,0,Cosmos Arena,Samara,2018


Validation du schéma (validation minimale)

In [113]:
# Minimal schema check: the score transformations need both goal columns.
required_cols = {"home_goals", "away_goals"}

# Required names that the loaded frame does not provide.
missing = required_cols.difference(df.columns)

if len(missing) > 0:
    raise ValueError(f"Colonnes manquantes : {missing}")

Vérifications initiales (qualité et cohérence)

In [114]:
# Show the column index explicitly: a bare expression that is not the last
# statement of a cell is evaluated but never rendered.
display(df.columns)

# Distribution of the matches per round (Group Stage, Round of 16, ...),
# counting missing labels as well.
df["round"].value_counts(dropna=False)

round
Group Stage       48
Round of 16        8
Quarter-finals     4
Semi-finals        2
Third place        1
Final              1
Name: count, dtype: int64

Normalisation des types (date) 👉 Objectif : faciliter les calculs, éviter les erreurs en aval (Load / analyses)

In [115]:
# Date normalisation:
# - parse to timezone-aware datetimes expressed in UTC (values that cannot be
#   parsed become NaT because of errors="coerce")
# - reset the time part to midnight while keeping a datetime dtype
parsed_dates = pd.to_datetime(df["date"], utc=True, errors="coerce")
df["date"] = parsed_dates.dt.normalize()

# Check the converted column.
df["date"].head()

0   2018-06-14 00:00:00+00:00
1   2018-06-15 00:00:00+00:00
2   2018-06-19 00:00:00+00:00
3   2018-06-20 00:00:00+00:00
4   2018-06-25 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]

Normalisation des types (scores)

In [116]:
# Score normalisation (staging -> transform): cast both goal columns to a
# numeric dtype; values that cannot be converted become NaN (errors="coerce").
for score_col in ("home_goals", "away_goals"):
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")

# Check the dtypes after conversion.
df.dtypes

match_id                    int64
date          datetime64[ns, UTC]
round                      object
home_team                  object
away_team                  object
home_goals                  int64
away_goals                  int64
stadium                    object
city                       object
edition                     int64
dtype: object

Création de la colonne métier result --> attendue au schéma final

In [117]:
# Score columns expected by the final schema: copies of the goal columns
# under their target names.
for source_col, target_col in (
    ("home_goals", "home_result"),
    ("away_goals", "away_result"),
):
    df[target_col] = df[source_col]

# Match outcome:
# - name of the home team when it wins
# - name of the away team when it wins
# - "draw" when both sides scored the same
# - None when a score is missing, so that an unknown result is never
#   reported as a draw
def compute_result(row):
    """Return the outcome of a single match.

    Parameters
    ----------
    row : mapping (DataFrame row or dict)
        Must provide "home_result", "away_result", "home_team" and
        "away_team".

    Returns
    -------
    object
        row["home_team"] or row["away_team"] for a win, "draw" when the
        scores are equal, None when either score is missing
        (None / NaN / pd.NA).
    """
    home_score = row["home_result"]
    away_score = row["away_result"]

    # NaN compares False in both directions, which would otherwise fall
    # through to "draw"; pd.NA would raise when evaluated in an `if`.
    if pd.isna(home_score) or pd.isna(away_score):
        return None

    if home_score > away_score:
        return row["home_team"]
    if home_score < away_score:
        return row["away_team"]
    return "draw"

# Evaluate compute_result once per row (axis="columns" passes each row).
df["result"] = df.apply(compute_result, axis="columns")

# Spot-check the first rows.
result_check_cols = ["home_team", "away_team", "home_result", "away_result", "result"]
df.loc[:, result_check_cols].head()

Unnamed: 0,home_team,away_team,home_result,away_result,result
0,Russia,Saudi Arabia,5,0,Russia
1,Egypt,Uruguay,0,1,Uruguay
2,Russia,Egypt,3,1,Russia
3,Uruguay,Saudi Arabia,1,0,Uruguay
4,Uruguay,Russia,3,0,Uruguay


Renommage des colonnes (alignement schéma cible)

In [118]:
# Align the column names with the target schema.
target_column_names = {
    "match_id": "id_match",
    "home_team": "home_team_id",
    "away_team": "away_team_id",
}
matches_2018_final = df.rename(columns=target_column_names)

# Check the renamed columns.
renamed_cols = ["id_match", "home_team_id", "away_team_id"]
matches_2018_final.loc[:, renamed_cols].head()

Unnamed: 0,id_match,home_team_id,away_team_id
0,1,Russia,Saudi Arabia
1,2,Egypt,Uruguay
2,17,Russia,Egypt
3,18,Uruguay,Saudi Arabia
4,33,Uruguay,Russia


Supprimer les colonnes inutiles pour la table finale

In [119]:
# The final matches table does not keep the stadium name
# (only the city is required).
# NOTE(review): the drop below is left disabled. "stadium" is not part of the
# column list used to build matches_2018_clean, so it is excluded there.

# df = df.drop(columns=["stadium"])

# df.columns

Construction de la table finale matches_2018_clean

In [120]:
# Columns of the final table, in the expected order.
final_columns = [
    "id_match",
    "home_team_id",
    "away_team_id",
    "home_result",
    "away_result",
    "result",
    "date",
    "round",
    "city",
    "edition",
]

# Independent copy so later edits do not touch matches_2018_final.
matches_2018_clean = matches_2018_final.loc[:, final_columns].copy()

# Final check: first ten rows, with the date rendered as YYYY-MM-DD
# (display only, the stored column is unchanged).
preview = matches_2018_clean.assign(
    date=matches_2018_clean["date"].dt.strftime("%Y-%m-%d")
)
display(preview.head(10))

Unnamed: 0,id_match,home_team_id,away_team_id,home_result,away_result,result,date,round,city,edition
0,1,Russia,Saudi Arabia,5,0,Russia,2018-06-14,Group Stage,Moscow,2018
1,2,Egypt,Uruguay,0,1,Uruguay,2018-06-15,Group Stage,Yekaterinburg,2018
2,17,Russia,Egypt,3,1,Russia,2018-06-19,Group Stage,Saint Petersburg,2018
3,18,Uruguay,Saudi Arabia,1,0,Uruguay,2018-06-20,Group Stage,Rostov-on-Don,2018
4,33,Uruguay,Russia,3,0,Uruguay,2018-06-25,Group Stage,Samara,2018
5,34,Saudi Arabia,Egypt,2,1,Saudi Arabia,2018-06-25,Group Stage,Volgograd,2018
6,3,Portugal,Spain,3,3,draw,2018-06-15,Group Stage,Sochi,2018
7,4,Morocco,Iran,0,1,Iran,2018-06-15,Group Stage,Saint Petersburg,2018
8,19,Portugal,Morocco,1,0,Portugal,2018-06-20,Group Stage,Moscow,2018
9,20,Iran,Spain,0,1,Spain,2018-06-20,Group Stage,Kazan,2018


Construction de la table de référence des équipes
👉 Table de référence (dimension)

In [121]:
# Team reference table (dimension): every distinct name appearing as a home
# or an away team, sorted, in a single "team" column.
all_team_names = pd.concat(
    [matches_2018_clean["home_team_id"], matches_2018_clean["away_team_id"]],
    ignore_index=True,
)
distinct_team_names = all_team_names.drop_duplicates().sort_values()
teams_ref_2018 = distinct_team_names.reset_index(drop=True).to_frame(name="team")

# Check: table shape and first ten teams.
teams_ref_2018.shape, teams_ref_2018.head(10)

((32, 1),
          team
 0   Argentina
 1   Australia
 2     Belgium
 3      Brazil
 4    Colombia
 5  Costa Rica
 6     Croatia
 7     Denmark
 8       Egypt
 9     England)

Mapping Rounds

In [122]:
# Round label harmonisation: trim surrounding whitespace, then map the known
# variants to their canonical spelling.
round_mapping = {
    "Preliminary": "Preliminary",   # kept for the merge with other editions (absent in 2018)
    "Group Stage": "Group Stage",
    "Round of 16": "Round of 16",
    "Quarter-finals": "Quarter-finals",
    "Semi-finals": "Semi-finals",
    "Third place": "Third Place",
    "Third Place": "Third Place",
    "Final": "Final",
}

# Trim only the values that are present, so missing labels stay missing.
raw_rounds = matches_2018_clean["round"]
trimmed_rounds = raw_rounds.where(
    raw_rounds.isna(),
    raw_rounds.astype(str).str.strip(),
)

# Labels absent from the mapping fall back to their trimmed form instead of
# the raw value, so stray whitespace never survives the normalisation.
matches_2018_clean["round"] = trimmed_rounds.map(round_mapping).fillna(trimmed_rounds)

# Check
matches_2018_clean["round"].value_counts(dropna=False)

round
Group Stage       48
Round of 16        8
Quarter-finals     4
Semi-finals        2
Third Place        1
Final              1
Name: count, dtype: int64

Validation du SCHÉMA FINAL

In [123]:
# Final schema validation for matches_2018_clean (2018 edition).

expected_cols_2018 = set(
    [
        "id_match",
        "home_team_id",
        "away_team_id",
        "home_result",
        "away_result",
        "result",
        "date",
        "round",
        "city",
        "edition",
    ]
)

# Expected names that the final table does not provide.
missing = expected_cols_2018.difference(matches_2018_clean.columns)

if len(missing) > 0:
    raise ValueError(f"Colonnes manquantes dans matches_2018_clean : {missing}")

In [124]:
# Final check before export. A bare expression that is not the last statement
# of a cell is never rendered, so the expectations are enforced explicitly:
# - expected volume (64 matches, 10 columns)
# - dtypes the export relies on (datetime date, numeric scores)
# - complete team reference table (32 teams)
if matches_2018_clean.shape != (64, 10):
    raise ValueError(
        f"Volume inattendu pour matches_2018_clean : {matches_2018_clean.shape}"
    )

# The export step calls .dt.strftime on this column, which needs a datetime dtype.
if not pd.api.types.is_datetime64_any_dtype(matches_2018_clean["date"]):
    raise ValueError(
        f"Type inattendu pour la colonne date : {matches_2018_clean['date'].dtype}"
    )

for score_col in ("home_result", "away_result"):
    if not pd.api.types.is_numeric_dtype(matches_2018_clean[score_col]):
        raise ValueError(
            f"Type inattendu pour la colonne {score_col} : "
            f"{matches_2018_clean[score_col].dtype}"
        )

if teams_ref_2018.shape != (32, 1):
    raise ValueError(
        f"Volume inattendu pour teams_ref_2018 : {teams_ref_2018.shape}"
    )

# Rendered as the cell output.
teams_ref_2018.shape

(32, 1)

Formatage de la date avant export (affichage de l'heure supprimé)

In [125]:
# Export preparation: the date is rendered as YYYY-MM-DD text in a copy, so
# matches_2018_clean keeps its datetime column.
matches_2018_export = matches_2018_clean.copy()
matches_2018_export["date"] = matches_2018_export["date"].dt.strftime("%Y-%m-%d")

# Make sure the target folder exists before writing: to_csv does not create
# missing directories, so this cell would fail on a fresh checkout.
Path("../data/processed").mkdir(parents=True, exist_ok=True)

# Export
matches_2018_export.to_csv(
    "../data/processed/matches_2018_clean.csv",
    index=False
)

Export des données transformées

In [134]:
# Create the processed folder when it does not exist yet.
Path("../data/processed").mkdir(parents=True, exist_ok=True)

# Export of the final tables.
# Matches: the date is written as YYYY-MM-DD text, from a copy so that
# matches_2018_clean keeps its datetime column.
matches_2018_export = matches_2018_clean.copy()
matches_2018_export["date"] = matches_2018_export["date"].dt.strftime("%Y-%m-%d")

matches_2018_export.to_csv("../data/processed/matches_2018_clean.csv", index=False)

# Teams: the completion message below announces both tables, so the team
# reference table is written as well.
# NOTE(review): the file name mirrors the variable name - adjust it if the
# Load step expects a different one.
teams_ref_2018.to_csv("../data/processed/teams_ref_2018.csv", index=False)

print("‚úÖ Transformation 2018 termin√©e : tables matches et √©quipes g√©n√©r√©es")

‚úÖ Transformation 2018 termin√©e : tables matches et √©quipes g√©n√©r√©es


In [135]:
# Check the exported date format (one-column frame, first rows).
matches_2018_export.loc[:, ["date"]].head()

Unnamed: 0,date
0,2018-06-14
1,2018-06-15
2,2018-06-19
3,2018-06-20
4,2018-06-25
