In [1]:
from pathlib import Path

# Raw input directory, relative to the notebook's working directory.
data_path = Path("..") / "data" / "raw"

# Show what is available in the raw directory.
[entry for entry in data_path.iterdir()]

[PosixPath('../data/raw/matches_19302010 (1).csv'),
 PosixPath('../data/raw/WorldCupMatches2014 (1).csv'),
 PosixPath('../data/raw/matches_wc2022_en.json'),
 PosixPath('../data/raw/matche_2014.csv'),
 PosixPath('../data/raw/data_2018.json')]

In [2]:
import json

# Read the 2018 JSON payload as UTF-8 text and parse it in one go.
file_path = data_path.joinpath("data_2018.json")
data = json.loads(file_path.read_text(encoding="utf-8"))

# Display the top-level container type of the parsed payload.
type(data)

dict

In [3]:
# Show the top-level keys when the payload is a dict; otherwise show its length.
data.keys() if isinstance(data, dict) else len(data)

dict_keys(['stadiums', 'tvchannels', 'teams', 'groups', 'knockout'])

In [4]:
# Pull out the "groups" section and display its container type and keys.
groups = data["groups"]
type(groups), groups.keys()

(dict, dict_keys(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']))

In [5]:
# Pull out the "knockout" section and display its container type and keys.
knockout = data["knockout"]
type(knockout), knockout.keys()

(dict,
 dict_keys(['round_16', 'round_8', 'round_4', 'round_2_loser', 'round_2']))

In [6]:
# Inspect one group-stage record: the first match listed under the first group key.
first_group = list(groups)[0]
group_matches = groups[first_group]["matches"]
first_match = group_matches[0]

first_match

{'name': 1,
 'type': 'group',
 'home_team': 1,
 'away_team': 2,
 'home_result': 5,
 'away_result': 0,
 'date': '2018-06-14T18:00:00+03:00',
 'stadium': 1,
 'channels': [4, 6, 13, 17, 20, 22],
 'finished': True,
 'matchday': 1}

In [7]:
# Inspect one knockout record: the first match listed under the first knockout key.
first_ko = list(knockout)[0]
ko_matches = knockout[first_ko]["matches"]
ko_matches[0]

{'name': 49,
 'type': 'qualified',
 'home_team': 4,
 'away_team': 5,
 'home_result': 2,
 'away_result': 1,
 'home_penalty': None,
 'away_penalty': None,
 'winner': 4,
 'date': '2018-06-30T21:00:00+03:00',
 'stadium': 11,
 'channels': [4, 13, 15],
 'finished': True,
 'matchday': 4}

In [8]:
import pandas as pd
from pathlib import Path

# ----------------------------
# 1) Lookups
# ----------------------------

EDITION = 2018  # value written to the "edition" column of every row

# id -> name for teams; id -> name / city for stadiums.
teams_lookup = {t["id"]: t["name"] for t in data["teams"]}

stadiums_lookup_name = {s["id"]: s.get("name") for s in data["stadiums"]}
stadiums_lookup_city = {s["id"]: s.get("city") for s in data["stadiums"]}

# Knockout section keys -> labels written to the "round" column.
KO_ROUND_MAP = {
    "round_16": "Round of 16",
    "round_8": "Quarter-finals",
    "round_4": "Semi-finals",
    "round_2_loser": "Third Place",
    "round_2": "Final"
}

# ----------------------------
# 2) Match extraction
# ----------------------------

def _match_record(m, round_name, with_penalties):
    """Flatten one raw match dict into a row of the staging table.

    Args:
        m: Raw match dict taken from a "matches" list of the JSON payload.
        round_name: Label stored in the "round" column.
        with_penalties: When True, copy the shoot-out scores from ``m``;
            when False, store None in both penalty columns.

    Returns:
        dict with the twelve staging columns. Team and stadium ids are
        resolved through the lookup dicts; an id absent from a lookup
        yields None instead of raising.
    """
    stadium_id = m.get("stadium")

    return {
        "match_id": m.get("name"),
        "date": m.get("date"),
        "round": round_name,
        "home_team": teams_lookup.get(m.get("home_team")),
        "away_team": teams_lookup.get(m.get("away_team")),
        "home_goals": m.get("home_result"),
        "away_goals": m.get("away_result"),
        "home_penalty": m.get("home_penalty") if with_penalties else None,
        "away_penalty": m.get("away_penalty") if with_penalties else None,
        # NOTE(review): despite its name, this column receives the stadium
        # *name* (from stadiums_lookup_name), not the numeric id. The key is
        # kept unchanged so the exported schema stays the same.
        "id_stadium": stadiums_lookup_name.get(stadium_id),
        "city": stadiums_lookup_city.get(stadium_id),
        "edition": EDITION,
    }


matches = []

# ---- GROUP STAGE (48 matches): no penalty shoot-outs in the group stage
for group_data in data["groups"].values():
    matches.extend(
        _match_record(m, "Group Stage", with_penalties=False)
        for m in group_data["matches"]
    )

# ---- KNOCKOUT (16 matches)
for ko_key, ko_data in data["knockout"].items():
    # Index directly so an unmapped knockout key raises KeyError instead of
    # silently producing rows whose round is None.
    round_name = KO_ROUND_MAP[ko_key]
    matches.extend(
        _match_record(m, round_name, with_penalties=True)
        for m in ko_data["matches"]
    )

# ----------------------------
# 3) DataFrame + export
# ----------------------------

df_matches = pd.DataFrame(matches)

print(df_matches.shape)
print(df_matches["round"].value_counts())

# The lookups use .get(), so an id missing from the reference tables becomes
# a null here; report it rather than exporting blank names unnoticed.
lookup_cols = ["home_team", "away_team", "id_stadium", "city"]
missing_counts = df_matches[lookup_cols].isna().sum()
if missing_counts.any():
    print("\n⚠️ Unresolved lookup values per column:")
    print(missing_counts[missing_counts > 0])

# Check the matches decided by a penalty shoot-out
shootout_cols = ["home_team", "away_team", "home_goals", "away_goals", "home_penalty", "away_penalty", "round"]
print("\nMatchs avec tirs au but:")
print(df_matches.loc[df_matches["home_penalty"].notna(), shootout_cols])

Path("../data/staging").mkdir(parents=True, exist_ok=True)
df_matches.to_csv("../data/staging/matches_2018_raw.csv", index=False)

print("\n✅ Extract terminé : matches_2018_raw.csv")

(64, 12)
round
Group Stage       48
Round of 16        8
Quarter-finals     4
Semi-finals        2
Third Place        1
Final              1
Name: count, dtype: int64

Matchs avec tirs au but:
   home_team away_team  home_goals  away_goals  home_penalty  away_penalty  \
50     Spain    Russia           1           1           3.0           4.0   
51   Croatia   Denmark           1           1           3.0           2.0   
55  Colombia   England           1           1           3.0           4.0   
58    Russia   Croatia           2           2           3.0           4.0   

             round  
50     Round of 16  
51     Round of 16  
55     Round of 16  
58  Quarter-finals  

✅ Extract terminé : matches_2018_raw.csv


In [9]:
# Dimensions of the extracted matches frame as (rows, columns).
df_matches.shape

(64, 12)

In [10]:
# Matches per round; dropna=False also counts rows whose round is missing.
df_matches["round"].value_counts(dropna=False)

round
Group Stage       48
Round of 16        8
Quarter-finals     4
Semi-finals        2
Third Place        1
Final              1
Name: count, dtype: int64

In [11]:
# Column labels of the extracted matches frame.
df_matches.columns

Index(['match_id', 'date', 'round', 'home_team', 'away_team', 'home_goals',
       'away_goals', 'home_penalty', 'away_penalty', 'id_stadium', 'city',
       'edition'],
      dtype='object')

In [12]:
# Preview teams, score and kickoff date for the first three rows.
preview_cols = ["home_team", "away_team", "home_goals", "away_goals", "date"]
df_matches[preview_cols].head(3)

Unnamed: 0,home_team,away_team,home_goals,away_goals,date
0,Russia,Saudi Arabia,5,0,2018-06-14T18:00:00+03:00
1,Egypt,Uruguay,0,1,2018-06-15T17:00:00+05:00
2,Russia,Egypt,3,1,2018-06-19T21:00:00+03:00


In [13]:
# Preview the first three rows that are not group-stage matches.
knockout_mask = df_matches["round"] != "Group Stage"
df_matches.loc[knockout_mask].head(3)

Unnamed: 0,match_id,date,round,home_team,away_team,home_goals,away_goals,home_penalty,away_penalty,id_stadium,city,edition
48,49,2018-06-30T21:00:00+03:00,Round of 16,Uruguay,Portugal,2,1,,,Fisht Olympic Stadium,Sochi,2018
49,50,2018-06-30T17:00:00+03:00,Round of 16,France,Argentina,4,3,,,Kazan Arena,Kazan,2018
50,51,2018-07-01T17:00:00+03:00,Round of 16,Spain,Russia,1,1,3.0,4.0,Luzhniki Stadium,Moscow,2018


In [14]:
from pathlib import Path

# Make sure the staging directory exists, then write the raw 2018 matches
# there without the DataFrame index.
staging_dir = Path("../data/staging")
staging_dir.mkdir(parents=True, exist_ok=True)

df_matches.to_csv(staging_dir / "matches_2018_raw.csv", index=False)