In [1]:
import pandas as pd
import re

In [2]:
# Load the first raw matches CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/raw/matches_19302010 (1).csv')

In [3]:
# Load the second raw matches CSV.
# NOTE(review): the filename reads "matche_2014" - confirm it matches the file on disk.
df2 = pd.read_csv('../data/raw/matche_2014.csv')

In [4]:
# Display the first raw frame as loaded, before any cleaning.
df

Unnamed: 0,edition,round,score,team1,team2,url,venue,year
0,1930-URUGUAY,GROUP_STAGE,4-1 (3-0),France,Mexico (México),1930_URUGUAY_FS.htm#1-WC-30-I,Montevideo.,1930
1,1930-URUGUAY,GROUP_STAGE,3-0 (2-0),USA,Belgium (België),1930_URUGUAY_FS.htm#13-WC-30-I,Montevideo.,1930
2,1930-URUGUAY,GROUP_STAGE,2-1 (2-0),Yugoslavia (Југославија),Brazil (Brasil),1930_URUGUAY_FS.htm#7-WC-30-I,Montevideo.,1930
3,1930-URUGUAY,GROUP_STAGE,3-1 (1-0),Romania (România),Peru (Perú),1930_URUGUAY_FS.htm#10-WC-30-I,Montevideo.,1930
4,1930-URUGUAY,GROUP_STAGE,1-0 (0-0),Argentina,France,1930_URUGUAY_FS.htm#2-WC-30-I,Montevideo.,1930
...,...,...,...,...,...,...,...,...
7294,2014-BRAZIL,1/4_FINAL,(C),3,4,2014_BRAZIL_FS.htm#832-WC-14-III,Salvador.,2014
7295,2014-BRAZIL,1/2_FINAL,(X),A,B,2014_BRAZIL_FS.htm#833-WC-14-IV,Belo Horizonte.,2014
7296,2014-BRAZIL,1/2_FINAL,(Y),C,D,2014_BRAZIL_FS.htm#834-WC-14-IV,São_Paulo.,2014
7297,2014-BRAZIL,PLACES_3&4,xxx,LOSER X,LOSER Y,2014_BRAZIL_FS.htm#835-WC-14-V,Brasília.,2014


In [5]:
# List the distinct raw round labels present in the first frame.
df["round"].unique()

array(['GROUP_STAGE', '1/2_FINAL', '_FINAL', 'PRELIMINARY-Europe',
       'PRELIMINARY-N/C.America', 'PRELIMINARY-N.E.', 'FIRST',
       '1/4_FINAL', 'PLACES_3&4', 'PRELIMINARY-Eur./N.E.',
       'PRELIMINARY-S.America', 'FINAL_ROUND', 'PRELIMINARY-Eu./Afr.',
       'PRELIMINARY-Asia', 'PRELIMINARY-Afr./As.', 'PRELIMINARY-Euro/As.',
       'PRELIMINARY-E./Afr./As.', 'PRELIMINARY-Af./As./O.',
       'PRELIMINARY-Africa', 'PRELIMINARY-As./O.', 'SEMIFINAL_STAGE',
       'QUARTERFINAL_STAGE', 'PRELIMINARY-O./As.', '1/8_FINAL',
       'PRELIMINARY-Oceania'], dtype=object)

# Mapping des rounds

Valeur standardisée -> Valeurs brutes correspondantes 

`Preliminary` -> PRELIMINARY-*

`Group Stage` -> GROUP_STAGE

`Second Group Stage` -> FINAL_ROUND (1950), SEMIFINAL_STAGE (1974/1978), QUARTERFINAL_STAGE (1982)

`Round of 16` -> 1/8_FINAL, FIRST (1934/1938)

`Quarter-finals` -> 1/4_FINAL

`Semi-finals` -> 1/2_FINAL

`Third Place` -> PLACES_3&4

`Final` -> _FINAL

## Notes importantes:
- **1950**: Pas de phase à élimination directe, poule finale à 4 équipes (FINAL_ROUND → Second Group Stage)
- **1974/1978**: Deuxième tour en poules (SEMIFINAL_STAGE → Second Group Stage)
- **1982**: Deuxième tour en poules (QUARTERFINAL_STAGE → Second Group Stage)
- **1934/1938**: FIRST = Round of 16 (premier tour à élimination directe)

In [6]:
# Display the second frame as loaded.
df2

Unnamed: 0,edition_year,host_country,round,team1,team2,venue,year,score_team1,score_team2,extra_time,penalty_shootout,replay
0,2014,BRAZIL,Group Stage,Brazil,Croatia,Sao Paulo,2014,3,1,False,False,False
1,2014,BRAZIL,Group Stage,Mexico,Cameroon,Natal,2014,1,0,False,False,False
2,2014,BRAZIL,Group Stage,Netherlands,Spain,Salvador,2014,5,1,False,False,False
3,2014,BRAZIL,Group Stage,Chile,Australia,Cuiaba,2014,3,1,False,False,False
4,2014,BRAZIL,Group Stage,Colombia,Greece,Belo Horizonte,2014,3,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
59,2014,BRAZIL,Quarter-finals,Netherlands,Costa Rica,Salvador,2014,0,0,True,True,False
60,2014,BRAZIL,Semi-finals,Germany,Brazil,Belo Horizonte,2014,7,1,False,False,False
61,2014,BRAZIL,Semi-finals,Argentina,Netherlands,Sao Paulo,2014,0,0,True,True,False
62,2014,BRAZIL,Third Place,Netherlands,Brazil,Brasilia,2014,3,0,False,False,False


In [7]:
# List the distinct round labels in the second frame (the target vocabulary for clean_round).
df2["round"].unique()

array(['Group Stage', 'Round of 16', 'Quarter-finals', 'Semi-finals',
       'Third Place', 'Final'], dtype=object)

In [8]:
# 'edition' looks like '<year>-<host>'; split on the first dash only so that
# any further dashes stay inside the host part.
edition_parts = df['edition'].str.split('-', n=1, expand=True)
df['edition_year'] = edition_parts[0].astype(int)
df['host_country'] = edition_parts[1]

# The combined column is no longer needed once it has been split.
df = df.drop(columns=['edition'])

In [9]:
def clean_round(r):
    """
    Map a raw round label to its standardized name.

    The label is stringified, underscores become spaces, surrounding
    whitespace is stripped and the text is upper-cased before matching.

    Args:
        r: Raw round label (any value; it is converted with str()).

    Returns:
        One of 'Preliminary', 'Group Stage', 'Second Group Stage',
        'Round of 16', 'Quarter-finals', 'Semi-finals', 'Third Place' or
        'Final'. When no rule matches, the normalized (upper-cased) label
        itself is returned.
    """
    label = str(r).replace('_', ' ').strip().upper()

    # Each rule: (substrings to look for, exact matches, standardized name).
    # Rules are evaluated top to bottom and the first hit wins, so the order
    # matters: the bare 'FINAL' rule must come last because several earlier
    # labels ('FINAL ROUND', '1/8 FINAL', '1/4 FINAL', '1/2 FINAL') contain it.
    rules = (
        (('PRELIMINARY',), (), 'Preliminary'),
        (('GROUP STAGE',), ('GROUP',), 'Group Stage'),
        # Second-round pools: FINAL_ROUND, SEMIFINAL_STAGE, QUARTERFINAL_STAGE
        (('FINAL ROUND', 'SEMIFINAL STAGE', 'QUARTERFINAL STAGE'), (), 'Second Group Stage'),
        # 'FIRST' is treated as the round of 16
        (('1/8',), ('FIRST',), 'Round of 16'),
        (('1/4',), (), 'Quarter-finals'),
        (('1/2',), (), 'Semi-finals'),
        (('PLACES', '3&4', '3RD', 'THIRD'), (), 'Third Place'),
        (('FINAL',), (), 'Final'),
    )

    for fragments, exact_labels, standardized in rules:
        if label in exact_labels or any(fragment in label for fragment in fragments):
            return standardized

    # Unknown label: hand back the normalized text unchanged.
    return label

# Standardize every raw round label in place using clean_round.
df['round'] = df['round'].apply(clean_round)

In [10]:
def parse_score(score):
    """
    Break a raw score string into structured fields.

    Args:
        score: Raw score value; it is converted with str() before parsing.

    Returns:
        pd.Series with the keys:
        - 'score_team1', 'score_team2': integers taken from the 'X-Y'
          pattern at the very start of the string, or None when the string
          does not start with such a pattern.
        - 'extra_time': True when the text contains '(a.e.t.)' or, ignoring
          case, 'aet'.
        - 'penalty_shootout': True when the text contains a parenthesized
          'X-Y' pattern and, ignoring case, the letter 'p'.
        - 'replay': True when the text contains '(r.)'.
    """
    text = str(score)
    lowered = text.lower()

    # Flags derived from markers embedded in the text.
    went_to_extra_time = '(a.e.t.)' in text or 'aet' in lowered
    is_replay = '(r.)' in text
    bracketed_score = re.search(r'\(\d+-\d+\)', text)
    decided_on_penalties = bool(bracketed_score and ('p' in lowered or 'pso' in lowered))

    # Only an 'X-Y' pattern anchored at the start counts as the final score.
    leading_score = re.match(r'(\d+)-(\d+)', text)
    if leading_score:
        goals_team1, goals_team2 = (int(part) for part in leading_score.groups())
    else:
        goals_team1 = goals_team2 = None

    return pd.Series({
        'score_team1': goals_team1,
        'score_team2': goals_team2,
        'extra_time': went_to_extra_time,
        'penalty_shootout': decided_on_penalties,
        'replay': is_replay
    })

# Parse every raw score string into structured columns (one row per match).
score_df = df['score'].apply(parse_score)

# Attach the parsed columns and discard the raw text column.
df = pd.concat([df, score_df], axis=1).drop(columns=['score'])

# Nullable integer dtype, so rows without a parsed score hold <NA>.
df = df.astype({'score_team1': 'Int64', 'score_team2': 'Int64'})

In [11]:
def extract_english_name(name):
    """
    Remove every parenthesized segment from a team name.

    Args:
        name: Team label; it is converted with str() first.

    Returns:
        The label without any '(...)' segment (and the whitespace preceding
        it), stripped of surrounding whitespace,
        e.g. 'Mexico (México)' -> 'Mexico'.
    """
    without_parentheses = re.sub(r'\s*\([^)]*\)', '', str(name))
    return without_parentheses.strip()

# Strip the parenthesized part from both team columns.
for team_column in ('team1', 'team2'):
    df[team_column] = df[team_column].apply(extract_english_name)

In [12]:
# Venue cleanup with literal (non-regex) replacements:
# remove dots, then turn underscores into spaces.
df['venue'] = (
    df['venue']
    .str.replace('.', '', regex=False)
    .str.replace('_', ' ', regex=False)
)

In [13]:
# The 'url' column is not used by any later cell of this notebook.
df = df.drop(columns=['url'])

In [14]:
# Placeholder labels that stand in for not-yet-known teams.
# isin() matches exact values only.
supp = [
    'A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2',
    'E1', 'E2', 'F1', 'F2', 'G1', 'G2', 'H1', 'H2',
    'WINNER', 'LOSER',
    '1', '2', '3', '4', '5', '6', '7', '8',
    'A', 'B', 'C', 'D',
]

# Keep a row only when neither side is a placeholder and a score was parsed.
is_placeholder = df['team1'].isin(supp) | df['team2'].isin(supp)
df = df[~is_placeholder & df['score_team1'].notna()]

# Append the second frame and rebuild a clean 0..n-1 index.
df = pd.concat([df, df2], ignore_index=True)

In [15]:
# Keep only the columns used from here on, in a fixed order.
df = df[['edition_year', 'host_country', 'round', 'team1', 'team2', 'score_team1', 'score_team2', 'extra_time', 'penalty_shootout', 'replay', 'venue', 'year']]

In [16]:
# Display the combined, cleaned matches frame.
df

Unnamed: 0,edition_year,host_country,round,team1,team2,score_team1,score_team2,extra_time,penalty_shootout,replay,venue,year
0,1930,URUGUAY,Group Stage,France,Mexico,4,1,False,False,False,Montevideo,1930
1,1930,URUGUAY,Group Stage,USA,Belgium,3,0,False,False,False,Montevideo,1930
2,1930,URUGUAY,Group Stage,Yugoslavia,Brazil,2,1,False,False,False,Montevideo,1930
3,1930,URUGUAY,Group Stage,Romania,Peru,3,1,False,False,False,Montevideo,1930
4,1930,URUGUAY,Group Stage,Argentina,France,1,0,False,False,False,Montevideo,1930
...,...,...,...,...,...,...,...,...,...,...,...,...
7294,2014,BRAZIL,Quarter-finals,Netherlands,Costa Rica,0,0,True,True,False,Salvador,2014
7295,2014,BRAZIL,Semi-finals,Germany,Brazil,7,1,False,False,False,Belo Horizonte,2014
7296,2014,BRAZIL,Semi-finals,Argentina,Netherlands,0,0,True,True,False,Sao Paulo,2014
7297,2014,BRAZIL,Third Place,Netherlands,Brazil,3,0,False,False,False,Brasilia,2014


In [17]:
# Pool both team columns, then sort the distinct names.
all_team_names = pd.concat([df['team1'], df['team2']])
equipes = sorted(all_team_names.unique())

In [18]:
# Teams reference table: one row per distinct team name.
df_teams = pd.DataFrame({
    # Sequential 1-based identifier, following the sorted order of `equipes`.
    'id_team': range(1, len(equipes) + 1),
    'nom_standard': equipes,
    # Left empty here; no cell in this part of the notebook fills it.
    'confederation': None,
    # The literal string '[]' (not a Python list) is stored in every row.
    'aliases': '[]'
})

In [19]:
# Display the teams reference table.
df_teams

Unnamed: 0,id_team,nom_standard,confederation,aliases
0,1,Afghanistan,,[]
1,2,Albania,,[]
2,3,Algeria,,[]
3,4,American Samoa,,[]
4,5,Andorra,,[]
...,...,...,...,...
224,225,Yemen,,[]
225,226,Yugoslavia,,[]
226,227,Zaire,,[]
227,228,Zambia,,[]


In [20]:
# Lookup table: standardized team name -> id_team.
team_to_id = {
    team_name: team_id
    for team_name, team_id in zip(df_teams['nom_standard'], df_teams['id_team'])
}

In [21]:
# Constants used to resolve results of knockout matches

# Finals decided on penalties: edition year -> winning team
FINALS_PENALTY_WINNERS = {
    1994: 'Brazil',   # Brazil beat Italy on penalties
    2006: 'Italy',    # Italy beat France on penalties
}

# Rounds in which a draw is a valid final result (pool-style rounds)
GROUP_ROUNDS = ['Preliminary', 'Group Stage', 'Second Group Stage']

# Mapping: current round -> round in which the winner is expected to appear
WINNER_ROUND_MAP = {
    'Round of 16': 'Quarter-finals',
    'Quarter-finals': 'Semi-finals',
    'Semi-finals': 'Final',
}

# For Semi-finals, the loser is expected to appear in the Third Place match
LOSER_ROUND_MAP = {
    'Semi-finals': 'Third Place',
}

def find_winner_from_next_round(row, df_full):
    """
    Infer who won a drawn knockout match from the later rounds of the same
    edition.

    The winner is the side that appears in the round given by
    WINNER_ROUND_MAP; failing that, the side that appears in the round given
    by LOSER_ROUND_MAP is taken to be the loser.

    Args:
        row: Match record providing 'edition_year', 'team1', 'team2' and
            'round'.
        df_full: Matches table with at least the columns 'edition_year',
            'round', 'team1' and 'team2'.

    Returns:
        'home_team' if team1 is inferred as the winner, 'away_team' if team2
        is, or None when neither lookup singles out exactly one of the two
        teams.
    """
    edition = row['edition_year']
    home, away = row['team1'], row['team2']
    stage = row['round']

    def participants(round_name):
        """Teams playing in `round_name` of this match's edition."""
        fixtures = df_full[
            (df_full['edition_year'] == edition) & (df_full['round'] == round_name)
        ]
        return set(fixtures['team1']).union(fixtures['team2'])

    # Winner lookup: exactly one of the two sides must show up in the next round.
    following_round = WINNER_ROUND_MAP.get(stage)
    if following_round is not None:
        advancing = participants(following_round)
        home_advanced = home in advancing
        away_advanced = away in advancing
        if home_advanced and not away_advanced:
            return 'home_team'
        if away_advanced and not home_advanced:
            return 'away_team'

    # Loser lookup: the side found in the loser's round lost, so the other won.
    consolation_round = LOSER_ROUND_MAP.get(stage)
    if consolation_round is not None:
        eliminated = participants(consolation_round)
        home_lost = home in eliminated
        away_lost = away in eliminated
        if home_lost and not away_lost:
            return 'away_team'
        if away_lost and not home_lost:
            return 'home_team'

    return None

def get_result(row, df_full):
    """
    Determine the outcome of a single match.

    Pool-style rounds (GROUP_ROUNDS) may end in a draw. For a knockout match
    with level scores the winner is resolved, in order, from the replay flag,
    the hard-coded FINALS_PENALTY_WINNERS table, and finally
    find_winner_from_next_round.

    Args:
        row: Mapping-like match record (e.g. a DataFrame row) providing
            'score_team1', 'score_team2', 'round', 'edition_year', 'team1',
            'team2' and, optionally, 'replay'.
        df_full: Full matches table, forwarded to find_winner_from_next_round.

    Returns:
        'home_team', 'away_team' or 'draw'. None is returned when either
        score is missing (None / NaN / pd.NA), because no outcome can be
        derived from an unknown score.
    """
    score1 = row['score_team1']
    score2 = row['score_team2']
    round_name = row['round']
    edition = row['edition_year']
    team1 = row['team1']
    team2 = row['team2']

    # Missing scores cannot be compared: None and pd.NA raise TypeError, and
    # NaN fails both comparisons and would silently be reported as a draw.
    if pd.isna(score1) or pd.isna(score2):
        return None

    # Different scores: the outcome is unambiguous
    if score1 > score2:
        return 'home_team'
    elif score1 < score2:
        return 'away_team'

    # Level scores are a legitimate result in pool-style rounds
    if round_name in GROUP_ROUNDS:
        return 'draw'

    # Level scores in a knockout round: special cases

    # 1. A match flagged as replay keeps its draw. A missing flag is treated
    #    as "not a replay" (NaN is truthy, pd.NA raises in a boolean context).
    replay_flag = row.get('replay', False)
    if pd.notna(replay_flag) and replay_flag:
        return 'draw'

    # 2. Final decided on penalties: use the hard-coded winners table
    if round_name == 'Final' and edition in FINALS_PENALTY_WINNERS:
        winner = FINALS_PENALTY_WINNERS[edition]
        if team1 == winner:
            return 'home_team'
        elif team2 == winner:
            return 'away_team'

    # 3. Any other knockout draw: infer the winner from who shows up in the
    #    following round
    winner = find_winner_from_next_round(row, df_full)
    if winner:
        return winner

    # Could not be resolved: report a draw (to be checked manually)
    return 'draw'

In [22]:
# Matches table in its final layout; `df` columns are renamed on the fly.
df_matches = pd.DataFrame({
    # Sequential 1-based identifier
    'id_match': range(1, len(df) + 1),
    # Team names translated to ids through the lookup table
    'home_team_id': df['team1'].map(team_to_id),
    'away_team_id': df['team2'].map(team_to_id),
    'home_result': df['score_team1'],
    'away_result': df['score_team2'],
    # Outcome label computed row by row; `df` itself is passed as the
    # second argument so drawn knockout matches can be resolved
    'result': df.apply(get_result, axis=1, args=(df,)),
    'extra_time': df['extra_time'],
    'penalties': df['penalty_shootout'],
    'replay': df['replay'],
    # Not filled in by this cell
    'date': None,
    'round': df['round'],
    'city': df['venue'],
    # Not filled in by this cell
    'id_stadium': None,
    'edition': df['edition_year']
})

In [23]:
# Pass the three flag columns through an explicit True/False mapping;
# any value other than True or False becomes NaN.
for flag_column in ('extra_time', 'penalties', 'replay'):
    df_matches[flag_column] = df_matches[flag_column].map({True: True, False: False})

In [24]:
# Store both score columns with the nullable integer dtype.
df_matches = df_matches.astype({'home_result': 'Int64', 'away_result': 'Int64'})

In [25]:
# Display the final matches table.
df_matches

Unnamed: 0,id_match,home_team_id,away_team_id,home_result,away_result,result,extra_time,penalties,replay,date,round,city,id_stadium,edition
0,1,76,131,4,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
1,2,213,21,3,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
2,3,226,28,2,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
3,4,164,157,3,1,home_team,False,False,False,,Group Stage,Montevideo,,1930
4,5,10,76,1,0,home_team,False,False,False,,Group Stage,Montevideo,,1930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7294,7295,141,47,0,0,home_team,True,True,False,,Quarter-finals,Salvador,,2014
7295,7296,81,28,7,1,home_team,False,False,False,,Semi-finals,Belo Horizonte,,2014
7296,7297,10,141,0,0,home_team,True,True,False,,Semi-finals,Sao Paulo,,2014
7297,7298,141,28,3,0,home_team,False,False,False,,Third Place,Brasilia,,2014
