# 1. Import des bibliothèques


In [None]:
import pandas as pd
import numpy as np
import re
import dateparser

from datetime import datetime
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset path used later lives under this mount point.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Nettoyage du dataset de matchs de baby-foot


In [None]:
import pandas as pd
import numpy as np

# Location of the raw export on the mounted Google Drive, and its field delimiter.
FILE_PATH = '/content/drive/MyDrive/babyfoot_dataset.csv'
SEPARATEUR = ','

# low_memory=False makes pandas infer each column's dtype from the whole column
# instead of chunk by chunk. The raw file has columns that mix numbers and text,
# and the chunked default can type the same column differently across chunks
# (pandas reports this as a DtypeWarning).
df = pd.read_csv(FILE_PATH, sep=SEPARATEUR, low_memory=False)

  df = pd.read_csv(FILE_PATH, sep=SEPARATEUR)


In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,game_id,game_date,location,table_id,table_condition,ball_type,music_playing,referee,game_duration,final_score_red,...,possession_time,mood,player_comment,team_color,is_substitute,ping_ms,notes,duplicate_flag,misc,created_at
0,G015295,Feb 06st 2023,Ynov Toulouse,T05,beer stains,,Spotify: Queen - We Will Rock You,Paul Kim,12.45,0,...,9:36,1,ref biased,Red,yes,185.0,injured,,,2025-10-02T10:41:54
1,G023800,24-03-2023,Cafeteria (1st floor),T07,worn,,Indie playlist,,8.57,10,...,,🙂,,Blue,yes,,,0.0,-,2025-10-02T10:41:55
2,G023577,2025-01-13,Gym Hall,T26,scratched,,Spotify: Queen - We Will Rock You,Lena Clement,17.2,2,...,5.09min,2,,R,no,,double booked,0.0,,2025-10-02T10:41:55
3,G020644,Nov 11 2025,Salle Polyvalente,T21,worn,mini ball,EDM mix,Isabella Girard,5.18,6,...,,3,team spirit high,B,maybe,,double booked,,-,2025-10-02T10:41:54
4,G011677,30 Sep 23,Campus - Cafeteria,T26,missing screw,trainer ball,Oldies 80s,yes,6min,3,...,177,😂,rage quit,Red,yes,,,,,2025-10-02T10:41:54


In [None]:
# (row count, column count) of the frame at this point.
df.shape

(100200, 35)

In [None]:
# Drop rows where either key identifier (player or game) is missing.
df.dropna(subset=['player_id', 'game_id'], inplace=True)

# Keep a single row per (game_id, player_id) pair; with the pandas default
# (keep='first') the later repeats are the ones removed.
df.drop_duplicates(subset=['game_id', 'player_id'], inplace=True)

# NOTE(review): the message mentions an ID standardisation, but this cell only
# drops rows - confirm whether a normalisation step is missing.
print(f"Après standardisation des IDs et suppression des doublons : {df.shape[0]} lignes restantes.")

Après standardisation des IDs et suppression des doublons : 100000 lignes restantes.


In [None]:
def clean_final_score(row):
    """Return the cleaned ``[red, blue]`` final scores for one row.

    ``final_score_red`` sometimes carries both scores as ``'x - y'``; otherwise
    each column carries its own score. Stray U+FFFD replacement characters are
    removed from both values before they are parsed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'final_score_red'`` and ``'final_score_blue'``.

    Returns
    -------
    pandas.Series
        Two elements ``[red, blue]``. Both are ``pd.NA`` when the red value is
        missing, when a value cannot be parsed as a number, or when a score is
        outside 0-10. Only ``blue`` is ``pd.NA`` when the blue value alone is
        missing.
    """
    raw_red = row['final_score_red']
    raw_blue = row['final_score_blue']

    # Without a red value there is nothing to parse (it may hold both scores).
    if pd.isna(raw_red):
        return pd.Series([pd.NA, pd.NA])

    s_red = str(raw_red).strip().lower().replace('\ufffd', '')

    try:
        if ' - ' in s_red:
            # Combined 'x - y' layout: both scores come from the red column.
            parts = s_red.split(' - ')
            red = int(float(parts[0].strip()))
            blue = int(float(parts[1].strip()))
        elif pd.isna(raw_blue):
            red = int(float(s_red))
            blue = pd.NA
        else:
            # Parse the *cleaned* blue value so a stray U+FFFD does not
            # invalidate an otherwise valid score.
            s_blue = str(raw_blue).strip().lower().replace('\ufffd', '')
            red = int(float(s_red))
            blue = int(float(s_blue))
    except (ValueError, OverflowError):
        # Non-numeric text, 'nan' or 'inf': treat the row's scores as unknown.
        return pd.Series([pd.NA, pd.NA])

    # Business rule: a score must lie between 0 and 10.
    if not (0 <= red <= 10 and (blue is pd.NA or 0 <= blue <= 10)):
        return pd.Series([pd.NA, pd.NA])

    return pd.Series([red, blue])

# Row-by-row application; the two returned values overwrite both score columns.
# The nullable 'Int64' dtype lets integer scores coexist with missing values.
df[['final_score_red','final_score_blue']] = df.apply(clean_final_score, axis=1).astype('Int64')


In [None]:
# Show up to five rows where both final scores are missing.
df[df['final_score_red'].isna() & df['final_score_blue'].isna()].head(5)


Unnamed: 0,game_id,game_date,location,table_id,table_condition,ball_type,music_playing,referee,game_duration,final_score_red,...,possession_time,mood,player_comment,team_color,is_substitute,ping_ms,notes,duplicate_flag,misc,created_at
69603,G006559,2023/06/30,Ynov Toulouse,T06,worn,mini ball,Indie playlist,Taylor Bernard,11.46,,...,370,,8 goals in one minute,Red,no,125,double booked,yes,,2025-10-02T10:41:53


In [None]:
# List the distinct values present in the two colour columns before they are standardised.
print(df["team_color"].unique())
print(df["winner"].unique())

['Red' 'Blue' 'R' 'B' '🔴' '🔵' 'blue' 'red' 'Red�' 'Blue�']
['Blue' 'red' 'Bleu' 'blue' 'Red' nan 'R' 'B' 'BLUE' 'Rouge' 'draw' 'TIE'
 'RED' 'tie' 'RED�' 'Rouge�']


In [None]:
def standardize_color(color):
    """Map a raw colour / result label onto a canonical upper-case label.

    The value is trimmed and lower-cased, then looked up among known aliases.

    Returns
    -------
    str
        'RED', 'BLUE' or 'DRAW' for a recognised alias; 'UNKNOWN' for a
        missing value or any unrecognised text.
    """
    if pd.isna(color):
        return 'UNKNOWN'

    key = str(color).strip().lower()

    # Canonical label -> every spelling observed for it (already lower-cased).
    aliases = {
        'RED': ('red', 'r', 'redteam', '🔴', 'red�', 'rouge', 'rouge�'),
        'BLUE': ('blue', 'b', 'blueteam', '🔵', 'blue�', 'bleu', 'bleu�'),
        'DRAW': ('tie', 'draw'),
    }
    for label, spellings in aliases.items():
        if key in spellings:
            return label
    return 'UNKNOWN'

# Standardise both colour columns with standardize_color.
df['winner'] = df['winner'].apply(standardize_color)
df['team_color'] = df['team_color'].apply(standardize_color)

# Drop unusable matches: rows missing either final score, then rows whose
# winner was mapped to 'UNKNOWN'.
df.dropna(subset=['final_score_red', 'final_score_blue'], inplace=True)
df = df[df['winner'] != 'UNKNOWN']
print(f"Après nettoyage des scores et des gagnants : {df.shape[0]} lignes restantes.")

Après nettoyage des scores et des gagnants : 95339 lignes restantes.


In [None]:
def clean_player_stat(stat):
    """Convert one raw player statistic (goals, saves, ...) to an integer.

    Accepts numbers, numeric strings (``'3'``, ``'3.0'``) and English number
    words from 'zero' to 'ten' (case and surrounding spaces are ignored).

    Returns
    -------
    int
        The parsed value, truncated towards zero for decimals. Missing or
        unparseable input yields 0.
    """
    if pd.isna(stat):
        return 0
    stat = str(stat).strip().lower()

    # Spelled-out numbers. The table covers 0-10 because scores are capped at 10;
    # stopping at 'five' would silently turn 'six'..'ten' into 0.
    text_to_num = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    }
    if stat in text_to_num:
        return text_to_num[stat]

    try:
        # Go through float first so that '3.0' is accepted.
        return int(float(stat))
    except (ValueError, OverflowError):
        # Non-numeric text, 'nan' or 'inf'.
        return 0

# Per-player counters to clean; each column is overwritten in place with the
# result of clean_player_stat.
stats_cols = ['player_goals', 'player_own_goals', 'player_assists', 'player_saves']
for col in stats_cols:
    df[col] = df[col].apply(clean_player_stat)

print("Statistiques individuelles (buts, saves, assists) nettoyées.")

Statistiques individuelles (buts, saves, assists) nettoyées.


In [None]:
# Spot-check the four cleaned statistic columns.
df[['player_goals', 'player_own_goals', 'player_assists', 'player_saves']].head(5)

Unnamed: 0,player_goals,player_own_goals,player_assists,player_saves
0,0,0,4,3
1,0,0,2,2
2,2,0,1,1
3,0,0,5,1
4,0,0,5,8


In [None]:
# Parse the match date. The column mixes many layouts (ISO dates, '24-03-2023',
# '30 Sep 23', 'Nov 11 2025', ...). Without format='mixed' (pandas >= 2.0),
# to_datetime infers ONE format from the first value and coerces every value in
# another layout to NaT, so most rows were then removed by the dropna below.
# NOTE(review): ambiguous numeric dates such as '03-04-2023' are read
# month-first by default - confirm whether dayfirst=True is wanted.
df['game_date'] = pd.to_datetime(df['game_date'], format='mixed', errors='coerce')
# Rows whose date still cannot be parsed are discarded.
df.dropna(subset=['game_date'], inplace=True)
df['DayOfWeek'] = df['game_date'].dt.day_name()

import re

def duration_to_seconds(duration):
    """Convert a free-form duration into a whole number of seconds.

    Recognised layouts (after trimming and lower-casing):

    * ``HH:MM:SS`` or ``MM:SS`` with integer fields;
    * a number followed by ``min`` (``'10min'``, ``'4,5 min'``), in minutes;
    * a bare number using ``.`` or ``,`` as decimal mark, taken as minutes.

    Returns
    -------
    int or float
        Seconds as an ``int`` (rounded to the nearest second), or ``np.nan``
        when the value is missing or matches none of the layouts.
    """
    if pd.isna(duration):
        return np.nan

    s = str(duration).strip().lower()

    # 1) Clock layout: HH:MM:SS or MM:SS
    if ':' in s:
        try:
            parts = [int(field) for field in s.split(':')]
        except ValueError:
            return np.nan
        if len(parts) == 3:
            return parts[0] * 3600 + parts[1] * 60 + parts[2]
        if len(parts) == 2:
            return parts[0] * 60 + parts[1]
        # Any other number of fields is not a supported clock layout.
        return np.nan

    # 2) Minutes with a unit ('10min', '4 min'), then
    # 3) bare decimal minutes ('4,1' or '4.1').
    match_min = re.match(r'(\d+(\.\d+)?|\d+,\d+)\s*min', s)
    if match_min:
        minutes_text = match_min.group(1)
    elif re.match(r'^\d+([.,]\d+)?$', s):
        minutes_text = s
    else:
        return np.nan

    try:
        # round() rather than int(): binary floats make 4.1 * 60 evaluate to
        # 245.99999999999997, which truncation would turn into 245 seconds.
        return round(float(minutes_text.replace(',', '.')) * 60)
    except (ValueError, OverflowError):
        return np.nan

def clean_attendance(value):
    """Pull the first run of digits out of an attendance entry.

    Returns
    -------
    int or float
        The first digit sequence found in the text form of ``value`` as an
        ``int``; ``np.nan`` when the value is missing or contains no digit.
    """
    if pd.isna(value):
        return np.nan

    # Only the first number counts; anything after it is ignored.
    digits = re.search(r'\d+', str(value).strip().lower())
    return int(digits.group()) if digits else np.nan

# Attendance: first number found in the cell, stored as nullable integer.
df['attendance_count'] = df['attendance_count'].apply(clean_attendance).astype('Int64')

# Game duration: converted to seconds.
df['game_duration'] = df['game_duration'].apply(duration_to_seconds)

# Remaining categorical columns: trim, upper-case, then merge known variants.
df['location'] = df['location'].str.strip().str.upper().replace(['TOULOUSE','YNOV TLS', 'YNOV - BÂTIMENT A','YNOV_TLS'], 'YNOV TOULOUSE').fillna('UNKNOWN')

df['player_role'] = df['player_role'].str.strip().str.upper().replace(['DEF','DEFENCE'], 'DEFENSE')
df['player_role'] = df['player_role'].str.strip().str.upper().replace(['ATT', 'ATTCK'], 'ATTACK').fillna('UNKNOWN')

# The values are upper-cased BEFORE the replace, so the placeholders must be
# listed in upper case; the lower/mixed-case spellings never matched and
# values such as 'NO' were left in the column.
df['referee'] = df['referee'].str.strip().str.upper().replace(['YES', 'NO', 'PLAYER1'], 'NONE').fillna('NONE')

# Seasons: merge the alternative spellings (case is left untouched here).
df['season'] = df['season'].str.strip().replace(['s24/25','Season 24-25'], '2024/2025')
df['season'] = df['season'].str.strip().replace('2025 Season', '2025')

# --- PÔLE 3 : Création de Features IA ---
#df['IsWinner'] = np.where(df['team_color'] == df['winner'], 1, 0)
#df['ScoreDifference'] = abs(df['final_score_red'] - df['final_score_blue'])


In [None]:
# Spot-check the columns transformed in the previous cell.
df[['game_id','game_date', 'DayOfWeek', 'game_duration', 'location', 'player_role', 'referee', 'attendance_count']].head(5)

Unnamed: 0,game_id,game_date,DayOfWeek,game_duration,location,player_role,referee,attendance_count
0,G015295,2023-02-06,Monday,747,YNOV TOULOUSE,DEFENSE,PAUL KIM,8
13,G009714,2023-03-06,Monday,2304,GYM HALL,DEFENSE,LEO NAKAMURA,5
51,G003438,2024-10-13,Sunday,600,YNOV TOULOUSE,DEFENSE,NO,4
53,G012107,2025-03-25,Tuesday,805,SALLE POLYVALENTE,DEFENSE,NONE,4
80,G010274,2024-04-20,Saturday,1009,YNOV TOULOUSE,ATTACK,LENA MARTIN,4


In [None]:
# --- PÔLE 2 : CLEANING rating_raw (pour la suite de l'EDA) ---
def standardize_rating(rating):
    """Map a free-form rating onto an integer 1-5 scale.

    Resolution order (on the trimmed, lower-cased text):

    1. the FIRST number in the text, if it lies in [1, 5] (decimals are rounded
       half up), so '4/5' is read as 4 and not as 5;
    2. the number of star symbols, capped at 5;
    3. a keyword: excellent=5, good=4, ok=3, bad=2, terrible=1.

    Returns
    -------
    int or float
        The score as an ``int`` between 1 and 5, or ``np.nan`` when the value
        is missing or none of the rules applies (e.g. '10', 'n/a').
    """
    if pd.isna(rating):
        return np.nan
    text = str(rating).strip().lower()

    # 1) Leading number. Testing for single digit characters anywhere in the
    #    text would misread '4/5', '3.5' or '10'.
    number = re.search(r'\d+(?:[.,]\d+)?', text)
    if number:
        value = float(number.group().replace(',', '.'))
        if 1 <= value <= 5:
            return int(value + 0.5)

    # 2) Star symbols.
    stars = text.count('⭐')
    if stars:
        return min(stars, 5)

    # 3) Keywords, highest score first.
    keyword_scores = (('excellent', 5), ('good', 4), ('ok', 3), ('bad', 2), ('terrible', 1))
    for keyword, score in keyword_scores:
        if keyword in text:
            return score

    return np.nan

# Overwrite rating_raw in place with the 1-5 score; the nullable 'Int64' dtype
# keeps missing ratings as <NA>.
df['rating_raw'] = df['rating_raw'].apply(standardize_rating).astype('Int64')

# The message names the column that is actually modified: no separate
# 'rating_standardized' column is created.
print("Colonne 'rating_raw' standardisée (échelle 1-5) pour l'analyse d'expérience.")

Colonne 'rating_standardized' créée pour l'analyse d'expérience.


In [None]:
# Columns kept for the final dataset: the ones chosen by the group plus the
# features created during cleaning.
colonnes_finales = [
    'game_id', 'game_date', 'DayOfWeek', 'location', 'game_duration',
    'final_score_red', 'final_score_blue', 'winner',
    'player_id', 'player_canonical_name', 'team_color', 'player_role',
    'player_goals', 'player_own_goals', 'player_assists', 'player_saves',
    'referee', 'attendance_count', 'season', 'recorded_by', 'rating_raw',
    'player_comment'
]

# Write the cleaned file. DataFrame.filter(items=...) keeps only the listed
# columns that exist in df; a listed name that is absent is skipped silently.
df_clean = df.filter(items=colonnes_finales).copy()
df_clean.to_csv('dataset_Ynov_babyfoot_CLEAN.csv', index=False)

# Final summary: output file name and number of rows written.
print("\n-------------------------------------------------------------")
print("LE NETTOYAGE EST TERMINÉ. Le fichier 'dataset_Ynov_babyfoot_CLEAN.csv' est prêt.")
print("Lignes Finales Utiles :", df_clean.shape[0])
print("-------------------------------------------------------------")


-------------------------------------------------------------
LE NETTOYAGE EST TERMINÉ. Le fichier 'dataset_Ynov_babyfoot_CLEAN.csv' est prêt.
Lignes Finales Utiles : 2052
-------------------------------------------------------------
