In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
import xml.etree.ElementTree as ET


# Suppress all Python warnings raised from here on.
warnings.filterwarnings("ignore")

# Load the source files.

# Medal records are read from the Excel workbook.
medals_df = pd.read_excel('./olympic_medals.xlsx')

# Olympic host records are read from an XML file: every <row> element becomes
# one record mapping each child's tag name to that child's text.
tree = ET.parse('./olympic_hosts.xml')
root = tree.getroot()

hosts_data = [
    {field.tag: field.text for field in row_elem}
    for row_elem in root.findall('row')
]

hosts_df = pd.DataFrame(hosts_data)

In [5]:
# Rename 'slug_game' to 'game_slug' so the games key is named consistently.
medals_df = medals_df.rename(columns={'slug_game': 'game_slug'})
medals_df.columns

Index(['Unnamed: 0', 'discipline_title', 'game_slug', 'event_title',
       'event_gender', 'medal_type', 'participant_type', 'participant_title',
       'athlete_url', 'athlete_full_name', 'country_name', 'country_code',
       'country_3_letter_code'],
      dtype='object')

In [6]:
# Drop the columns that are not needed for the analysis.
# The result is reassigned (no inplace=True) and missing labels are ignored so
# that re-running this cell does not raise a KeyError once the columns are gone.
medals_df = medals_df.drop(
    columns=['Unnamed: 0', 'participant_title', 'athlete_url'],
    errors='ignore',
)
print(medals_df.head())

  discipline_title     game_slug    event_title event_gender medal_type  \
0          Curling  beijing-2022  Mixed Doubles        Mixed       GOLD   
1          Curling  beijing-2022  Mixed Doubles        Mixed       GOLD   
2          Curling  beijing-2022  Mixed Doubles        Mixed     SILVER   
3          Curling  beijing-2022  Mixed Doubles        Mixed     SILVER   
4          Curling  beijing-2022  Mixed Doubles        Mixed     BRONZE   

  participant_type     athlete_full_name country_name country_code  \
0         GameTeam  Stefania CONSTANTINI        Italy           IT   
1         GameTeam          Amos MOSANER        Italy           IT   
2         GameTeam      Kristin SKASLIEN       Norway           NO   
3         GameTeam    Magnus NEDREGOTTEN       Norway           NO   
4         GameTeam         Almida DE VAL       Sweden           SE   

  country_3_letter_code  
0                   ITA  
1                   ITA  
2                   NOR  
3                   NOR 

In [7]:
# Keep only the medal rows whose country is France.
is_france = medals_df['country_name'].eq('France')
france_medals = medals_df.loc[is_france]
print(france_medals.head())

     discipline_title     game_slug              event_title event_gender  \
26   Freestyle Skiing  beijing-2022  Women's Freeski Big Air        Women   
88          Snowboard  beijing-2022  Women's Snowboard Cross        Women   
139    Figure skating  beijing-2022                Ice Dance        Mixed   
140    Figure skating  beijing-2022                Ice Dance        Mixed   
172          Biathlon  beijing-2022     Men's 12.5km Pursuit          Men   

    medal_type participant_type       athlete_full_name country_name  \
26      SILVER          Athlete             Tess LEDEUX       France   
88      SILVER          Athlete         Chloe TRESPEUCH       France   
139       GOLD         GameTeam     Gabriella PAPADAKIS       France   
140       GOLD         GameTeam       Guillaume CIZERON       France   
172       GOLD          Athlete  Quentin FILLON MAILLET       France   

    country_code country_3_letter_code  
26            FR                   FRA  
88            FR      

In [8]:
# Group France's medal rows by 'game_slug' and count the non-null 'medal_type'
# entries in each group; reset_index turns 'game_slug' back into a column.
medal_rows_by_game = france_medals.groupby('game_slug')['medal_type']
medals_count = medal_rows_by_game.count().reset_index()
print(medals_count.head())

          game_slug  medal_type
0  albertville-1992          10
1    amsterdam-1928          21
2      antwerp-1920          45
3       athens-1896          11
4       athens-2004          36


In [9]:
# Relabel the two columns positionally: the games key and the medal count.
medals_count = medals_count.set_axis(['game_slug', 'medal_count'], axis=1)
print(medals_count.head())

          game_slug  medal_count
0  albertville-1992           10
1    amsterdam-1928           21
2      antwerp-1920           45
3       athens-1896           11
4       athens-2004           36


In [10]:
# Left-join the per-game French medal counts onto the hosts table so that every
# row of hosts_df is kept, including games with no matching medal count.
merged_df = hosts_df.merge(medals_count, on='game_slug', how='left')
# Unmatched games get NaN from the left join; record them as 0 medals.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment, which pandas deprecates and
# which does not update merged_df when Copy-on-Write is enabled.
merged_df['medal_count'] = merged_df['medal_count'].fillna(0)
print(merged_df.head())

  index         game_slug         game_end_date       game_start_date  \
0     0      beijing-2022  2022-02-20T12:00:00Z  2022-02-04T15:00:00Z   
1     1        tokyo-2020  2021-08-08T14:00:00Z  2021-07-23T11:00:00Z   
2     2  pyeongchang-2018  2018-02-25T08:00:00Z  2018-02-08T23:00:00Z   
3     3          rio-2016  2016-08-21T21:00:00Z  2016-08-05T12:00:00Z   
4     4        sochi-2014  2014-02-23T16:00:00Z  2014-02-07T04:00:00Z   

        game_location         game_name game_season game_year  medal_count  
0               China      Beijing 2022      Winter      2022         15.0  
1               Japan        Tokyo 2020      Summer      2020         37.0  
2   Republic of Korea  PyeongChang 2018      Winter      2018         17.0  
3              Brazil          Rio 2016      Summer      2016         45.0  
4  Russian Federation        Sochi 2014      Winter      2014         15.0  


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# NOTE: this cell requires `merged_df` to be defined already; it reads the
# 'game_location', 'game_year', 'game_season' and 'medal_count' columns.

# Single seed shared by the train/test split and TensorFlow so that repeated
# runs of this cell are more repeatable.
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)

# 'is_host': 1.0 when the host location contains 'France', else 0.0.
# A literal (non-regex) substring match mirrors `'France' in x`; na=False maps a
# missing location to 0 instead of raising a TypeError on None.
merged_df['is_host'] = (
    merged_df['game_location']
    .str.contains('France', regex=False, na=False)
    .astype('float32')
)

# 'year': numeric copy of 'game_year'.
merged_df['year'] = merged_df['game_year'].astype('float32')

# 'season': 1.0 for 'Summer' games, 0.0 for anything else.
merged_df['season'] = merged_df['game_season'].eq('Summer').astype('float32')

# The regression target is stored as float32 like the features.
merged_df['medal_count'] = merged_df['medal_count'].astype('float32')

# Show the first rows of the DataFrame as a sanity check.
print("Premières lignes du DataFrame:")
print(merged_df.head())

# Show the column dtypes.
print("\nTypes de données:")
print(merged_df.dtypes)

# Feature matrix and target.
X = merged_df[['year', 'season', 'is_host']]
y = merged_df['medal_count']

# Split BEFORE scaling so the scaler statistics are learned from the training
# rows only; fitting the scaler on the full dataset would leak information
# about the test rows into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Standardize the features using training-set mean and variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix scaled with the training statistics; kept under its
# original name in case other cells reference it.
X_scaled = scaler.transform(X)

# Show the shapes of the training and test sets.
print("\nDimensions des ensembles:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")

# Build the network: three L2-regularized ReLU layers (128, 64, 32 units) with
# dropout after the first two, followed by a single linear output unit.
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dense(1))  # single output unit for regression

# Compile with mean squared error as the loss and mean absolute error as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Show the model summary.
print("\nRésumé du modèle:")
model.summary()

# Train the model, holding out 20% of the training rows for validation.
print("\nEntraînement du modèle...")
history = model.fit(X_train, y_train, epochs=100, batch_size=10, validation_split=0.2)

# Evaluate on the held-out test set.
print("\nÉvaluation du modèle sur l'ensemble de test:")
loss, mae = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, MAE: {mae}')

# Build the feature row for the 2024 prediction, assuming France hosts the
# Games in 2024 and that they are Summer Games.
data_2024 = pd.DataFrame({
    'year': [2024.0],
    'season': [1.0],  # 1 = Summer
    'is_host': [1.0]  # 1 = France is the host
})

# Scale the 2024 row with the same scaler that was fitted on the training data.
data_2024_scaled = scaler.transform(data_2024)

# Predict the medal count for 2024.
pred_2024 = model.predict(data_2024_scaled)
print(f'\nPrédiction du nombre de médailles pour la France en 2024: {pred_2024[0][0]}')



Premières lignes du DataFrame:
  index         game_slug         game_end_date       game_start_date  \
0     0      beijing-2022  2022-02-20T12:00:00Z  2022-02-04T15:00:00Z   
1     1        tokyo-2020  2021-08-08T14:00:00Z  2021-07-23T11:00:00Z   
2     2  pyeongchang-2018  2018-02-25T08:00:00Z  2018-02-08T23:00:00Z   
3     3          rio-2016  2016-08-21T21:00:00Z  2016-08-05T12:00:00Z   
4     4        sochi-2014  2014-02-23T16:00:00Z  2014-02-07T04:00:00Z   

        game_location         game_name game_season game_year  medal_count  \
0               China      Beijing 2022      Winter      2022         15.0   
1               Japan        Tokyo 2020      Summer      2020         37.0   
2   Republic of Korea  PyeongChang 2018      Winter      2018         17.0   
3              Brazil          Rio 2016      Summer      2016         45.0   
4  Russian Federation        Sochi 2014      Winter      2014         15.0   

   is_host    year  season  
0      0.0  2022.0     0.0  
1  