In [1]:
# Importation des librairies nécessaires

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.dates as mdates
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Charger les fichiers relatifs aux athlètes qualifiés pour 2024
df = pd.read_csv('PREDICTION.csv')

# Sélection des caractéristiques pertinentes
# Pour simplifier, nous sélectionnons quelques caractéristiques clés
features = ['age','rank_actual','score_total','half_marathon','QP','time_qualif_seconds','vitesse_qualif','Denivele_positif', 'Denivele_negatif','Height','Weight', 'Sponsor']
target = 'finish_time_seconds'  # Nous voulons prédire le temps du marathon

df['Sponsor'] = df.Sponsor.apply(lambda x: 1 if x else 0)
df['Followers'] = df.Sponsor.apply(lambda x: x if x else 0)



In [2]:
df.Sponsor

0      1
1      1
2      1
3      1
4      1
      ..
388    1
389    1
390    1
391    1
392    1
Name: Sponsor, Length: 393, dtype: int64

In [3]:
df.Followers

0      1
1      1
2      1
3      1
4      1
      ..
388    1
389    1
390    1
391    1
392    1
Name: Followers, Length: 393, dtype: int64

In [4]:
# Remplacer les valeurs manquantes par la moyenne

y = df[target].fillna(df[target].mean())  

# Sélection des données pour la modélisation

X = df[features].fillna(df[features].mean())  # Remplir les autres valeurs manquantes par 0 pour simplifier
X.isnull().sum()

age                    0
rank_actual            0
score_total            0
half_marathon          0
QP                     0
time_qualif_seconds    0
vitesse_qualif         0
Denivele_positif       0
Denivele_negatif       0
Height                 0
Weight                 0
Sponsor                0
dtype: int64

In [5]:
# Division des données en ensembles d'entraînement et de test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_test.isnull().sum()

age                    0
rank_actual            0
score_total            0
half_marathon          0
QP                     0
time_qualif_seconds    0
vitesse_qualif         0
Denivele_positif       0
Denivele_negatif       0
Height                 0
Weight                 0
Sponsor                0
dtype: int64

In [6]:
# Normalisation des caractéristiques
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Création du modèle de régression linéaire
model = LinearRegression()

# Entraînement du modèle
model.fit(X_train_scaled, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test_scaled)

# Calcul du RMSE
rmse = (mean_squared_error(y_test, y_pred, squared=False))**0.5

rmse

# Charger les fichiers relatifs aux athlètes qualifiés pour 2024
df_test = pd.read_csv('table_a_predir2.csv')

# Remplir les valeurs manquantes pour les colonnes numériques
numeric_cols = df_test.select_dtypes(include=np.number).columns
df_test[numeric_cols] = df_test[numeric_cols].fillna(df_test[numeric_cols].mean())
df_test['Sponsor'] = df_test.Sponsor.apply(lambda x: 1 if x else 0)

# Normalisation des caractéristiques
X_test_scaled2 = scaler.transform(df_test[features])

# Prédiction sur l'ensemble de test
y_pred2 = model.predict(X_test_scaled2)

In [7]:
rmse

17.92370611464118

In [8]:
df_test['predict_time'] = y_pred2

In [9]:
df_test

Unnamed: 0,year,full_name,gender,age,finish_time,finish_time_seconds,previous_time,progression_percentage,vitesse_finish_time,formatted_date_marathon,...,Sponsor,Followers,Year_inauguration,Estimated_Spectators,Denivele_positif,Denivele_negatif,Month_of_Race,Nb_participant_moy,Season,predict_time
0,2023,Eliud KIPCHOGE,M,40,02:04:05,7406.416667,7530.0,-7.500000e-03,20.417484,29/09/2013,...,1,2.600000e+06,2024,330000,436,438,Aout,88,Summer,7321.558477
1,2020,Benson KIPRUTO,M,33,02:06:42,7602.000000,7513.0,1.000000e-02,19.889503,04/10/2020,...,1,2.449000e+03,2024,330000,436,438,Aout,88,Summer,7391.697458
2,2023,Victor KIPLANGAT,M,25,02:06:03,7563.000000,7855.0,-4.000000e-02,19.992067,26/02/2023,...,1,5.654239e+04,2024,330000,436,438,Aout,88,Summer,7662.495067
3,2023,Gabriel Gerald GEAY,M,28,02:06:10,7474.333333,7495.0,-6.666667e-03,20.231408,05/12/2021,...,1,4.130000e+04,2024,330000,436,438,Aout,88,Summer,7493.484233
4,2023,Sisay LEMMA,M,34,02:10:45,7538.500000,7516.0,-8.673617e-19,20.065406,24/04/2016,...,1,5.654239e+04,2024,330000,436,438,Aout,88,Summer,7360.265316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,2024,Isabel OROPEZA VAZQUEZ,F,34,02:27:57,8877.000000,9005.0,-1.000000e-02,17.032781,18/02/2024,...,1,5.654239e+04,2024,330000,436,438,Aout,88,Summer,8127.150289
148,2024,Tereza HROCHOVÁ,F,28,02:26:38,8798.000000,9111.0,-3.000000e-02,17.185724,18/02/2024,...,1,5.509000e+03,2024,330000,436,438,Aout,88,Summer,8857.012681
149,2024,Luz Mery ROJAS,F,31,02:26:47,8807.000000,9380.0,-6.000000e-02,17.168162,18/02/2024,...,1,3.225000e+03,2024,330000,436,438,Aout,88,Summer,8811.913416
150,2023,Angie ORJUELA,F,35,02:29:12,8957.333333,9147.0,-3.666667e-02,16.887125,06/12/2020,...,1,1.220000e+04,2024,330000,436,438,Aout,88,Summer,8775.735606


In [10]:
import datetime
import math

# Exemple de temps en secondes
temps_en_secondes = df_test['predict_time']

# Convertir en format d'heure (HH:MM:SS)
temps_format_heure = []

for temps in temps_en_secondes:
    heures = int(temps // 3600)
    minutes = int((temps % 3600) // 60)
    secondes = math.ceil(temps % 60)  # Arrondir les secondes vers le haut
    delta = datetime.timedelta(hours=heures, minutes=minutes, seconds=secondes)
    temps_format_heure.append(str(delta))

print(temps_format_heure)

['2:02:02', '2:03:12', '2:07:43', '2:04:54', '2:02:41', '2:05:01', '2:07:24', '2:05:59', '2:08:03', '2:05:40', '2:08:13', '2:07:33', '2:06:59', '2:10:20', '2:06:59', '2:05:38', '2:06:20', '2:08:17', '2:05:57', '2:08:19', '2:08:24', '2:08:37', '2:07:49', '2:07:45', '2:08:48', '2:09:09', '2:07:48', '2:07:44', '2:09:11', '2:06:57', '2:09:24', '2:09:31', '2:08:11', '2:07:13', '2:08:09', '2:06:45', '2:08:01', '2:06:45', '2:07:18', '2:08:27', '2:06:47', '2:08:12', '2:08:00', '2:09:02', '2:09:21', '2:07:59', '2:17:15', '2:06:48', '2:08:20', '2:16:25', '2:08:41', '2:16:09', '2:16:16', '2:16:45', '2:08:42', '2:16:34', '2:07:04', '2:07:29', '2:07:37', '2:15:00', '2:13:59', '2:15:06', '2:14:35', '2:15:13', '2:15:18', '2:15:09', '2:15:09', '2:14:41', '2:05:39', '2:14:47', '2:13:39', '2:14:02', '2:05:10', '2:14:07', '2:13:09', '2:03:38', '2:11:03', '2:13:52', '2:13:44', '2:15:50', '2:17:15', '2:18:37', '2:21:23', '2:25:25', '2:24:37', '2:21:15', '2:17:07', '2:24:46', '2:20:07', '2:23:08', '2:23:28'

In [11]:
df_test['predict_time_hour'] = temps_format_heure

In [12]:
df_test

Unnamed: 0,year,full_name,gender,age,finish_time,finish_time_seconds,previous_time,progression_percentage,vitesse_finish_time,formatted_date_marathon,...,Followers,Year_inauguration,Estimated_Spectators,Denivele_positif,Denivele_negatif,Month_of_Race,Nb_participant_moy,Season,predict_time,predict_time_hour
0,2023,Eliud KIPCHOGE,M,40,02:04:05,7406.416667,7530.0,-7.500000e-03,20.417484,29/09/2013,...,2.600000e+06,2024,330000,436,438,Aout,88,Summer,7321.558477,2:02:02
1,2020,Benson KIPRUTO,M,33,02:06:42,7602.000000,7513.0,1.000000e-02,19.889503,04/10/2020,...,2.449000e+03,2024,330000,436,438,Aout,88,Summer,7391.697458,2:03:12
2,2023,Victor KIPLANGAT,M,25,02:06:03,7563.000000,7855.0,-4.000000e-02,19.992067,26/02/2023,...,5.654239e+04,2024,330000,436,438,Aout,88,Summer,7662.495067,2:07:43
3,2023,Gabriel Gerald GEAY,M,28,02:06:10,7474.333333,7495.0,-6.666667e-03,20.231408,05/12/2021,...,4.130000e+04,2024,330000,436,438,Aout,88,Summer,7493.484233,2:04:54
4,2023,Sisay LEMMA,M,34,02:10:45,7538.500000,7516.0,-8.673617e-19,20.065406,24/04/2016,...,5.654239e+04,2024,330000,436,438,Aout,88,Summer,7360.265316,2:02:41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,2024,Isabel OROPEZA VAZQUEZ,F,34,02:27:57,8877.000000,9005.0,-1.000000e-02,17.032781,18/02/2024,...,5.654239e+04,2024,330000,436,438,Aout,88,Summer,8127.150289,2:15:28
148,2024,Tereza HROCHOVÁ,F,28,02:26:38,8798.000000,9111.0,-3.000000e-02,17.185724,18/02/2024,...,5.509000e+03,2024,330000,436,438,Aout,88,Summer,8857.012681,2:27:38
149,2024,Luz Mery ROJAS,F,31,02:26:47,8807.000000,9380.0,-6.000000e-02,17.168162,18/02/2024,...,3.225000e+03,2024,330000,436,438,Aout,88,Summer,8811.913416,2:26:52
150,2023,Angie ORJUELA,F,35,02:29:12,8957.333333,9147.0,-3.666667e-02,16.887125,06/12/2020,...,1.220000e+04,2024,330000,436,438,Aout,88,Summer,8775.735606,2:26:16


In [13]:
# Créer un dataframe pour les hommes
df_men = df_test[df_test['gender'] == 'M'].copy()
# Trier le dataframe des hommes par 'predict_time' et réinitialiser l'index
df_men = df_men.sort_values(by='predict_time').reset_index(drop=True)
# Ajouter une colonne 'ranking' pour le classement
df_men['ranking'] = df_men.index + 1
df_men

Unnamed: 0,year,full_name,gender,age,finish_time,finish_time_seconds,previous_time,progression_percentage,vitesse_finish_time,formatted_date_marathon,...,Year_inauguration,Estimated_Spectators,Denivele_positif,Denivele_negatif,Month_of_Race,Nb_participant_moy,Season,predict_time,predict_time_hour,ranking
0,2023,Eliud KIPCHOGE,M,40,02:04:05,7406.416667,7530.000000,-7.500000e-03,20.417484,29/09/2013,...,2024,330000,436,438,Aout,88,Summer,7321.558477,2:02:02,1
1,2023,Sisay LEMMA,M,34,02:10:45,7538.500000,7516.000000,-8.673617e-19,20.065406,24/04/2016,...,2024,330000,436,438,Aout,88,Summer,7360.265316,2:02:41,2
2,2020,Benson KIPRUTO,M,33,02:06:42,7602.000000,7513.000000,1.000000e-02,19.889503,04/10/2020,...,2024,330000,436,438,Aout,88,Summer,7391.697458,2:03:12,3
3,2023,Shokhrukh DAVLYATOV,M,28,02:07:02,7622.000000,8269.000000,-8.000000e-02,19.837313,03/12/2023,...,2024,330000,436,438,Aout,88,Summer,7417.020491,2:03:38,4
4,2023,Gabriel Gerald GEAY,M,28,02:06:10,7474.333333,7495.000000,-6.666667e-03,20.231408,05/12/2021,...,2024,330000,436,438,Aout,88,Summer,7493.484233,2:04:54,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2023,Samuel Tsegay TESFAMARIAM,M,36,02:08:06,7800.833333,7648.000000,-1.000000e-02,19.401557,22/04/2012,...,2024,330000,436,438,Aout,88,Summer,8175.220535,2:16:16,72
72,2023,Liam ADAMS,M,38,02:12:52,7939.500000,8172.000000,0.000000e+00,19.044340,24/09/2017,...,2024,330000,436,438,Aout,88,Summer,8184.954447,2:16:25,73
73,2024,Tachlowini GABRIYESOS,M,26,02:10:09,7765.333333,8042.000000,-1.333333e-02,19.471463,20/02/2022,...,2024,330000,436,438,Aout,88,Summer,8193.807175,2:16:34,74
74,2024,Abdi Ali GELELCHU,M,27,02:07:15,7668.500000,7736.000000,-8.000000e-02,19.717400,20/02/2022,...,2024,330000,436,438,Aout,88,Summer,8204.360812,2:16:45,75


In [14]:
df_men[['ranking', 'predict_time_hour','full_name','gender','age','noc']]

Unnamed: 0,ranking,predict_time_hour,full_name,gender,age,noc
0,1,2:02:02,Eliud KIPCHOGE,M,40,KEN
1,2,2:02:41,Sisay LEMMA,M,34,ETH
2,3,2:03:12,Benson KIPRUTO,M,33,KEN
3,4,2:03:38,Shokhrukh DAVLYATOV,M,28,UZB
4,5,2:04:54,Gabriel Gerald GEAY,M,28,TAN
...,...,...,...,...,...,...
71,72,2:16:16,Samuel Tsegay TESFAMARIAM,M,36,ERI
72,73,2:16:25,Liam ADAMS,M,38,AUS
73,74,2:16:34,Tachlowini GABRIYESOS,M,26,ART
74,75,2:16:45,Abdi Ali GELELCHU,M,27,BRN


In [15]:
# Créer un dataframe pour les femmes
df_women = df_test[df_test['gender'] == 'F'].copy()
# Trier le dataframe des femmes par 'predict_time' et réinitialiser l'index
df_women = df_women.sort_values(by='predict_time').reset_index(drop=True)
# Ajouter une colonne 'ranking' pour le classement
df_women['ranking'] = df_women.index + 1

In [16]:
df_women[['ranking', 'predict_time_hour','full_name','gender','age','noc']]

Unnamed: 0,ranking,predict_time_hour,full_name,gender,age,noc
0,1,2:11:03,Tigst ASSEFA,F,28,ETH
1,2,2:13:44,Sifan HASSAN,F,31,NED
2,3,2:13:52,Amane Beriso SHANKULE,F,33,ETH
3,4,2:15:28,Isabel OROPEZA VAZQUEZ,F,34,MEX
4,5,2:15:41,Solange JESUS,F,37,POR
...,...,...,...,...,...,...
71,72,2:28:40,Julia MAYER,F,31,AUT
72,73,2:28:41,Gladys TEJEDA,F,39,PER
73,74,2:28:43,Hanne VERBRUGGEN,F,31,BEL
74,75,2:28:50,Aleksandra LISOWSKA,F,34,POL


In [17]:
df_men.to_csv('men_RANK_JO.csv', index=False)

In [18]:
df_women.to_csv('women_RANK_JO.csv', index=False)