# Importing necessary libraries

In [1]:

import pandas as pd      # For data manipulation and analysis
import numpy as np       # For numerical operations and arrays


# Loading and Initial Data Inspection

In [2]:
# Relative path of the CSV file to load
file_path = 'regularite-mensuelle-tgv-aqst.csv'

# The file is semicolon-separated, so the separator is passed explicitly to the parser
data = pd.read_csv(file_path, sep=";")

# Preview the first five rows as a quick sanity check of the load
data.head(5)

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,commentaire_annulation,nb_train_depart_retard,retard_moyen_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
0,2018-01,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,,289,11.247809,...,110,6.511118,44,8,36.134454,31.092437,10.92437,15.966387,5.042017,0.840336
1,2018-01,National,LA ROCHELLE VILLE,PARIS MONTPARNASSE,165,222,0,,8,2.875,...,22,5.696096,5,0,15.384615,30.769231,38.461538,11.538462,3.846154,0.0
2,2018-01,National,PARIS MONTPARNASSE,QUIMPER,220,248,1,,37,9.501351,...,26,7.548387,17,7,26.923077,38.461538,15.384615,19.230769,0.0,0.0
3,2018-01,National,PARIS MONTPARNASSE,ST MALO,156,102,0,,12,19.9125,...,8,6.724757,6,4,23.076923,46.153846,7.692308,15.384615,7.692308,0.0
4,2018-01,National,PARIS MONTPARNASSE,ST PIERRE DES CORPS,61,391,2,,61,7.796995,...,17,3.346487,6,0,21.212121,42.424242,9.090909,21.212121,6.060606,0.0


In [3]:
# Dimensions of the loaded frame, returned as a (number of rows, number of columns) tuple
data.shape


(8154, 26)

In [4]:
# List the column labels of the dataset as loaded, before any cleaning step
data.columns

Index(['date', 'service', 'gare_depart', 'gare_arrivee', 'duree_moyenne',
       'nb_train_prevu', 'nb_annulation', 'commentaire_annulation',
       'nb_train_depart_retard', 'retard_moyen_depart',
       'retard_moyen_tous_trains_depart', 'commentaire_retards_depart',
       'nb_train_retard_arrivee', 'retard_moyen_arrivee',
       'retard_moyen_tous_trains_arrivee', 'commentaires_retard_arrivee',
       'nb_train_retard_sup_15', 'retard_moyen_trains_retard_sup15',
       'nb_train_retard_sup_30', 'nb_train_retard_sup_60',
       'prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic',
       'prct_cause_materiel_roulant', 'prct_cause_gestion_gare',
       'prct_cause_prise_en_charge_voyageurs'],
      dtype='object')

In [5]:
# Print a per-column summary (non-null count and dtype); useful for spotting columns
# that are entirely or mostly empty before deciding what to clean
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   date                                  8154 non-null   object 
 1   service                               8154 non-null   object 
 2   gare_depart                           8154 non-null   object 
 3   gare_arrivee                          8154 non-null   object 
 4   duree_moyenne                         8154 non-null   int64  
 5   nb_train_prevu                        8154 non-null   int64  
 6   nb_annulation                         8154 non-null   int64  
 7   commentaire_annulation                0 non-null      float64
 8   nb_train_depart_retard                8154 non-null   int64  
 9   retard_moyen_depart                   8154 non-null   float64
 10  retard_moyen_tous_trains_depart       8154 non-null   float64
 11  commentaire_retar

# Data cleaning

## Counting Non-Null Comments

In [6]:

# Isolate the cancellation-comment column to check whether it holds any data at all
commentaires = data['commentaire_annulation']

# Series.count() returns the number of non-NA entries in the column
non_nan_count = commentaires.count()

# Report how many cancellation comments are actually filled in
print(f'Nombre de commentaires annulation non NaN : {non_nan_count}')


Nombre de commentaires annulation non NaN : 0


In [7]:
# The cancellation-comment column contains no non-null values, so it carries no
# information and is removed from the dataset.
# Rebinding `data` (rather than using inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: the in-place version raised KeyError on a second execution
# because the column had already been removed.
data = data.drop(columns=['commentaire_annulation'], errors='ignore')

In [8]:
# Check whether the departure-delay comment column holds any data

# Isolate the column of comments attached to departure delays
commentaires_r = data['commentaire_retards_depart']

# Series.count() returns the number of non-NA entries in the column
non_nan_count = commentaires_r.count()

# Report how many departure-delay comments are actually filled in
print(f'Nombre de commentaires non NaN pour les retards au départ : {non_nan_count}')


Nombre de commentaires non NaN pour les retards au départ : 0


In [9]:
# The departure-delay comment column contains no non-null values, so it carries no
# information and is removed from the dataset.
# Rebinding `data` (rather than using inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: the in-place version raised KeyError on a second execution
# because the column had already been removed.
data = data.drop(columns=['commentaire_retards_depart'], errors='ignore')


In [11]:
# Check how many arrival-delay comments are filled in, to decide whether the column is worth keeping

# Isolate the column of comments attached to arrival delays
commentaires_a = data['commentaires_retard_arrivee']

# Series.count() returns the number of non-NA entries in the column
non_nan_count = commentaires_a.count()

# Report how many arrival-delay comments are actually filled in
print(f'Nombre de commentaires non NaN pour les retards à l\'arrivée : {non_nan_count}')



Nombre de commentaires non NaN pour les retards à l'arrivée : 698


In [13]:

# This column does hold some non-null comments, but it was judged not to contribute
# significantly to the project's goal of predicting train punctuality, so it is removed
# to streamline the dataset.
# Rebinding `data` (rather than using inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: the in-place version raised KeyError on a second execution
# because the column had already been removed.
data = data.drop(columns=['commentaires_retard_arrivee'], errors='ignore')

In [14]:
# Keep a reference to the column labels remaining after the comment columns were dropped
nom_des_colonnes = data.columns

# Bare last expression so the notebook renders the Index of remaining column names
nom_des_colonnes


Index(['date', 'service', 'gare_depart', 'gare_arrivee', 'duree_moyenne',
       'nb_train_prevu', 'nb_annulation', 'nb_train_depart_retard',
       'retard_moyen_depart', 'retard_moyen_tous_trains_depart',
       'nb_train_retard_arrivee', 'retard_moyen_arrivee',
       'retard_moyen_tous_trains_arrivee', 'nb_train_retard_sup_15',
       'retard_moyen_trains_retard_sup15', 'nb_train_retard_sup_30',
       'nb_train_retard_sup_60', 'prct_cause_externe', 'prct_cause_infra',
       'prct_cause_gestion_trafic', 'prct_cause_materiel_roulant',
       'prct_cause_gestion_gare', 'prct_cause_prise_en_charge_voyageurs'],
      dtype='object')

## Looking for duplicates

In [15]:
# Remove duplicate rows and report how many were found

# Build a de-duplicated frame; `data` itself is left untouched
Data_no_dup = data.drop_duplicates()

# Comparing row counts states the number of removed duplicates explicitly, instead of
# relying on a visual check of the truncated frame display below
nb_doublons = len(data) - len(Data_no_dup)
print(f'Nombre de lignes dupliquées supprimées : {nb_doublons}')

# Display the resulting dataset with duplicates removed
Data_no_dup


Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart,retard_moyen_tous_trains_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
0,2018-01,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,289,11.247809,3.693179,...,110,6.511118,44,8,36.134454,31.092437,10.924370,15.966387,5.042017,0.840336
1,2018-01,National,LA ROCHELLE VILLE,PARIS MONTPARNASSE,165,222,0,8,2.875000,0.095796,...,22,5.696096,5,0,15.384615,30.769231,38.461538,11.538462,3.846154,0.000000
2,2018-01,National,PARIS MONTPARNASSE,QUIMPER,220,248,1,37,9.501351,1.003981,...,26,7.548387,17,7,26.923077,38.461538,15.384615,19.230769,0.000000,0.000000
3,2018-01,National,PARIS MONTPARNASSE,ST MALO,156,102,0,12,19.912500,1.966667,...,8,6.724757,6,4,23.076923,46.153846,7.692308,15.384615,7.692308,0.000000
4,2018-01,National,PARIS MONTPARNASSE,ST PIERRE DES CORPS,61,391,2,61,7.796995,0.886889,...,17,3.346487,6,0,21.212121,42.424242,9.090909,21.212121,6.060606,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8149,2023-06,National,STRASBOURG,PARIS EST,114,492,2,273,11.989927,6.742041,...,80,35.446667,31,10,7.619048,5.714286,55.238095,18.095238,7.619048,5.714286
8150,2023-06,National,TOULOUSE MATABIAU,PARIS MONTPARNASSE,273,215,0,24,28.730556,3.181240,...,36,76.688889,24,10,13.888889,33.333333,8.333333,19.444444,8.333333,16.666667
8151,2023-06,National,TOURS,PARIS MONTPARNASSE,78,192,1,20,31.281667,3.276353,...,13,97.394872,7,5,14.285714,21.428571,28.571429,21.428571,3.571429,10.714286
8152,2023-06,National,VALENCE ALIXAN TGV,PARIS LYON,133,440,2,347,13.517051,10.271499,...,96,47.424479,62,22,26.724138,19.827586,27.586207,7.758621,7.758621,10.344828


On n'a donc pas de doublons.
