In [36]:
import pandas as pd

# Load the raw records; the CSV uses ';' as its field separator.
df = pd.read_csv('../data/sbb-data.csv', sep=';')

# 'Geopos' stores both coordinates in one comma-separated string.
# Split it into two columns and cast them to float in the same step so the
# values are directly usable for geospatial work.
df[['latitude', 'longitude']] = (
    df['Geopos'].str.split(',', expand=True).astype(float)
)

# Sanity check: show the original string next to the two parsed columns.
geopos_preview = df[['Geopos', 'latitude', 'longitude']].head()
print(geopos_preview)

# Persist the enriched table to a new CSV (optional step), without the index.
df.to_csv('../data/sbb-data_latlong.csv', index=False)




                                 Geopos   latitude  longitude
0   47.5079223325029, 8.760389196167752  47.507922   8.760389
1  47.36660976671019, 8.548489088449456  47.366610   8.548489
2  47.40743350791071, 8.214805494441988  47.407434   8.214805
3   47.39917636823757, 8.44724506333892  47.399176   8.447245
4  47.42807508777434, 8.276363251209915  47.428075   8.276363


In [37]:
# Parse the scheduled and the forecast departure timestamps so that they can
# be subtracted from each other.
df['Departure time'] = pd.to_datetime(df['Departure time'])
df['Departure forecast'] = pd.to_datetime(df['Departure forecast'])

# Normalise the delay flag to a strict boolean column.
# A bare astype(bool) maps a missing value (NaN) to True, which would count
# every train without a flag as delayed once the column is summed. Missing
# flags are therefore forced to False; all other values keep their truthiness.
# NOTE(review): confirm that "not delayed" is the wanted reading of a missing flag.
departure_flag = df['Departure delay']
df['Departure delay'] = departure_flag.notna() & departure_flag.astype(bool)

# Delay = forecast departure - scheduled departure.
df['delay'] = df['Departure forecast'] - df['Departure time']

# Replace every delay longer than 24 hours with NaT so it is ignored by later
# aggregations.
# NOTE(review): the earlier comment on this step said "10 minutes" while the
# code compared against 24 hours; the coded threshold is kept — confirm.
max_delay = pd.Timedelta(hours=24)
df.loc[df['delay'] > max_delay, 'delay'] = pd.NaT

In [None]:
# Per-station summary: largest delay, number of non-null 'Day of operation'
# entries (used as the train count) and the sum of the 'Departure delay' flag
# (used as the count of delayed trains).
station_data = df.groupby('Stop name', as_index=False).agg(
    {'delay': 'max', 'Day of operation': 'count', 'Departure delay': 'sum'}
)
station_data['delay rate'] = (
    station_data['Departure delay'].div(station_data['Day of operation']).mul(100)
)

# Per-line summary: same counts as above, without the maximum delay.
line_data = df.groupby('Line Text', as_index=False).agg(
    {'Day of operation': 'count', 'Departure delay': 'sum'}
)
line_data['delay rate'] = (
    line_data['Departure delay'].div(line_data['Day of operation']).mul(100)
)

# Attach both summaries to every original row (left joins on the station name,
# then on the line name). Only column names that collide receive a suffix:
# the station-level 'delay rate' is new at the first merge and keeps its plain
# name, while the line-level one collides with it and becomes 'delay rate_line'.
data = (
    df.merge(station_data, how='left', on='Stop name', suffixes=('', '_station'))
      .merge(line_data, how='left', on='Line Text', suffixes=('', '_line'))
)

# Turn the row-level boolean delay flag into a human-readable French label.
delay_labels = {True: 'En retard', False: 'Pas en retard'}
data['Departure delay'] = data['Departure delay'].replace(delay_labels)

# Columns kept for the export, in output order, mapped to their French
# display names.
export_columns = {
    'Stop name': 'nom gare',
    'delay_station': 'retard max gare',
    'Day of operation_station': 'nombre total train gare',
    'Departure delay_station': 'nombre retard train gare',
    'delay rate': 'taux retard gare',
    'longitude': 'longitude',
    'latitude': 'latitude',
    'Line Text': 'Nom Ligne',
    'Day of operation_line': 'nombre total train ligne',
    'Departure delay_line': 'nombre retard train ligne',
    'delay rate_line': 'taux retard ligne',
    'Departure delay': 'retard',
    'Arrival forecast': 'heure d\'arrivé',
}
data = data[list(export_columns)].rename(columns=export_columns)

# Write the flattened table to disk without the index.
data.to_csv('../data/sbb-data_grouped.csv', index=False)

RangeIndex(start=0, stop=61858, step=1)