In [26]:
import pandas as pd

# Read the raw dataset from a semicolon-separated CSV file
df = pd.read_csv('../data/sbb-data.csv', sep=';')

# Break the "Geopos" text column apart at the comma; the two resulting pieces
# become the new latitude and longitude columns
coord_cols = ['latitude', 'longitude']
df[coord_cols] = df['Geopos'].str.split(',', expand=True)

# Cast both new columns from text to float so they can be used as numeric coordinates
df = df.astype({col: float for col in coord_cols})

# Sanity check: show the original text next to the parsed values
preview = df[['Geopos'] + coord_cols].head()
print(preview)

# Persist the enriched frame to a new CSV file (optional)
df.to_csv('../data/sbb-data_latlong.csv', index=False)




                                 Geopos   latitude  longitude
0   47.5079223325029, 8.760389196167752  47.507922   8.760389
1  47.36660976671019, 8.548489088449456  47.366610   8.548489
2  47.40743350791071, 8.214805494441988  47.407434   8.214805
3   47.39917636823757, 8.44724506333892  47.399176   8.447245
4  47.42807508777434, 8.276363251209915  47.428075   8.276363


In [72]:
# Parse the scheduled and forecast departure columns into datetimes
df['Departure time'] = pd.to_datetime(df['Departure time'])
df['Departure forecast'] = pd.to_datetime(df['Departure forecast'])
# Boolean delay flag: summed below to count delayed departures and used as a row mask
df['Departure delay'] = df['Departure delay'].astype(bool)
# Delay duration = forecast departure minus scheduled departure
df['delay'] = df['Departure forecast'] - df['Departure time']

# Replace every delay longer than 24 hours by NaT so it is left out of the sum below
# NOTE(review): the previous comment said "10 minutes" while the code has always
# used a 24-hour threshold; confirm which cutoff is intended.
df.loc[df['delay'] > pd.Timedelta(hours=24), 'delay'] = pd.NaT

# Row mask of delayed departures, built once instead of once per group
is_delayed = df['Departure delay'] == True

# Group by 'Line Text' and compute, per line:
#   - number of rows
#   - number of departures flagged as delayed
#   - total delay duration
#   - list of all stop names
#   - number of unique 'Linie' values
#   - list of the departure times of the rows flagged as delayed
grouped = df.groupby('Line Text').agg({
    'Line Text': 'count',
    'Departure delay': 'sum',
    'delay': ['sum'],
    'Stop name': lambda x: list(x),
    'Linie': 'nunique',
    # The full-length mask is aligned by pandas to the group's own index
    'Departure time': lambda x: list(x[is_delayed]),
})

# Show the statistics of the line whose 'Line Text' is IR90
print(grouped.loc['IR90'])


# Keep the index when saving: it holds the 'Line Text' group key, and without
# it the rows of the CSV could not be attributed to a line.
grouped.to_csv('../data/sbb-ligne-transformed.csv')

Line Text        count                                                     853
Departure delay  sum                                                        47
delay            sum                                           0 days 12:09:45
Stop name        <lambda>    [Sierre/Siders, Montreux, Genève, Brig, Martig...
Linie            nunique                                                    66
Departure time   <lambda>    [2024-10-13 20:23:00, 2024-10-13 22:58:00, 202...
Name: IR90, dtype: object
