# Load traffic data from Open Data Paris

In [4]:
# Step up one directory so that the relative paths used below (data/, raw_data/)
# are resolved from the parent folder.
%cd ..

/mnt/d/Google Drive/projects/paris-traffic-forecast


In [202]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob
import pandas as pd
import numpy as np

# Default figure size for every plot in this notebook (width 25, height 5).
plt.rcParams.update({'figure.figsize': [25, 5]})

# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides pandas deprecation and SettingWithCopy warnings.
import warnings
warnings.simplefilter('ignore')

#### Load data from data/voi

In [26]:
# Column constants shared by the loading cells.
# Arc pairs of interest, kept for reference only (not used by the code):
#   ['Lecourbe-Convention', 'Convention-Blomet']
#   ['Av_Champs_Elysees-Washington', 'Av_Champs_Elysees-Berri']
#   ['Sts_Peres-Voltaire', 'Sts_Peres-Universite']

# A row is discarded when any of these columns is missing.
useful_columns = ["libelle", "libelle_nd_amont", "libelle_nd_aval", "t_1h", "q", "k"]
# Columns carried into the merged table: the ones above plus the arc state.
keep_columns = [*useful_columns, "etat_barre"]

In [125]:
# Load every CSV export found in data/voi and merge them into one long table.
# The list is sorted so that the "keep first" de-duplication below does not
# depend on the order in which the operating system lists the files.
file_list = sorted(glob.glob("data/voi/*.csv"))

# Collect the per-file frames and concatenate once: DataFrame.append copied the
# whole accumulator on every iteration and no longer exists in pandas 2.x.
frames = []
for file_path in tqdm(file_list):
    df = pd.read_csv(file_path, delimiter=";")
    # Encode the textual arc state as an integer; labels missing from the map become NaN.
    df['etat_barre'] = df['etat_barre'].map({'Invalide': 3, 'Barré': 2, 'Ouvert': 1, 'Inconnu': 0})
    # Keep the columns of interest and drop rows missing any of `useful_columns`
    # (a missing etat_barre alone does not discard a row).
    frames.append(df[keep_columns].dropna(subset=useful_columns))

# With no input files, fall back to an empty table that still has the expected columns.
all_df = pd.concat(frames) if frames else pd.DataFrame(columns=keep_columns)

# Rename to the notebook's vocabulary, keep one row per (arc, nodes, time) and order by time.
all_df = (
    all_df
    .rename(columns={"libelle": "arc", "libelle_nd_amont": "noeud_amont", "libelle_nd_aval": "noeud_aval", "q": "debit", "k": "occupation", "t_1h": "time"})
    .drop_duplicates(subset=["arc", "noeud_amont", "noeud_aval", "time"], keep="first")
    .sort_values(by=['time'])
)
all_df

100%|██████████| 48/48 [00:13<00:00,  3.52it/s]


Unnamed: 0,arc,noeud_amont,noeud_aval,time,debit,occupation,etat_barre
7866,Av_Champs_Elysees,Av_Champs_Elysees-La_Boetie,Av_Champs_Elysees-Berri,2021-01-01T00:00:00+00:00,0.0,0.33500,3
7863,Sts_Peres,Bd_St_Germain-Sts_Peres,Sts_Peres-Grenelle,2021-01-01T00:00:00+00:00,26.0,0.36556,3
7786,Av_Champs_Elysees,Av_Champs_Elysees-Berri,Av_Champs_Elysees-La_Boetie,2021-01-01T00:00:00+00:00,151.0,1.87111,3
7811,Convention,Lecourbe-Convention,Convention-Blomet,2021-01-01T00:00:00+00:00,91.0,0.63111,3
7810,Av_Champs_Elysees,Av_Champs_Elysees-Berri,Av_Champs_Elysees-Washington,2021-01-01T00:00:00+00:00,0.0,0.00000,3
...,...,...,...,...,...,...,...
2968,Bd_St_Germain,Bd_St_Germain-St_Guillaume,Bd_St_Germain-Sts_Peres,2021-12-10T23:00:00+00:00,777.0,4.34111,3
2663,Av_Champs_Elysees,Av_Champs_Elysees-Colisee,Rond_Point_Champs_Elysees,2021-12-10T23:00:00+00:00,1393.0,17.62278,3
2708,Av_Champs_Elysees,Av_Champs_Elysees-Berri,Av_Champs_Elysees-Washington,2021-12-10T23:00:00+00:00,95.0,4.61778,3
2844,Convention,Convention-St_Charles,Convention-Lourmel,2021-12-10T23:00:00+00:00,151.0,2.67222,3


In [151]:
import joblib
from darts import TimeSeries  # needed below; the only other import sits in a later cell

arc_neighborhoods = "voi_champs_elysees"
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
voi_champs_elysees = joblib.load(f"raw_data/{arc_neighborhoods}.pkl")

# Extend the stored series of every arc with the rows of all_df that are newer.
time_col = "time"
value_cols = ['debit', 'occupation', 'etat_barre']

for arc in voi_champs_elysees:
    amont, aval, ts = arc['noeud_amont'], arc['noeud_aval'], arc['ts']
    # .copy() so the column assignments below act on an independent frame, not a view of all_df
    df = all_df[(all_df["noeud_amont"] == amont) & (all_df["noeud_aval"] == aval)].copy()

    if len(df) == 0:
        print(f"No data for arc {amont} - {aval}")
        continue

    # Parse the timestamps and drop the timezone information before indexing.
    df[time_col] = pd.to_datetime(df[time_col])
    df[time_col] = pd.DatetimeIndex(df[time_col]).tz_localize(None)
    df = df.set_index(time_col)

    # Hourly series; fill_missing_dates inserts the hours absent from the index.
    ts_new = TimeSeries.from_dataframe(df,
                    value_cols=value_cols,
                    fill_missing_dates=True,
                    freq='H'
                    )

    last_known = ts.time_index[-1]
    if ts_new.time_index[-1] <= last_known:
        print(f"No new data for arc {amont} - {aval}")
        continue

    first_new = ts_new.time_index[0]
    if first_new <= last_known:
        # Overlap: keep only timestamps strictly after the stored series
        # (drop_before also removes the split point itself).
        ts_new = ts_new.drop_before(last_known)
    else:
        # Gap: pad the stored series with NaN rows for the hours strictly between its
        # last timestamp and the first new one, so that append() sees contiguous indexes.
        missing_hours = int((first_new - last_known).total_seconds() / 3600) - 1
        if missing_hours > 0:
            ts = ts.append_values(np.full((missing_hours, len(value_cols)), np.nan))

    arc['ts'] = ts.append(ts_new)

# Written next to the input under a "1" suffix, leaving the original pickle untouched.
joblib.dump(voi_champs_elysees, f"raw_data/{arc_neighborhoods}1.pkl")

No data for arc Av_Champs_Elysees-Balzac - Av_Champs_Elysees-Washington
No data for arc Av_Champs_Elysees-Face_Air_Franc - Av_Champs_Elysees-Balzac
No data for arc Av_Champs_Elysees-Face_Air_Franc - Grande_Armee-Forge
No data for arc Av_Champs_Elysees-Washington - Av_Georges_V-Place_Dunant
No data for arc Av_Georges_V-Place_Dunant - Av_Champs_Elysees-Washington
No data for arc Champs-Tilsitt - Av_Champs_Elysees-Face_Air_Franc
No data for arc Concorde_Ouest - Pl_Concorde-Av_Champs_Elysees
No data for arc Pl_Concorde-Av_Champs_Elysees - Cours_la_Reine-Concorde
No data for arc Sortie_Souterrain_Champs_Elysees - Tuileries-Sedar_Senghor


['raw_data/voi_champs_elysees1.pkl']

In [152]:
import joblib
from darts import TimeSeries  # needed below; the only other import sits in a later cell

arc_neighborhoods = "voi-convention"
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
neighborhood = joblib.load(f"raw_data/{arc_neighborhoods}.pkl")

# Extend the stored series of every arc with the rows of all_df that are newer.
time_col = "time"
value_cols = ['debit', 'occupation', 'etat_barre']

for arc in neighborhood:
    amont, aval, ts = arc['noeud_amont'], arc['noeud_aval'], arc['ts']
    # .copy() so the column assignments below act on an independent frame, not a view of all_df
    df = all_df[(all_df["noeud_amont"] == amont) & (all_df["noeud_aval"] == aval)].copy()

    if len(df) == 0:
        print(f"No data for arc {amont} - {aval}")
        continue

    # Parse the timestamps and drop the timezone information before indexing.
    df[time_col] = pd.to_datetime(df[time_col])
    df[time_col] = pd.DatetimeIndex(df[time_col]).tz_localize(None)
    df = df.set_index(time_col)

    # Hourly series; fill_missing_dates inserts the hours absent from the index.
    ts_new = TimeSeries.from_dataframe(df,
                    value_cols=value_cols,
                    fill_missing_dates=True,
                    freq='H'
                    )

    last_known = ts.time_index[-1]
    if ts_new.time_index[-1] <= last_known:
        print(f"No new data for arc {amont} - {aval}")
        continue

    first_new = ts_new.time_index[0]
    if first_new <= last_known:
        # Overlap: keep only timestamps strictly after the stored series
        # (drop_before also removes the split point itself).
        ts_new = ts_new.drop_before(last_known)
    else:
        # Gap: pad the stored series with NaN rows for the hours strictly between its
        # last timestamp and the first new one, so that append() sees contiguous indexes.
        missing_hours = int((first_new - last_known).total_seconds() / 3600) - 1
        if missing_hours > 0:
            ts = ts.append_values(np.full((missing_hours, len(value_cols)), np.nan))

    arc['ts'] = ts.append(ts_new)

# Written next to the input under a "1" suffix, leaving the original pickle untouched.
joblib.dump(neighborhood, f"raw_data/{arc_neighborhoods}1.pkl")

No data for arc Convention-Felix_Faure - Convention-Lourmel
No data for arc Convention-Felix_Faure - Convention-Nivert
No data for arc Convention-Felix_Faure - Faure-Javel
No data for arc Convention-Gutemberg - Convention-St_Charles
No data for arc Convention-Gutemberg - Rond_Point_Mirabeau
No data for arc Convention-Lourmel - Convention-St_Charles
No data for arc Convention-Nivert - Convention-Felix_Faure
No data for arc Convention-Nivert - Lecourbe-Convention
No data for arc Convention-St_Charles - Convention-Gutemberg
No data for arc Faure-Javel - Convention-Felix_Faure
No data for arc Lecourbe-Convention - Convention-Nivert
No data for arc Lecourbe-Convention - Lecourbe-Croix-Nivert


['raw_data/voi-convention1.pkl']

In [201]:
import joblib
from darts import TimeSeries  # needed below; the only other import sits in a later cell

arc_neighborhoods = "voi-sts_peres"
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
neighborhood = joblib.load(f"raw_data/{arc_neighborhoods}.pkl")

# Extend the stored series of every arc with the rows of all_df that are newer.
time_col = "time"
value_cols = ['debit', 'occupation', 'etat_barre']

for arc in neighborhood:
    amont, aval, ts = arc['noeud_amont'], arc['noeud_aval'], arc['ts']
    # .copy() so the column assignments below act on an independent frame, not a view of all_df
    df = all_df[(all_df["noeud_amont"] == amont) & (all_df["noeud_aval"] == aval)].copy()

    if len(df) == 0:
        print(f"No data for arc {amont} - {aval}")
        continue

    # Parse the timestamps and drop the timezone information before indexing.
    df[time_col] = pd.to_datetime(df[time_col])
    df[time_col] = pd.DatetimeIndex(df[time_col]).tz_localize(None)
    df = df.set_index(time_col)

    # Hourly series; fill_missing_dates inserts the hours absent from the index.
    ts_new = TimeSeries.from_dataframe(df,
                    value_cols=value_cols,
                    fill_missing_dates=True,
                    freq='H'
                    )

    last_known = ts.time_index[-1]
    if ts_new.time_index[-1] <= last_known:
        print(f"No new data for arc {amont} - {aval}")
        continue

    first_new = ts_new.time_index[0]
    if first_new <= last_known:
        # Overlap: keep only timestamps strictly after the stored series
        # (drop_before also removes the split point itself).
        ts_new = ts_new.drop_before(last_known)
    else:
        # Gap: pad only the hours strictly between the stored end and the first new
        # timestamp. Padding one row more (as before) made the stored series end on the
        # first new timestamp, whose real observation was then dropped and left as NaN.
        missing_hours = int((first_new - last_known).total_seconds() / 3600) - 1
        if missing_hours > 0:
            ts = ts.append_values(np.full((missing_hours, len(value_cols)), np.nan))

    arc['ts'] = ts.append(ts_new)

# Written next to the input under a "1" suffix, leaving the original pickle untouched.
joblib.dump(neighborhood, f"raw_data/{arc_neighborhoods}1.pkl")

No data for arc Sevres-Babylone - Sevres-Sts_Peres


['raw_data/voi-sts_peres1.pkl']

#### Create an arc neighborhood for the first time

In [None]:
# Select the arcs to work with: upstream node i is paired with downstream node i.
amont = ["Sts_Peres-Voltaire"]
aval = ["Sts_Peres-Universite"]

# Pair the two lists defined above instead of slicing `amont_aval`, which is only
# defined in later cells and made this cell fail on a fresh kernel.
# One concat replaces DataFrame.append, which no longer exists in pandas 2.x.
df = pd.concat(
    [
        all_df[(all_df["noeud_amont"] == amont_arc) & (all_df["noeud_aval"] == aval_arc)]
        for amont_arc, aval_arc in zip(amont, aval)
    ]
    or [all_df.iloc[0:0]]  # nothing selected -> empty frame with the same columns
)
print(df['arc'].unique())
df = df.sort_values(by=["time"])
df

['Sts_Peres']


Unnamed: 0,arc,noeud_amont,noeud_aval,time,debit,occupation,etat_barre
7751,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T00:00:00+01:00,52.0,0.96056,3
7793,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T01:00:00+01:00,56.0,1.28056,3
7795,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T02:00:00+01:00,71.0,1.24667,3
7794,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T03:00:00+01:00,48.0,0.76333,3
242,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T04:00:00+01:00,69.0,1.23222,3
...,...,...,...,...,...,...,...
2585,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T20:00:00+01:00,759.0,11.37222,3
2584,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T21:00:00+01:00,720.0,10.49445,3
2583,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T22:00:00+01:00,571.0,8.91667,3
2587,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T23:00:00+01:00,547.0,6.87500,3


In [None]:
# Replace the last six characters of every timestamp string (its UTC offset) with
# "+00:00". The clock time itself is left untouched.
# NOTE(review): presumably done so that all rows share a single offset - confirm.
df['time'] = df['time'].map(lambda stamp: stamp[:-6] + "+00:00")

In [None]:
# Historical series: read with `time` as the index, parse it to datetimes and
# order the rows chronologically.
old_df = pd.read_csv('raw_data/sts_peres-2014-2020.csv', index_col='time')
old_df.index = pd.to_datetime(old_df.index, utc=False)
old_df = old_df.sort_index()
old_df

Unnamed: 0_level_0,arc,noeud_amont,noeud_aval,debit,occupation,etat_barre
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01 01:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,486.0,6.17889,3
2014-01-01 02:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,571.0,6.66222,3
2014-01-01 03:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,532.0,5.56611,3
2014-01-01 04:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,458.0,4.63444,3
2014-01-01 05:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,386.0,3.94500,3
...,...,...,...,...,...,...
2020-12-31 20:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,338.0,4.53334,3
2020-12-31 21:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,195.0,2.34556,3
2020-12-31 22:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,129.0,1.80222,3
2020-12-31 23:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,77.0,1.18389,3


In [None]:
import pandas as pd

time_col = "time"
value_cols = ["debit", "occupation", "etat_barre"]

# Parse the timestamps, drop the timezone information and use the result as the index.
df[time_col] = pd.DatetimeIndex(pd.to_datetime(df[time_col])).tz_localize(None)
df = df.set_index(time_col)
df

Unnamed: 0_level_0,arc,noeud_amont,noeud_aval,debit,occupation,etat_barre
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01 00:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,52.0,0.96056,3
2021-01-01 01:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,56.0,1.28056,3
2021-01-01 02:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,71.0,1.24667,3
2021-01-01 03:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,48.0,0.76333,3
2021-01-01 04:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,69.0,1.23222,3
...,...,...,...,...,...,...
2021-12-09 20:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,759.0,11.37222,3
2021-12-09 21:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,720.0,10.49445,3
2021-12-09 22:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,571.0,8.91667,3
2021-12-09 23:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,547.0,6.87500,3


In [None]:
# Stack the new rows (minus the first one) under the historical frame.
# NOTE(review): the reason for skipping the first new row is not visible here - confirm.
# verify_integrity=True makes the concat raise if any timestamp appears in both frames,
# so re-running this cell on the already merged `df` raises instead of duplicating rows.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
df = pd.concat([old_df, df.iloc[1:]], verify_integrity=True)

# Display (not drop) any rows whose timestamp is repeated; an empty table is expected.
df[df.index.duplicated()]



Unnamed: 0_level_0,arc,noeud_amont,noeud_aval,debit,occupation,etat_barre
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [None]:
# Imported here because this cell runs before the only other darts import in the notebook.
from darts import TimeSeries

# Hourly series built from the merged frame; fill_missing_dates inserts the hours
# that are absent from the index.
ts = TimeSeries.from_dataframe(df,
                    value_cols=value_cols,
                    fill_missing_dates=True,
                    freq='H'
                    )

In [None]:
# Summary statistics of the numeric columns of the merged frame.
df.describe()

Unnamed: 0,debit,occupation,etat_barre
count,67702.0,67702.0,67702.0
mean,470.721515,7.149994,2.97328
std,272.289232,6.283867,0.161265
min,0.0,0.0,2.0
25%,223.0,2.59834,3.0
50%,492.0,5.875,3.0
75%,687.0,9.46111,3.0
max,1393.0,57.6411,3.0


In [None]:
# Keep only the rows whose debit is strictly below 20000.
# Rows with a missing debit fail the comparison and are removed as well.
df = df.loc[df["debit"].lt(20000)]

In [None]:
# One dictionary per (upstream node, downstream node) pair, holding that pair's rows.
voisinages = [
    {'noeud_amont': pair[0], 'noeud_aval': pair[1], 'df': rows}
    for pair, rows in df.groupby(["noeud_amont", "noeud_aval"])
]


In [None]:
from darts import TimeSeries

# Attach an hourly TimeSeries to every arc dictionary, built from its frame.
# `ts` keeps pointing at the last series built so that it is displayed below.
for voi in voisinages:
    voi["ts"] = ts = TimeSeries.from_dataframe(
        voi["df"], value_cols=value_cols, fill_missing_dates=True, freq='H'
    )
ts

In [None]:
# Remove the "df" entry from every arc dictionary (raises KeyError if it is already gone).
for voi in voisinages:
    voi.pop("df")

In [None]:
# Persist the list of arc dictionaries to raw_data/voi-sts_peres.pkl with joblib.
import joblib
joblib.dump(voisinages, "raw_data/voi-sts_peres.pkl")

['raw_data/voi-sts_peres.pkl']

In [None]:
# Read the semicolon-separated Champs-Elysees export.
df = pd.read_csv('data/rech_champs_elysees.csv', sep=';')

NameError: name 'pd' is not defined

In [None]:
# List the column labels of the frame that was just read.
df.columns

Index(['Identifiant arc', 'Libelle', 'Date et heure de comptage',
       'Débit horaire', 'Taux d'occupation', 'Etat trafic',
       'Identifiant noeud amont', 'Libelle noeud amont',
       'Identifiant noeud aval', 'Libelle noeud aval', 'Etat arc',
       'Date debut dispo data', 'Date fin dispo data', 'geo_point_2d',
       'geo_shape'],
      dtype='object')

In [None]:
# Peek at the first row to see what each column contains.
df.head(1)

Unnamed: 0,Identifiant arc,Libelle,Date et heure de comptage,Débit horaire,Taux d'occupation,Etat trafic,Identifiant noeud amont,Libelle noeud amont,Identifiant noeud aval,Libelle noeud aval,Etat arc,Date debut dispo data,Date fin dispo data,geo_point_2d,geo_shape
0,4272,Av_Champs_Elysees,2021-10-14T00:00:00+02:00,8.0,5.72944,Fluide,2300,Av_Champs_Elysees-La_Boetie,2293,Av_Champs_Elysees-Berri,Invalide,2005-01-01,2019-06-01,"48.8710753632,2.3039212731","{""type"": ""LineString"", ""coordinates"": [[2.3053..."


In [None]:
# Keep seven columns, addressed by position, and give them the short names
# used by the rest of the notebook.
# NOTE(review): positional selection silently picks the wrong columns if the export layout changes.
df = df.iloc[:, [1, 2, 3, 4, 10, 7, 9]].set_axis(
    ["libelle", "t_1h", "q", "k", "etat_barre", "libelle_nd_amont", "libelle_nd_aval"],
    axis=1,
)
df

Unnamed: 0,libelle,t_1h,q,k,etat_barre,libelle_nd_amont,libelle_nd_aval
0,Av_Champs_Elysees,2021-10-14T00:00:00+02:00,8.0,5.72944,Invalide,Av_Champs_Elysees-La_Boetie,Av_Champs_Elysees-Berri
1,Av_Champs_Elysees,2021-10-13T23:00:00+02:00,27.0,6.24000,Invalide,Av_Champs_Elysees-La_Boetie,Av_Champs_Elysees-Berri
2,Av_Champs_Elysees,2021-10-13T22:00:00+02:00,42.0,5.00722,Invalide,Av_Champs_Elysees-La_Boetie,Av_Champs_Elysees-Berri
3,Av_Champs_Elysees,2021-10-13T19:00:00+02:00,241.0,1.83222,Invalide,Av_Champs_Elysees-La_Boetie,Av_Champs_Elysees-Berri
4,Av_Champs_Elysees,2021-10-13T18:00:00+02:00,172.0,2.15444,Invalide,Av_Champs_Elysees-La_Boetie,Av_Champs_Elysees-Berri
...,...,...,...,...,...,...,...
216797,Av_Champs_Elysees,2021-09-01T01:00:00+02:00,,,Invalide,Av_Champs_Elysees-Balzac,Av_Champs_Elysees-Washington
216798,Av_Champs_Elysees,2021-09-01T01:00:00+02:00,,,Invalide,Av_Champs_Elysees-Face_Air_Franc,Av_Champs_Elysees-Balzac
216799,Av_Champs_Elysees,2021-09-01T01:00:00+02:00,506.0,1.72333,Invalide,Rond_Point_Champs_Elysees,Av_Champs_Elysees-Colisee
216800,Av_Champs_Elysees,2021-09-01T01:00:00+02:00,,,Invalide,Rond_Point_Champs_Elysees,Av_Champs_Elysees-Clemenceau


In [None]:
# Encode the textual arc state with the same integer codes as the data/voi loader.
# The previous `startswith("I")` test coded both 'Invalide' and 'Inconnu' as 3 and
# every other label as 2. Labels missing from the map become NaN.
# NOTE(review): confirm that this export uses exactly these four labels.
df['etat_barre'] = df['etat_barre'].map({'Invalide': 3, 'Barré': 2, 'Ouvert': 1, 'Inconnu': 0})

In [None]:
# Show the paths currently held in file_list.
file_list

['data/comptages-routiers-permanents (1).csv',
 'data/comptages-routiers-permanents (2).csv',
 'data/rech_champs_elysees.csv',
 'data/rech_souterain_champs_elysees.csv']

In [None]:
# [upstream node, downstream node] pairs identifying the arcs of interest.
# First group: node labels around the Champs-Elysees.
amont_aval = [['Av_Champs_Elysees-Washington', 'Av_Georges_V-Place_Dunant'],
 ['Pl_Concorde-Av_Champs_Elysees', 'Av_Champs_Elysees-Dutuit'],
 ['Av_Champs_Elysees-Colisee', 'Rond_Point_Champs_Elysees'],
 ['Av_Champs_Elysees-La_Boetie', 'Av_Champs_Elysees-Berri'],
 ['Concorde_Ouest', 'Pl_Concorde-Av_Champs_Elysees'],
 ['Av_Champs_Elysees-Berri', 'Av_Champs_Elysees-Washington'],
 ['Av_Champs_Elysees-Face_Air_Franc', 'Av_Champs_Elysees-Balzac'],
 ['Pl_Concorde-Av_Champs_Elysees', 'Cours_la_Reine-Concorde'],
 ['Champs-Tilsitt', 'Av_Champs_Elysees-Face_Air_Franc'],
 ['Cours_la_Reine-Concorde', 'Sortie_Souterrain_Champs_Elysees'],
 ['Sortie_Souterrain_Champs_Elysees', 'Tuileries-Sedar_Senghor'],
 ['Av_Champs_Elysees-Face_Air_Franc', 'Grande_Armee-Forge'],
 ['Av_Champs_Elysees-Clemenceau', 'Rond_Point_Champs_Elysees'],
 ['Rond_Point_Champs_Elysees', 'Av_Champs_Elysees-Clemenceau'],
 ['Av_Champs_Elysees-Washington', 'Av_Champs_Elysees-Berri'],
 ['Av_Champs_Elysees-Colisee', 'Av_Champs_Elysees-La_Boetie'],
 ['Av_Champs_Elysees-Dutuit', 'Av_Champs_Elysees-Clemenceau'],
 ['Rond_Point_Champs_Elysees', 'Av_Champs_Elysees-Colisee'],
 ['Av_Champs_Elysees-Balzac', 'Av_Champs_Elysees-Washington'],
 ['Av_Champs_Elysees-Berri', 'Av_Champs_Elysees-La_Boetie'],
 ['Av_Georges_V-Place_Dunant', 'Av_Champs_Elysees-Washington']]

# Second group: node labels around Convention.
amont_aval += [['Convention-St_Charles', 'Convention-Lourmel'],
 ['Convention-Felix_Faure', 'Convention-Nivert'],
 ['Convention-Lourmel', 'Convention-St_Charles'],
 ['Convention-Blomet', 'Convention-Vaugirard'],
 ['Convention-St_Charles', 'Convention-Gutemberg'],
 ['Convention-Felix_Faure', 'Convention-Lourmel'],
 ['Convention-Lourmel', 'Convention-Felix_Faure'],
 ['Convention-Nivert', 'Convention-Felix_Faure'],
 ['Faure-Javel', 'Convention-Felix_Faure'],
 ['Convention-Gutemberg', 'Convention-St_Charles'],
 ['Convention-Nivert', 'Lecourbe-Convention'],
 ['Convention-Gutemberg', 'Rond_Point_Mirabeau'],
 ['Convention-Vaugirard', 'Convention-Olivier_de_Serres'],
 ['Convention-Felix_Faure', 'Faure-Javel'],
 ['Lecourbe-Convention', 'Convention-Nivert'],
 ['Rond_Point_Mirabeau', 'Convention-Gutemberg'],
 ['Lecourbe-Convention', 'Convention-Blomet'],
 ['Convention-Olivier_de_Serres', 'Place_Charles_Valin'],
 ['Lecourbe-Convention', 'Lecourbe-Croix-Nivert']]

# Third group: node labels around Sts_Peres.
amont_aval += [['Sevres-Babylone', 'Sevres-Sts_Peres'],
 ['Sts_Peres-Voltaire', 'Sts_Peres-Universite'],
 ['Bd_St_Germain-St_Guillaume', 'Bd_St_Germain-Sts_Peres'],
 ['Sts_Peres-Grenelle', 'Sevres-Sts_Peres'],
 ['Sts_Peres-Universite', 'Bd_St_Germain-Sts_Peres'],
 ['Malaquais-Bonaparte', 'Sts_Peres-Voltaire'],
 ['Bd_St_Germain-Sts_Peres', 'Sts_Peres-Grenelle'],
 ['Bd_St_Germain-Sts_Peres', 'Bd_St_Germain-Dragon']]

# Print two shell lines (an echo and a wget command) for every pair.
# The loop used to start with a stray indent and the second print was never closed,
# so the cell could not be parsed; the printed text itself is unchanged.
# NOTE(review): the wget URL and its output file do not depend on amont/aval, so every
# iteration prints the same download command writing to the same file - confirm the intent.
for amont, aval in amont_aval:
    print(f"echo collecting data for {amont}- {aval}")
    print('wget "https://opendata.paris.fr/explore/dataset/comptages-routiers-permanents/download/?format=csv&disjunctive.libelle=true&disjunctive.etat_trafic=true&disjunctive.libelle_nd_amont=true&disjunctive.libelle_nd_aval=true&q=av_champs&refine.libelle=AV_Champs_Elysees&timezone=Europe/Paris&lang=fr&use_labels_for_header=true&csv_separator=%3B"  -o /dev/null -O "../data/AV_Champs_Elysees.csv"')

In [None]:
# read txt files from folder
from tqdm import tqdm
import glob
import pandas as pd
# Every CSV file directly under data/.
file_list = list(glob.iglob("data/*.csv"))

# A row is discarded when any of these columns is missing.
useful_columns = ["libelle", "libelle_nd_amont", "libelle_nd_aval", "t_1h", "q", "k"]
# Columns carried into the merged table: the ones above plus the arc state.
keep_columns = [*useful_columns, "etat_barre"]

# Arcs to keep, given as [upstream node label, downstream node label] pairs.
# The loader below keeps a row only when both of its node labels match one pair.
# Earlier, shorter selection kept for reference:
# amont = ["Lecourbe-Convention", "Sts_Peres-Voltaire"]
# aval = ["Convention-Blomet", "Sts_Peres-Universite"]
# First group: node labels around the Champs-Elysees.
amont_aval = [['Av_Champs_Elysees-Washington', 'Av_Georges_V-Place_Dunant'],
 ['Pl_Concorde-Av_Champs_Elysees', 'Av_Champs_Elysees-Dutuit'],
 ['Av_Champs_Elysees-Colisee', 'Rond_Point_Champs_Elysees'],
 ['Av_Champs_Elysees-La_Boetie', 'Av_Champs_Elysees-Berri'],
 ['Concorde_Ouest', 'Pl_Concorde-Av_Champs_Elysees'],
 ['Av_Champs_Elysees-Berri', 'Av_Champs_Elysees-Washington'],
 ['Av_Champs_Elysees-Face_Air_Franc', 'Av_Champs_Elysees-Balzac'],
 ['Pl_Concorde-Av_Champs_Elysees', 'Cours_la_Reine-Concorde'],
 ['Champs-Tilsitt', 'Av_Champs_Elysees-Face_Air_Franc'],
 ['Cours_la_Reine-Concorde', 'Sortie_Souterrain_Champs_Elysees'],
 ['Sortie_Souterrain_Champs_Elysees', 'Tuileries-Sedar_Senghor'],
 ['Av_Champs_Elysees-Face_Air_Franc', 'Grande_Armee-Forge'],
 ['Av_Champs_Elysees-Clemenceau', 'Rond_Point_Champs_Elysees'],
 ['Rond_Point_Champs_Elysees', 'Av_Champs_Elysees-Clemenceau'],
 ['Av_Champs_Elysees-Washington', 'Av_Champs_Elysees-Berri'],
 ['Av_Champs_Elysees-Colisee', 'Av_Champs_Elysees-La_Boetie'],
 ['Av_Champs_Elysees-Dutuit', 'Av_Champs_Elysees-Clemenceau'],
 ['Rond_Point_Champs_Elysees', 'Av_Champs_Elysees-Colisee'],
 ['Av_Champs_Elysees-Balzac', 'Av_Champs_Elysees-Washington'],
 ['Av_Champs_Elysees-Berri', 'Av_Champs_Elysees-La_Boetie'],
 ['Av_Georges_V-Place_Dunant', 'Av_Champs_Elysees-Washington']]

# Second group: node labels around Convention.
amont_aval += [['Convention-St_Charles', 'Convention-Lourmel'],
 ['Convention-Felix_Faure', 'Convention-Nivert'],
 ['Convention-Lourmel', 'Convention-St_Charles'],
 ['Convention-Blomet', 'Convention-Vaugirard'],
 ['Convention-St_Charles', 'Convention-Gutemberg'],
 ['Convention-Felix_Faure', 'Convention-Lourmel'],
 ['Convention-Lourmel', 'Convention-Felix_Faure'],
 ['Convention-Nivert', 'Convention-Felix_Faure'],
 ['Faure-Javel', 'Convention-Felix_Faure'],
 ['Convention-Gutemberg', 'Convention-St_Charles'],
 ['Convention-Nivert', 'Lecourbe-Convention'],
 ['Convention-Gutemberg', 'Rond_Point_Mirabeau'],
 ['Convention-Vaugirard', 'Convention-Olivier_de_Serres'],
 ['Convention-Felix_Faure', 'Faure-Javel'],
 ['Lecourbe-Convention', 'Convention-Nivert'],
 ['Rond_Point_Mirabeau', 'Convention-Gutemberg'],
 ['Lecourbe-Convention', 'Convention-Blomet'],
 ['Convention-Olivier_de_Serres', 'Place_Charles_Valin'],
 ['Lecourbe-Convention', 'Lecourbe-Croix-Nivert']]

# Third group: node labels around Sts_Peres.
amont_aval += [['Sevres-Babylone', 'Sevres-Sts_Peres'],
 ['Sts_Peres-Voltaire', 'Sts_Peres-Universite'],
 ['Bd_St_Germain-St_Guillaume', 'Bd_St_Germain-Sts_Peres'],
 ['Sts_Peres-Grenelle', 'Sevres-Sts_Peres'],
 ['Sts_Peres-Universite', 'Bd_St_Germain-Sts_Peres'],
 ['Malaquais-Bonaparte', 'Sts_Peres-Voltaire'],
 ['Bd_St_Germain-Sts_Peres', 'Sts_Peres-Grenelle'],
 ['Bd_St_Germain-Sts_Peres', 'Bd_St_Germain-Dragon']]

# Read every file of file_list, keep the rows of the arcs listed in amont_aval and
# merge everything into one table.
# The per-arc pieces are collected in a list and concatenated once: DataFrame.append
# copied the accumulator on every call and no longer exists in pandas 2.x.
frames = []
for file_path in tqdm(file_list):
    df = pd.read_csv(file_path, delimiter=";")
    # Keep seven columns by position and give them the notebook's short names.
    df = df.iloc[:, [1, 2, 3, 4, 10, 7, 9]]
    df.columns = ["libelle", "t_1h", "q", "k", "etat_barre", "libelle_nd_amont", "libelle_nd_aval"]
    # Same integer codes as the data/voi loader. The previous `startswith("I")` test coded
    # both 'Invalide' and 'Inconnu' as 3 and every other label as 2. Unknown labels become NaN.
    # NOTE(review): confirm that these exports use exactly these four labels.
    df['etat_barre'] = df['etat_barre'].map({'Invalide': 3, 'Barré': 2, 'Ouvert': 1, 'Inconnu': 0})

    # Drop rows missing any of `useful_columns` (a missing etat_barre alone is tolerated).
    df = df[keep_columns].dropna(subset=useful_columns)
    for amont_arc, aval_arc in amont_aval:
        # rows of this file that belong to the current arc
        frames.append(df[(df["libelle_nd_amont"] == amont_arc) & (df["libelle_nd_aval"] == aval_arc)])

# With nothing to merge, fall back to an empty table that still has the expected columns.
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=keep_columns)

# Rename to the notebook's vocabulary and keep one row per (arc, nodes, time).
all_df = (
    all_df
    .rename(columns={"libelle": "arc", "libelle_nd_amont": "noeud_amont", "libelle_nd_aval": "noeud_aval", "q": "debit", "k": "occupation", "t_1h": "time"})
    .drop_duplicates(subset=["arc", "noeud_amont", "noeud_aval", "time"], keep="first")
)
all_df

100%|██████████| 4/4 [00:06<00:00,  1.74s/it]


Unnamed: 0,arc,noeud_amont,noeud_aval,time,debit,occupation,etat_barre
0,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-04-02T12:00:00+02:00,651.0,8.44667,3
1,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-07-08T00:00:00+02:00,742.0,8.46056,3
2,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-07-07T20:00:00+02:00,899.0,9.77389,3
3,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-07-07T18:00:00+02:00,852.0,11.03556,3
4,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-04-02T18:00:00+02:00,634.0,7.23611,3
...,...,...,...,...,...,...,...
189192,VGP_Souterrain_Concorde,Cours_la_Reine-Concorde,Sortie_Souterrain_Champs_Elysees,2021-08-01T03:00:00+02:00,638.0,0.89222,3
189193,VGP_Souterrain_Concorde,Cours_la_Reine-Concorde,Sortie_Souterrain_Champs_Elysees,2021-08-01T04:00:00+02:00,370.0,0.41389,3
189194,VGP_Souterrain_Concorde,Cours_la_Reine-Concorde,Sortie_Souterrain_Champs_Elysees,2021-11-01T02:00:00+01:00,677.0,1.27055,3
189195,VGP_Souterrain_Concorde,Cours_la_Reine-Concorde,Sortie_Souterrain_Champs_Elysees,2021-11-01T01:00:00+01:00,758.0,1.65445,3


In [None]:
 # NOTE(review): scratch fragment - these lines are not valid Python on their own
 # (unbalanced brackets, dangling commas). They list the same three arc pairs that
 # a later cell assigns to `amont_aval`.
 ['Lecourbe-Convention', 'Convention-Blomet'],
 ['Av_Champs_Elysees-Washington', 'Av_Champs_Elysees-Berri'],
[['Sts_Peres-Voltaire', 'Sts_Peres-Universite'],


In [None]:
# List every distinct (noeud_amont, noeud_aval) pair present in all_df, in order of
# first appearance, as a list of two-element lists.
# Optional pre-filter on node names, currently disabled:
# df = all_df[all_df["noeud_amont"].str.lower().str.contains("sts_peres") | all_df["noeud_aval"].str.lower().str.contains("sts_peres")]
df_unique = all_df.drop_duplicates(subset=["noeud_amont", "noeud_aval"])
zipset = df_unique.loc[:, ["noeud_amont", "noeud_aval"]].to_numpy().tolist()
zipset

[['Sts_Peres-Voltaire', 'Sts_Peres-Universite'],
 ['Bd_St_Germain-St_Guillaume', 'Bd_St_Germain-Sts_Peres'],
 ['Sts_Peres-Grenelle', 'Sevres-Sts_Peres'],
 ['Sts_Peres-Universite', 'Bd_St_Germain-Sts_Peres'],
 ['Malaquais-Bonaparte', 'Sts_Peres-Voltaire'],
 ['Bd_St_Germain-Sts_Peres', 'Sts_Peres-Grenelle'],
 ['Bd_St_Germain-Sts_Peres', 'Bd_St_Germain-Dragon'],
 ['Convention-St_Charles', 'Convention-Lourmel'],
 ['Convention-Blomet', 'Convention-Vaugirard'],
 ['Convention-Lourmel', 'Convention-Felix_Faure'],
 ['Convention-Vaugirard', 'Convention-Olivier_de_Serres'],
 ['Rond_Point_Mirabeau', 'Convention-Gutemberg'],
 ['Lecourbe-Convention', 'Convention-Blomet'],
 ['Convention-Olivier_de_Serres', 'Place_Charles_Valin'],
 ['Pl_Concorde-Av_Champs_Elysees', 'Av_Champs_Elysees-Dutuit'],
 ['Av_Champs_Elysees-Colisee', 'Rond_Point_Champs_Elysees'],
 ['Av_Champs_Elysees-La_Boetie', 'Av_Champs_Elysees-Berri'],
 ['Av_Champs_Elysees-Berri', 'Av_Champs_Elysees-Washington'],
 ['Av_Champs_Elysees-Clemen

In [None]:
# Display the rows whose upstream node is "Convention-Blomet".
all_df.loc[all_df["noeud_amont"].eq("Convention-Blomet")]

In [None]:
# Keep a second name for the merged table.
# NOTE: this is an alias, not a copy - in-place changes to all_df are visible through orig too.
orig = all_df

In [None]:
# Save the merged table to all_df.csv in the working directory, without the index column.
all_df.to_csv("all_df.csv", index=False)

In [None]:
# select arcs to work with
# NOTE(review): `amont` and `aval` are not read in this cell; the selection comes from amont_aval.
amont = ["Sts_Peres-Voltaire"]
aval = ["Sts_Peres-Universite"]

amont_aval = [
    ['Lecourbe-Convention', 'Convention-Blomet'],
    ['Av_Champs_Elysees-Washington', 'Av_Champs_Elysees-Berri'],
    ['Sts_Peres-Voltaire', 'Sts_Peres-Universite'],
]

# Only the pairs from index 2 onward are used, i.e. the single Sts_Peres arc.
# One concat replaces DataFrame.append, which no longer exists in pandas 2.x.
df = pd.concat(
    [
        all_df[(all_df["noeud_amont"] == amont_arc) & (all_df["noeud_aval"] == aval_arc)]
        for amont_arc, aval_arc in amont_aval[2:]
    ]
    or [all_df.iloc[0:0]]  # nothing selected -> empty frame with the same columns
)
print(df['arc'].unique())
df = df.sort_values(by=["time"])
df

['Sts_Peres']


Unnamed: 0,arc,noeud_amont,noeud_aval,time,debit,occupation,etat_barre
7751,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T00:00:00+01:00,52.0,0.96056,3
7793,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T01:00:00+01:00,56.0,1.28056,3
7795,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T02:00:00+01:00,71.0,1.24667,3
7794,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T03:00:00+01:00,48.0,0.76333,3
242,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-01-01T04:00:00+01:00,69.0,1.23222,3
...,...,...,...,...,...,...,...
2585,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T20:00:00+01:00,759.0,11.37222,3
2584,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T21:00:00+01:00,720.0,10.49445,3
2583,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T22:00:00+01:00,571.0,8.91667,3
2587,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,2021-12-09T23:00:00+01:00,547.0,6.87500,3


In [None]:
# Replace the last six characters of every timestamp string (its UTC offset) with
# "+00:00". The clock time itself is left untouched.
# NOTE(review): presumably done so that all rows share a single offset - confirm.
df['time'] = df['time'].map(lambda stamp: stamp[:-6] + "+00:00")

In [None]:
# Historical series: read with `time` as the index, parse it to datetimes and
# order the rows chronologically.
old_df = pd.read_csv('raw_data/sts_peres-2014-2020.csv', index_col='time')
old_df.index = pd.to_datetime(old_df.index, utc=False)
old_df = old_df.sort_index()
old_df

Unnamed: 0_level_0,arc,noeud_amont,noeud_aval,debit,occupation,etat_barre
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01 01:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,486.0,6.17889,3
2014-01-01 02:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,571.0,6.66222,3
2014-01-01 03:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,532.0,5.56611,3
2014-01-01 04:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,458.0,4.63444,3
2014-01-01 05:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,386.0,3.94500,3
...,...,...,...,...,...,...
2020-12-31 20:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,338.0,4.53334,3
2020-12-31 21:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,195.0,2.34556,3
2020-12-31 22:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,129.0,1.80222,3
2020-12-31 23:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,77.0,1.18389,3


In [None]:
import pandas as pd

time_col = "time"
value_cols = ["debit", "occupation", "etat_barre"]

# Parse the timestamps, drop the timezone information and use the result as the index.
df[time_col] = pd.DatetimeIndex(pd.to_datetime(df[time_col])).tz_localize(None)
df = df.set_index(time_col)
df

Unnamed: 0_level_0,arc,noeud_amont,noeud_aval,debit,occupation,etat_barre
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01 00:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,52.0,0.96056,3
2021-01-01 01:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,56.0,1.28056,3
2021-01-01 02:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,71.0,1.24667,3
2021-01-01 03:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,48.0,0.76333,3
2021-01-01 04:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,69.0,1.23222,3
...,...,...,...,...,...,...
2021-12-09 20:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,759.0,11.37222,3
2021-12-09 21:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,720.0,10.49445,3
2021-12-09 22:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,571.0,8.91667,3
2021-12-09 23:00:00,Sts_Peres,Sts_Peres-Voltaire,Sts_Peres-Universite,547.0,6.87500,3


In [None]:
# Stack the new rows (minus the first one) under the historical frame.
# NOTE(review): the reason for skipping the first new row is not visible here - confirm.
# verify_integrity=True makes the concat raise if any timestamp appears in both frames,
# so re-running this cell on the already merged `df` raises instead of duplicating rows.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
df = pd.concat([old_df, df.iloc[1:]], verify_integrity=True)

# Display (not drop) any rows whose timestamp is repeated; an empty table is expected.
df[df.index.duplicated()]



Unnamed: 0_level_0,arc,noeud_amont,noeud_aval,debit,occupation,etat_barre
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [None]:
# Hourly series built from the merged frame; fill_missing_dates inserts the hours
# that are absent from the index.
ts = TimeSeries.from_dataframe(df, value_cols=value_cols, fill_missing_dates=True, freq='H')

In [None]:
# Summary statistics of the numeric columns of the merged frame.
df.describe()

Unnamed: 0,debit,occupation,etat_barre
count,67702.0,67702.0,67702.0
mean,470.721515,7.149994,2.97328
std,272.289232,6.283867,0.161265
min,0.0,0.0,2.0
25%,223.0,2.59834,3.0
50%,492.0,5.875,3.0
75%,687.0,9.46111,3.0
max,1393.0,57.6411,3.0


In [None]:
# Keep only the rows whose debit is strictly below 20000.
# Rows with a missing debit fail the comparison and are removed as well.
df = df.loc[df["debit"].lt(20000)]

In [None]:
# One dictionary per (upstream node, downstream node) pair, holding that pair's rows.
voisinages = [
    {'noeud_amont': pair[0], 'noeud_aval': pair[1], 'df': rows}
    for pair, rows in df.groupby(["noeud_amont", "noeud_aval"])
]


In [None]:
from darts import TimeSeries

# Attach an hourly TimeSeries to every arc dictionary, built from its frame.
# `ts` keeps pointing at the last series built so that it is displayed below.
for voi in voisinages:
    voi["ts"] = ts = TimeSeries.from_dataframe(
        voi["df"], value_cols=value_cols, fill_missing_dates=True, freq='H'
    )
ts

In [None]:
# Remove the "df" entry from every arc dictionary (raises KeyError if it is already gone).
for voi in voisinages:
    voi.pop("df")

In [None]:
# Persist the list of arc dictionaries to raw_data/voi-sts_peres.pkl with joblib.
import joblib
joblib.dump(voisinages, "raw_data/voi-sts_peres.pkl")

['raw_data/voi-sts_peres.pkl']