In [1]:
import pandas as pd
import seaborn as sns
import datetime

# Load the Washington dataset; the CSV export uses ';' as its field separator.
df_washington = pd.read_csv("data/washington.csv", sep=";")
# The two other datasets are currently disabled:
# df_convention = pd.read_csv("data/convention.csv", sep = ";")
# df_sts = pd.read_csv("data/sts.csv", sep = ";")

def filter(df,amont,aval):
    """Select the rows of ``df`` that match a given pair of node labels.

    NOTE: this function name shadows Python's built-in ``filter`` for the
    rest of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Libelle noeud amont" and
        "Libelle noeud aval".
    amont : value compared (``==``) against "Libelle noeud amont".
    aval : value compared (``==``) against "Libelle noeud aval".

    Returns
    -------
    pandas.DataFrame
        The rows where both columns match; the original index is kept.
    """
    matches_amont = df["Libelle noeud amont"] == amont
    matches_aval = df["Libelle noeud aval"] == aval
    return df[matches_amont & matches_aval]

def set_date(df):
    """Parse the "Date et heure de comptage" column of ``df`` into datetimes.

    The strings are expected in the form ``YYYY-MM-DDTHH:MM:SS±hh:mm``
    (format ``'%Y-%m-%dT%H:%M:%S%z'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column "Date et heure de comptage".

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except that the
        "Date et heure de comptage" column holds the parsed values.
        ``df`` itself is left untouched.
    """
    # Parse the column of the frame that was passed in. The previous version
    # read the notebook-level ``df_washington`` here, so it only worked when
    # called on that exact variable.
    parsed = pd.to_datetime(
        df["Date et heure de comptage"],
        format='%Y-%m-%dT%H:%M:%S%z',
    )
    # ``assign`` builds a new frame instead of writing into ``df``, which is
    # typically a filtered slice (avoids SettingWithCopyWarning).
    return df.assign(**{"Date et heure de comptage": parsed})


# Keep only the rows for one (amont, aval) node pair.
df_washington = filter(
    df_washington,
    amont="Av_Champs_Elysees-Washington",
    aval="Av_Champs_Elysees-Berri",
)
# Then convert the timestamp strings of the filtered frame to datetimes.
df_washington = set_date(df_washington)
# NOTE(review): the node labels in the two disabled lines below look swapped
# between the "sts" and "convention" datasets; verify before re-enabling.
# df_sts = filter(df_sts,"Lecourbe-Convention","Convention-Blomet")
# df_convention = filter(df_convention,"Sts_Peres-Voltaire","Sts_Peres-Universite")

def simplify_df(df):
    """Return only the columns of ``df`` used by the rest of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Libelle", "Date et heure de comptage",
        "Débit horaire" and "Taux d'occupation".

    Returns
    -------
    pandas.DataFrame
        An independent copy holding those four columns, in that order,
        with the original index.
    """
    kept_columns = ["Libelle", "Date et heure de comptage", "Débit horaire", "Taux d'occupation"]
    # The notebook adds new columns to the result right away. Returning an
    # explicit copy (rather than a bare column slice) keeps those writes from
    # raising SettingWithCopyWarning and from ever touching ``df``.
    return df[kept_columns].copy()

df_washington = simplify_df(df_washington)

# Derive the calendar date and the hour of day from every timestamp.
# The values are mapped one by one (no ``.dt`` accessor) because the parsed
# column may be of object dtype when the rows carry different UTC offsets.
df_washington["date"] = df_washington["Date et heure de comptage"].map(lambda ts: ts.date())
df_washington["heure"] = df_washington["Date et heure de comptage"].map(lambda ts: ts.hour)

df_washington.head()


Unnamed: 0,Libelle,Date et heure de comptage,Débit horaire,Taux d'occupation,date,heure
0,AV_Champs_Elysees,2020-11-02 05:00:00+01:00,144.0,2.25611,2020-11-02,5
1,AV_Champs_Elysees,2020-11-01 23:00:00+01:00,245.0,8.11222,2020-11-01,23
2,AV_Champs_Elysees,2020-07-01 16:00:00+02:00,1138.0,17.86611,2020-07-01,16
3,AV_Champs_Elysees,2020-07-01 22:00:00+02:00,880.0,16.03611,2020-07-01,22
4,AV_Champs_Elysees,2020-08-05 03:00:00+02:00,487.0,6.35389,2020-08-05,3


In [2]:
def baseline_1(df, start_date, end_date):
    """
    Naive baseline: repeat the last observed day over the prediction window.

    input :

    df : dataframe
    - with columns : [Libelle, Date et heure de comptage, Débit horaire,
      Taux d'occupation, date] (any extra column, e.g. heure, is carried
      over unchanged)
    - with all historic rows; "date" must hold datetime.date values

    start_date : datetime.date (format datetime.date(YYYY, MM, DD))
    end_date : datetime.date (format datetime.date(YYYY, MM, DD))

    output :
    dataframe
    - with the same columns as df
    - with one copy of the last observed day's rows for every day from
      start_date to end_date, both included
    - rows keep the index labels they had in df, so labels repeat once
      per predicted day

    Logic :
    The values of the day before start_date are predicted for all coming
    days.
    Ex : If we want to predict from 2020/12/05 to 2020/12/10, then each day
    will have the same values as 2020/12/04

    Notes :
    - Only the "date" column is rewritten; "Date et heure de comptage"
      still holds the timestamps of the source day.
    - start_date is always predicted, even if end_date is earlier.
    - If df has no row for the day before start_date, the result is empty.
    """
    number_of_days = (end_date - start_date).days

    last_day_date = start_date - datetime.timedelta(days=1)
    df_last_day = df[df["date"] == last_day_date]

    # start_date first, then one more day per day of distance to end_date.
    predicted_dates = [start_date] + [
        start_date + datetime.timedelta(days=offset)
        for offset in range(1, number_of_days + 1)
    ]

    # Build every daily frame first and concatenate once: growing the result
    # with pd.concat inside a loop re-copies all accumulated rows each time.
    # ``assign`` returns a fresh frame, so neither df nor df_last_day is
    # modified.
    daily_predictions = [df_last_day.assign(date=day) for day in predicted_dates]
    return pd.concat(daily_predictions)

In [3]:
# Example: forecast 2020-11-02 through 2020-11-05 from the values of 2020-11-01.
baseline_1(
    df_washington,
    start_date=datetime.date(2020, 11, 2),
    end_date=datetime.date(2020, 11, 5),
)

Unnamed: 0,Libelle,Date et heure de comptage,Débit horaire,Taux d'occupation,date,heure
1,AV_Champs_Elysees,2020-11-01 23:00:00+01:00,245.0,8.11222,2020-11-02,23
9,AV_Champs_Elysees,2020-11-01 12:00:00+01:00,116.0,5.46722,2020-11-02,12
18,AV_Champs_Elysees,2020-11-01 14:00:00+01:00,65.0,0.76278,2020-11-02,14
32,AV_Champs_Elysees,2020-11-01 20:00:00+01:00,358.0,6.20500,2020-11-02,20
89,AV_Champs_Elysees,2020-11-01 17:00:00+01:00,96.0,1.90055,2020-11-02,17
115,AV_Champs_Elysees,2020-11-01 11:00:00+01:00,92.0,8.63889,2020-11-02,11
119,AV_Champs_Elysees,2020-11-01 05:00:00+01:00,40.0,0.48389,2020-11-02,5
121,AV_Champs_Elysees,2020-11-01 09:00:00+01:00,239.0,3.13445,2020-11-02,9
132,AV_Champs_Elysees,2020-11-01 19:00:00+01:00,402.0,5.51445,2020-11-02,19
146,AV_Champs_Elysees,2020-11-01 18:00:00+01:00,326.0,3.87833,2020-11-02,18
