In [132]:
import pandas as pd
import seaborn as sns
import scipy.fftpack as fourier
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) used for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (15,6)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.svm import SVR
from sklearn import metrics

In [133]:
# Load the semicolon-delimited traffic export for the Washington dataset.
df_washington = pd.read_csv(filepath_or_buffer="data/washington.csv", delimiter=";")
# The two other datasets are currently not loaded:
# df_convention = pd.read_csv("data/convention.csv", sep = ";")
# df_sts = pd.read_csv("data/sts.csv", sep = ";")

def filter(df,amont,aval):
    """Select the rows describing one arc, identified by its two end nodes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the "Libelle noeud amont" and "Libelle noeud aval" columns.
    amont : str
        Label the upstream node ("Libelle noeud amont") must equal.
    aval : str
        Label the downstream node ("Libelle noeud aval") must equal.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` matching both labels, with their original index.
    """
    upstream_matches = df["Libelle noeud amont"] == amont
    downstream_matches = df["Libelle noeud aval"] == aval
    return df[upstream_matches & downstream_matches]

def set_date(df):
    """Parse the "Date et heure de comptage" column of ``df`` into datetimes.

    The conversion reads from the frame passed in, not from a notebook-level
    global, so the function works on any frame that has the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Date et heure de comptage" column of date strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the column is converted in place.
    """
    column = "Date et heure de comptage"
    df[column] = pd.to_datetime(df[column])
    return df

def remove_confinement(df):
    """Keep only the rows whose "date" lies in one of two retained windows.

    A row is kept when its "date" is strictly before 2020-03-16, or when it
    falls in the half-open interval [2020-05-25, 2020-10-31). Everything else
    is dropped (presumably the lockdown periods, going by the function name).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime "date" column.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index.
    """
    dates = df["date"]
    before_first_cutoff = dates < pd.Timestamp("2020-03-16")
    inside_second_window = (dates >= pd.Timestamp("2020-05-25")) & (dates < pd.Timestamp("2020-10-31"))
    return df[before_first_cutoff | inside_second_window]

# keep only the arc that interests us
df_washington = filter(df_washington,"Av_Champs_Elysees-Washington","Av_Champs_Elysees-Berri")
# convert date strings to datetime
df_washington = set_date(df_washington)
# expose the timestamp under a shorter column name
df_washington['date'] = df_washington['Date et heure de comptage']
# Build a frame with every hour in the date range, so that missing measurements
# show up as NaN rows instead of absent rows.
# Start bound: earliest observation that falls on a Monday at hour 5.
start_date = df_washington[
    (df_washington.date.dt.dayofweek == 0) & (df_washington.date.dt.hour == 5)
].date.min()
# End bound: latest observation that falls on a Sunday at hour 4.
# Each comparison needs its own parentheses: `&` binds tighter than `==`, so
# `dayofweek == 6 & (hour == 4)` evaluates `6 & (hour == 4)` first and the
# filter no longer tests for Sunday at hour 4.
# NOTE(review): the original comments described these bounds as
# "first Monday 00:00" / "last Sunday 23:00" while the code tests hours 5 and 4
# — confirm which hours are intended.
end_date = df_washington[
    (df_washington.date.dt.dayofweek == 6) & (df_washington.date.dt.hour == 4)
].date.max()
all_dates = pd.DataFrame(
    pd.date_range(start = start_date, end = end_date, freq='1H'),
    columns=['date']
)
all_dates = remove_confinement(all_dates)
# Right merge: one row per hour of `all_dates`, NaN where no measurement exists.
df_washington = df_washington.merge(all_dates, how='right', on='date')
df_washington = df_washington.sort_values(by = "date")
df_washington = df_washington.interpolate() # fill NaN by interpolation
df_washington = df_washington.reset_index(drop=True)
df_washington = df_washington.reset_index(drop=False) # add an index column to easily manipulate order of dates
#df_washington = remove_confinement(df_washington)
# df_sts = filter(df_sts,"Lecourbe-Convention","Convention-Blomet")
# df_convention = filter(df_convention,"Sts_Peres-Voltaire","Sts_Peres-Universite")

df_washington.head()

Unnamed: 0,index,Identifiant arc,Libelle,Date et heure de comptage,Débit horaire,Taux d'occupation,Etat trafic,Identifiant noeud amont,Libelle noeud amont,Identifiant noeud aval,Libelle noeud aval,Etat arc,Date debut dispo data,Date fin dispo data,geo_point_2d,geo_shape,date
0,0,4264.0,AV_Champs_Elysees,2019-11-04 05:00:00,398.0,3.66445,Fluide,2294.0,Av_Champs_Elysees-Washington,2293.0,Av_Champs_Elysees-Berri,Invalide,2005-01-01,2019-06-01,"48.8715358799,2.30172279246","{""type"": ""LineString"", ""coordinates"": [[2.3009...",2019-11-04 05:00:00
1,1,4264.0,AV_Champs_Elysees,2019-11-04 06:00:00,634.0,7.02945,Fluide,2294.0,Av_Champs_Elysees-Washington,2293.0,Av_Champs_Elysees-Berri,Invalide,2005-01-01,2019-06-01,"48.8715358799,2.30172279246","{""type"": ""LineString"", ""coordinates"": [[2.3009...",2019-11-04 06:00:00
2,2,4264.0,AV_Champs_Elysees,2019-11-04 07:00:00,1056.0,12.48611,Fluide,2294.0,Av_Champs_Elysees-Washington,2293.0,Av_Champs_Elysees-Berri,Invalide,2005-01-01,2019-06-01,"48.8715358799,2.30172279246","{""type"": ""LineString"", ""coordinates"": [[2.3009...",2019-11-04 07:00:00
3,3,4264.0,AV_Champs_Elysees,2019-11-04 08:00:00,1228.0,14.84222,Fluide,2294.0,Av_Champs_Elysees-Washington,2293.0,Av_Champs_Elysees-Berri,Invalide,2005-01-01,2019-06-01,"48.8715358799,2.30172279246","{""type"": ""LineString"", ""coordinates"": [[2.3009...",2019-11-04 08:00:00
4,4,4264.0,AV_Champs_Elysees,2019-11-04 09:00:00,1260.0,20.47389,Pré-saturé,2294.0,Av_Champs_Elysees-Washington,2293.0,Av_Champs_Elysees-Berri,Invalide,2005-01-01,2019-06-01,"48.8715358799,2.30172279246","{""type"": ""LineString"", ""coordinates"": [[2.3009...",2019-11-04 09:00:00


In [134]:
# Position-based calendar features: rows are hourly and ordered by date, so the
# running row number gives a week counter (blocks of 24*7 rows) and a day
# counter (blocks of 24 rows) within the dataset.
row_number = df_washington['index']
timestamps = df_washington['date']
df_washington['week'] = row_number // (24*7)
df_washington['day'] = row_number // 24
df_washington['dayofweek'] = timestamps.dt.dayofweek  # 0 = Monday, 1 = Tuesday...
df_washington['hour'] = timestamps.dt.hour

# Traffic-state flags
df_washington['invalide'] = df_washington['Etat arc'].eq('Invalide')
df_washington['fluide'] = df_washington['Etat trafic'].eq('Fluide')

# Total flow of each day, and the share of that total carried by each hour
df_washington['day_debit'] = df_washington.groupby('day')['Débit horaire'].transform('sum')
df_washington['percentage_debit'] = df_washington['Débit horaire'].div(df_washington['day_debit'])

# Drop the raw descriptive columns that are not used afterwards
unused_columns = [
    'Identifiant arc', 'Libelle', 'Date et heure de comptage', 'Etat trafic', 'Etat arc',
    "Identifiant noeud amont", "Libelle noeud amont", "Identifiant noeud aval", "Libelle noeud aval",
    "Date debut dispo data", "Date fin dispo data", "geo_point_2d", "geo_shape",
]
df_washington = df_washington.drop(columns=unused_columns)

In [135]:
# Add data about previous week
# Aggregate per week, then shift by one row so that the row for week w holds
# the aggregates of the preceding week (the first week gets NaN).
# `squeeze=True` was dropped from the groupby call: it had no effect on this
# aggregation and the parameter no longer exists in recent pandas versions.
df_week = df_washington.groupby('week').agg({
    'Débit horaire':'sum', 'fluide':'mean', 'invalide':'mean'
}).shift()
# The aggregated columns share names with hourly columns, so the joined copies
# receive the '_previous_week' suffix.
df_washington = df_washington.join(df_week, on='week', how='left', rsuffix='_previous_week')

In [137]:
# One row per day: the previous-week aggregates (constant within a day, hence
# 'min'), the day of week, and the day's total flow (the prediction target).
df_train = df_washington.groupby('day').agg(
    {
        'Débit horaire_previous_week':'min', 'fluide_previous_week':'min', 'invalide_previous_week':'min',
        'dayofweek':'min', 'Débit horaire':'sum'
    }
)
# Total flow of the day 7 rows earlier.
df_train['last_week_debit'] = df_train['Débit horaire'].shift(7)
# Drop the days for which a lagged feature is missing.
df_train = df_train.dropna()
prop_train = 0.50
idx_train = int(prop_train*len(df_train))
y = df_train['Débit horaire']
df_train = df_train.drop('Débit horaire', axis=1)

# Chronological split: the first `prop_train` share of days trains the model,
# the remaining days test it. The shuffled `train_test_split` call that used to
# precede these lines was removed because its result was immediately
# overwritten by this split.
X_tr = df_train.iloc[:idx_train]
X_te = df_train.iloc[idx_train:]
y_tr = y.iloc[:idx_train]
y_te = y.iloc[idx_train:]
print(X_tr.shape, X_te.shape)

(142, 5) (143, 5)


In [138]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardise the features; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_tr_scale = scaler.fit_transform(X_tr)
X_te_scale = scaler.transform(X_te)

# Project the standardised features onto principal components, again fitted on
# the training rows only.
pca = PCA()
X_tr_pca = pca.fit_transform(X_tr_scale)
X_te_pca = pca.transform(X_te_scale)

In [139]:
# Fit a random forest on the chronological training split. The seed is fixed so
# the figures below can be reproduced from a fresh kernel.
clf = RFR(random_state=42)
clf.fit(X_tr, y_tr)
pred = clf.predict(X_te)

# Side-by-side frame of observed and predicted daily totals, keyed by day.
df_pred = pd.DataFrame(y_te).rename(columns={'Débit horaire':'true'}).reset_index()
df_pred['pred'] = pred
df_pred = df_pred.sort_values('day')

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df_pred.day, y=df_pred['true'], name='true')
)
fig.add_trace(
    go.Scatter(x=df_pred.day, y=df_pred['pred'], name='pred')
)
fig.show()

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which is
# no longer accepted by recent scikit-learn releases.
print(float(np.sqrt(metrics.mean_squared_error(y_te, pred))))
print(metrics.mean_absolute_error(y_te, pred))

2588.393490400218
1886.5083719008262


In [85]:
# Hourly flow over time, one trace per value of the `fluide` flag.
px.line(
    data_frame=df_washington,
    x='date',
    y='Débit horaire',
    color='fluide',
)

In [86]:
# Line plot of the target series `y` against its index.
px.line(data_frame=y)

In [None]:
# Daily summary: total flow plus the share of hours flagged fluide / invalide.
daily_aggregations = {
    'dayofweek': 'min',
    'week': 'min',
    'Débit horaire': 'sum',
    'fluide': 'mean',
    'invalide': 'mean',
}
df_day = df_washington.groupby('day').agg(daily_aggregations).reset_index()
# Relabel each week by the smallest day number it contains, so the x axis is
# expressed in days.
df_day['week'] = df_day.groupby('week')['day'].transform('min')
px.line(
    df_day,
    x='week',
    y='Débit horaire',
    color='dayofweek',
    hover_data=['fluide', 'invalide', 'day'],
)

In [None]:
# Show the rows whose hourly flow equals 771.
df_washington.loc[df_washington["Débit horaire"].eq(771)]

In [None]:
# Same daily aggregation and per-weekday plot as the cell two cells above
# (the code is repeated verbatim in the notebook).
daily_aggregations = {
    'dayofweek': 'min',
    'week': 'min',
    'Débit horaire': 'sum',
    'fluide': 'mean',
    'invalide': 'mean',
}
df_day = df_washington.groupby('day').agg(daily_aggregations).reset_index()
# Relabel each week by the smallest day number it contains.
df_day['week'] = df_day.groupby('week')['day'].transform('min')
px.line(
    df_day,
    x='week',
    y='Débit horaire',
    color='dayofweek',
    hover_data=['fluide', 'invalide', 'day'],
)

In [None]:
def predict():
    """Placeholder for a prediction helper that has not been written yet.

    The original cell held only an unfinished ``def`` line, which is a syntax
    error; this stub keeps the name defined and fails loudly if it is called.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("predict() is not implemented yet")

In [None]:
# Hourly flow of the last n Saturdays
# NOTE(review): `df_saturday` and `n_saturdays` are not defined in any cell
# visible in this notebook — confirm where they come from before running.

last_saturdays = df_saturday.date[-n_saturdays:]
for day in last_saturdays:
    is_that_date = df_washington["date"] == day
    df_day = df_washington[is_that_date]
    # plt.plot(df_day["hour"],df_day["Taux d'occupation"]*100)
    plt.plot(df_day["hour"], df_day["Débit horaire"])

In [None]:
#######
# features
#######
# Map each feature name to the df_washington column it is copied from.
# NOTE(review): "Date et heure de comptage" and "Etat trafic" are removed from
# df_washington by the earlier column-dropping cell, so on a top-to-bottom run
# these lookups would presumably raise KeyError — confirm.
feature_sources = {
    "dt": "Date et heure de comptage",
    "debit": "Débit horaire",
    "occupation": "Taux d'occupation",
    "etat": "Etat trafic",
}
df_X = pd.DataFrame(
    {feature: df_washington[source] for feature, source in feature_sources.items()}
)
df_X

In [None]:
def etat_to_num(etat):
    """Encode a traffic-state label as an ordinal number.

    Parameters
    ----------
    etat : str
        One of "Fluide", "Pré-saturé", "Saturé" or "Bloqué".

    Returns
    -------
    int or float
        1 to 4 for the four known labels, in the order listed above. Any other
        value (including a missing value) yields ``float("nan")``, so the
        encoded column stays numeric instead of mixing in the string "Nan".
    """
    levels = {"Fluide": 1, "Pré-saturé": 2, "Saturé": 3, "Bloqué": 4}
    return levels.get(etat, float("nan"))
   

In [None]:
# Calendar features, taken from the vectorised `.dt` accessor instead of
# Python-level loops over the timestamps.
df_X["hour"] = df_X["dt"].dt.hour
df_X["weekday"] = df_X["dt"].dt.weekday
df_X["month"] = df_X["dt"].dt.month
# Rolling mean of the flow over the last 24*7 rows.
df_X["mean_week"] = df_X["debit"].rolling(24*7).mean()
# The same rolling mean, delayed by 24*5 rows (computed once and reused).
# NOTE(review): the column is called "past week" but the delay is 5 days, not
# 7 — confirm this is the intended horizon.
df_X["mean_past_week"] = df_X["mean_week"].shift(24*5)
# Ordinal encoding of the traffic-state label.
df_X["etat"] = df_X["etat"].map(etat_to_num)

In [None]:
# Number of rows per encoded traffic state.
df_X.etat.value_counts()

In [None]:
# Rolling weekly mean of the flow against time.
plt.plot(df_X["dt"], df_X["mean_week"])

In [None]:
# Skip the leading rows: the 24*7-row rolling window followed by the 24*5-row
# shift leaves `mean_past_week` undefined at the start of the frame.
warmup_rows = 24*7 + 24*5
df_fin = df_X.iloc[warmup_rows:]
# Model inputs and the two candidate targets (kept as one-column frames).
X = df_fin[["hour", "weekday", "mean_past_week"]]
Y_occupation = df_fin[["occupation"]]
Y_debit = df_fin[["debit"]]

In [None]:
# Count missing values per column, first in the target, then in the features.
for frame in (Y_occupation, X):
    print(frame.isna().sum())

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Chronological split: the first 60% of rows train the model, the rest test it.
i_split = int(len(X)*0.6)
# i_split=3200
# Train and test features are both kept as DataFrames, so the model is fitted
# and queried with the same feature names (the training set used to be
# converted to a bare array while the test set was not).
X_train = X.iloc[:i_split]
y_train = Y_occupation.iloc[:i_split]
X_test = X.iloc[i_split:]
y_test = Y_occupation.iloc[i_split:]

# Fixed seed so the fitted forest is reproducible from a fresh kernel.
model = RandomForestRegressor(min_samples_leaf=5, random_state=42)
# The target is a one-column frame; flatten it to the 1-D array that the
# regressor expects.
model.fit(X_train, y_train.to_numpy().ravel())
pred = model.predict(X_test)


In [None]:
# Test targets renumbered from 0.
y_test.reset_index()["occupation"]

In [None]:
# Whole occupation series by row position, followed by the split position.
plt.plot(Y_occupation.to_numpy())
print(i_split)

In [None]:
# Observed vs predicted occupation over the first 24*5 test rows.
plt.figure(figsize=(24,9))
n_hours = 24*5
# `y_test` keeps its original row labels while `pred` is a plain array, so the
# observed values are converted to an array: both curves then share the same
# 0..n x axis instead of being drawn on different ranges.
plt.plot(y_test.to_numpy().ravel()[:n_hours], label="true")
plt.plot(pred[:n_hours], label="pred")
plt.legend()
plt.show()

In [None]:
# Observed vs predicted occupation over the whole test period.
plt.figure(figsize=(25,10))
# `y_test` keeps its original row labels while `pred` is a plain array, so the
# observed values are converted to an array: both curves then share the same
# 0..n x axis instead of being drawn on different ranges.
plt.plot(y_test.to_numpy().ravel(), label="true")
plt.plot(pred, label="pred")
plt.legend()
plt.show()

In [None]:
# Predicted against observed occupation, drawn as blue crosses.
plt.plot(y_test, pred, "bx")
# Reference diagonal from (0, 0) to (50, 50).
diagonal = [0, 50]
plt.plot(diagonal, diagonal)


In [None]:
# Scratch check of slice semantics: everything from position 3 onwards.
list(range(6))[3:]

In [None]:
# Display the training targets for inspection.
y_train