# Trafic prediction

## Let's discover data first

imports

In [18]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import numpy as np

In [19]:
traffic_conversion = {'low':1,
                      "normal":2, 
                      "high": 3,
                      "heavy": 4}

In [20]:
df = pd.read_csv("sources/Traffic.csv")
df.describe(include='object')


Unnamed: 0,Time,Day of the week,Traffic Situation
count,2976,2976,2976
unique,96,7,4
top,12:00:00 AM,Tuesday,normal
freq,31,480,1669


In [21]:
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


### HeatMap

In [22]:

traffic_order = ['low', 'normal',"high","heavy"]  
day_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
heatmap_df = df.copy()
# print(heatmap_df)
heatmap_df['Day of the week'] = pd.Categorical(heatmap_df['Day of the week'], categories=day_order, ordered=True)
heatmap_df['Traffic Situation'] = pd.Categorical(heatmap_df['Traffic Situation'], categories=traffic_order, ordered=True)
heatmap_df = heatmap_df.sort_values(['Day of the week', 'Traffic Situation'])

grouped = heatmap_df.groupby(['Traffic Situation', 'Day of the week']).size().unstack(fill_value=0)
# print(grouped)
fig = px.imshow(grouped,
                labels=dict(x="Day of the week", y="Traffic Situation", color="Count"),
                x=grouped.columns,
                y=grouped.index,
                color_continuous_scale='Viridis')

fig.show()

### Time evolution 

#### Month scope

In [23]:

month_df = df.copy()
# print(month_df.head())

month_df['Traffic Situation'] = month_df['Traffic Situation'].map(traffic_conversion)

month_df['Traffic Situation'] = pd.to_numeric(month_df['Traffic Situation'], errors='coerce')

month_df['Time'] = pd.to_datetime(month_df['Time'])

month_grouped = month_df.groupby('Date')['Traffic Situation'].mean().reset_index()

fig = px.bar(month_grouped, x='Date', y='Traffic Situation', title='Moyenne de Traffic Situation par jour')
fig.show()


traffic_conversion_v2 = {'low':0,
                      "normal":1, 
                      "high":2,
                      "heavy": 3}



month_df_v2 = df.copy()
# print(month_df_v2.head())

month_df_v2['Traffic Situation'] = month_df_v2['Traffic Situation'].map(traffic_conversion_v2)

month_df_v2['Traffic Situation'] = pd.to_numeric(month_df_v2['Traffic Situation'], errors='coerce')

month_df_v2['Time'] = pd.to_datetime(month_df_v2['Time'])

month_grouped_v2 = month_df_v2.groupby('Date')['Traffic Situation'].mean().reset_index()

fig2 = px.bar(month_grouped_v2, x='Date', y='Traffic Situation', title='Moyenne de Traffic Situation par jour')
fig2.show()

Make a choice between this two

#### Average traffic by hour

In [24]:
hourly_df = df.copy()
hourly_df['Traffic Situation'] = hourly_df['Traffic Situation'].map(traffic_conversion)
hourly_df['Time'] = pd.to_datetime(hourly_df['Time'])
hourly_df = hourly_df.groupby('Time')['Traffic Situation'].mean().reset_index()
fig = px.bar(hourly_df, x='Time', y='Traffic Situation', title='Moyenne de Traffic Situation par heure')
fig.show()

### Vehicles impacts

In [25]:
vehicle_df = df.copy()


daily_average = vehicle_df.groupby('Date').mean().reset_index()
vehicle_df["Traffic Situation"] = vehicle_df['Traffic Situation'].map(traffic_conversion_v2)
columns_to_average = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount', 'Total','Traffic Situation']
daily_average_values = vehicle_df.groupby('Date')[columns_to_average].mean().reset_index()


# print(daily_average_values["Traffic Situation"])
fig = px.bar(daily_average_values, x='Date', y=['CarCount', 'BikeCount', 'BusCount', 'TruckCount'],
              title='Moyenne quotidienne du nombre de véhicules',
              labels={'value': 'Moyenne', 'Date': 'Date', 'variable': 'Type de véhicule'},
              )


fig.show()

fig2 = px.line(daily_average_values, x='Date', y="Traffic Situation")
fig2.show()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



##### Trucks ? 

In [26]:
trucks_df = df.copy()
trucks_df['Time'] = pd.to_datetime(trucks_df['Time'])
trucks_df = trucks_df.groupby('Time')['TruckCount'].mean().reset_index()

fig = px.bar(trucks_df, x='Time', y='TruckCount', title='Trucks means per hour')
fig.show()

## Some correlations with the output

In [27]:
corr_df = df.copy()
corr_df["Traffic Situation"] = corr_df['Traffic Situation'].map(traffic_conversion)
methods = ["kendall","pearson","spearman"]
for column in corr_df.columns:
    for method in methods:
        if column not in ["Unnamed: 0","Time","Traffic Situation","Date","Day of the week"]:
            new_df =corr_df[[column,"Traffic Situation"]]
            value = corr_df["Traffic Situation"].corr(corr_df[column],method=method)
            print(f"La corrélation entre le traffic et {column}  avec la méthode {method} est de {value} ")
            
        
   
print("=====================")     
corr_df_2 = df.copy()
corr_df_2['Day of the week'] = pd.Categorical(df['Day of the week'], ordered=True)
corr_df_2['Traffic Situation'] = pd.Categorical(df['Traffic Situation'], ordered=True)
corr_df_2['Date'] = pd.Categorical(corr_df_2['Date'],ordered=True)
corr_df_2['Time'] = pd.Categorical(corr_df_2['Time'],ordered=True)
columns = ["Time","Date","Day of the week"]
for column in columns:
    for method in methods:
        try:
            value = corr_df_2["Traffic Situation"].corr(corr_df_2[column],method=method)
            print(f"La corrélation entre le traffic et {column}  avec la méthode {method} est de {value} ")
        except:
            print(f"Methode : {method} impossible")


La corrélation entre le traffic et CarCount  avec la méthode kendall est de 0.5539955223221965 
La corrélation entre le traffic et CarCount  avec la méthode pearson est de 0.7462804547623046 
La corrélation entre le traffic et CarCount  avec la méthode spearman est de 0.6911675414283132 
La corrélation entre le traffic et BikeCount  avec la méthode kendall est de 0.4496208394590606 
La corrélation entre le traffic et BikeCount  avec la méthode pearson est de 0.6092848399666767 
La corrélation entre le traffic et BikeCount  avec la méthode spearman est de 0.5609309081145483 
La corrélation entre le traffic et BusCount  avec la méthode kendall est de 0.5555610761332743 
La corrélation entre le traffic et BusCount  avec la méthode pearson est de 0.7339146242435511 
La corrélation entre le traffic et BusCount  avec la méthode spearman est de 0.6887718292772953 
La corrélation entre le traffic et TruckCount  avec la méthode kendall est de -0.20817539711711971 
La corrélation entre le traffi


The input array could not be properly checked for nan values. nan values will be ignored.



## Correlation with input (with eachother)

In [28]:
matrice_df = df.copy()

liste_input = ["Time","Date","Day of the week","CarCount","BikeCount","BusCount","TruckCount","Total"]
methods = ["kendall","pearson","spearman"]
dict_matrice = {}

for method in methods:
    dict_matrice[method] = np.zeros((8,8))
    for i,input_i in enumerate(liste_input):
        for j,input_j in enumerate(liste_input):
            try:
                dict_matrice[method][i][j] = matrice_df[input_i].corr(matrice_df[input_j],method=method)
            except:
                dict_matrice[method][i][j] = 0
                # print(f"Méthode : {method} ne marche pas")
for method,matrice in dict_matrice.items():
    fig = px.imshow(matrice,title=f"Correlation matrix for inputs with method: {method}",
                    labels=dict(x="Inputs", y="Inputs", color="Correlation"),
                    x=liste_input,
                    y=liste_input)
    fig.show()
    

## Prediction

##### Encoded data

In [29]:
from sklearn.preprocessing import LabelEncoder

predict_df = pd.read_csv("sources/Traffic.csv")

# Colonne to change in numeric format
categorical_columns = ["Time", "Day of the week"]

# Encoded 
label_encoder = LabelEncoder()
predict_df_encoded = predict_df.copy()  


predict_df_encoded['Time'] = pd.to_datetime(predict_df['Time'])

predict_df_encoded['Hour'] = predict_df_encoded['Time'].dt.hour  # extract hour
predict_df_encoded['Minute'] = predict_df_encoded['Time'].dt.minute  # Extract minute

# AM_PO
predict_df_encoded['AM_PM'] = predict_df_encoded['Time'].dt.strftime('%p')
predict_df_encoded['AM_PM'] = predict_df_encoded['AM_PM'].map({'AM': 0, 'PM': 1})


predict_df_encoded["Day of the week" + '_encoded'] = label_encoder.fit_transform(predict_df["Day of the week"])


selected_features = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total", "Day of the week_encoded","Hour", "Minute", "AM_PM","Date","Traffic Situation"]
# selected_features = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total", "Day of the week_encoded","Date","Traffic Situation"]
# selected_features = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total","Traffic Situation"]

X = predict_df_encoded[selected_features]
print(X)
# Maintenant, X contient à la fois des caractéristiques numériques et encodées pour l'entraînement



    



      CarCount  BikeCount  BusCount  TruckCount  Total  \
0           31          0         4           4     39   
1           49          0         3           3     55   
2           46          0         3           6     55   
3           51          0         2           5     58   
4           57          6        15          16     94   
...        ...        ...       ...         ...    ...   
2971        16          3         1          36     56   
2972        11          0         1          30     42   
2973        15          4         1          25     45   
2974        16          5         0          27     48   
2975        14          3         1          15     33   

      Day of the week_encoded  Hour  Minute  AM_PM  Date Traffic Situation  
0                           5     0       0      0    10               low  
1                           5     0      15      0    10               low  
2                           5     0      30      0    10               l

#### Neuronal network

In [30]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import plotly.express as px



# Diviser les données en ensemble d'entraînement et ensemble de test
train_data = X[:int(0.8 * len(X))] 
test_data = X[int(0.8 * len(X)):int(0.95 * len(X))]  
eval_data = X[int(0.95 * len(X)):]  

X_train = train_data.drop('Traffic Situation', axis=1).values
y_train_encoded = label_encoder.fit_transform(train_data['Traffic Situation'])

X_test = test_data.drop('Traffic Situation', axis=1).values
y_test_encoded = label_encoder.transform(test_data['Traffic Situation'])

y_train_one_hot = tf.keras.utils.to_categorical(y_train_encoded)
y_test_one_hot = tf.keras.utils.to_categorical(y_test_encoded)

# Normaliser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Créer un modèle de perceptron simple
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation='softmax', input_shape=(X_train_scaled.shape[1],)),
])

# Compiler le modèle
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Entraîner le modèle
#AUGMENTER EPOCHS AMELIORE au dessus de 100 pas fou fou 
#Expliquer ce qu'est Epochs,batch_size et verbose
history = model.fit(X_train_scaled, y_train_one_hot, epochs=200, batch_size=16, validation_data=(X_test_scaled, y_test_one_hot), verbose=2)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test_scaled)

# Calculer la différence entre les prédictions et les vraies valeurs
diff = np.argmax(y_pred, axis=1) - np.argmax(y_test_one_hot, axis=1)

# Afficher la fonction de coût (loss) au fil des époques
fig_loss = px.line(
    x=np.arange(1, len(history.history['loss']) + 1),
    y=history.history['loss'],
    labels={'x': 'Epochs', 'y': 'Loss'},
    title='Fonction de coût (Loss) au fil des époques'
)
fig_loss.show()

# Afficher la différence entre prédictions et vraies valeurs au fil du temps
fig_diff = px.line(
    x=np.arange(len(diff)),
    y=diff,
    labels={'x': 'Sample Index', 'y': 'Difference (Prediction - Real)'},
    title='Différence entre prédictions et vraies valeurs au fil du temps'
)
fig_diff.show()
loss, accuracy = model.evaluate(X_test_scaled, y_test_one_hot)






Epoch 1/200
149/149 - 1s - loss: 2.0494 - accuracy: 0.2643 - val_loss: 1.5495 - val_accuracy: 0.4027 - 744ms/epoch - 5ms/step
Epoch 2/200
149/149 - 0s - loss: 1.4324 - accuracy: 0.3819 - val_loss: 1.1087 - val_accuracy: 0.5391 - 244ms/epoch - 2ms/step
Epoch 3/200
149/149 - 0s - loss: 1.0790 - accuracy: 0.5752 - val_loss: 0.8770 - val_accuracy: 0.7025 - 243ms/epoch - 2ms/step
Epoch 4/200
149/149 - 0s - loss: 0.9022 - accuracy: 0.6891 - val_loss: 0.7618 - val_accuracy: 0.7405 - 243ms/epoch - 2ms/step
Epoch 5/200
149/149 - 0s - loss: 0.8065 - accuracy: 0.7126 - val_loss: 0.6991 - val_accuracy: 0.7673 - 239ms/epoch - 2ms/step
Epoch 6/200
149/149 - 0s - loss: 0.7440 - accuracy: 0.7244 - val_loss: 0.6566 - val_accuracy: 0.7740 - 241ms/epoch - 2ms/step
Epoch 7/200
149/149 - 0s - loss: 0.6973 - accuracy: 0.7345 - val_loss: 0.6242 - val_accuracy: 0.7852 - 238ms/epoch - 2ms/step
Epoch 8/200
149/149 - 0s - loss: 0.6591 - accuracy: 0.7403 - val_loss: 0.5978 - val_accuracy: 0.7964 - 243ms/epoch - 2



In [35]:
manual_test_data = pd.DataFrame({
    "CarCount": [14],
    "BikeCount": [12],
    "BusCount": [0],
    "TruckCount": [0],
    "Total": [0],
    "Day of the week_encoded": [3],  
    "Hour": [5],
    "Minute": [0],
    "AM_PM": [1],  
    "Date": [25]  
})


caca_dict = {0:0,
             1:0,
             2:0,
             3:0}

pipi_dict = {"low":0,
             "normal":0,
             "high":0,
             "heavy":0}



    



def test_model(car,bike,bus,truck,total,day,hour,minute,am_pm,date):
    manual_test_data = pd.DataFrame({
    "CarCount": [car],
    "BikeCount": [bike],
    "BusCount": [bus],
    "TruckCount": [truck],
    "Total": [total],
    "Day of the week_encoded": [day],  
    "Hour": [hour],
    "Minute": [minute],
    "AM_PM": [am_pm],  
    "Date": [date]   
    })

    manual_test_data_scaled = scaler.transform(manual_test_data)

    predictions_manual = model.predict(manual_test_data_scaled,verbose=0)
    if np.argmax(predictions_manual,axis=1)[0] == 0:
        return("heavy")
    elif np.argmax(predictions_manual,axis=1)[0] == 1:
        return("high")
    elif np.argmax(predictions_manual,axis=1)[0] == 2:
        return("low")
    elif np.argmax(predictions_manual,axis=1)[0] == 3:
        return("normal")   

cpt = 0
correct = 0
print("Evaluation de du modèle sur des données d'évaluation (différentes des données réelles)")

for index, row in eval_data.iterrows():
    real = row["Traffic Situation"]
    theo = test_model(row["CarCount"],row["BikeCount"],row["BusCount"],row["TruckCount"],
                      row["Total"],row["Day of the week_encoded"],row["Hour"],row["Minute"],row["AM_PM"],
                      row["Date"])
    if real == theo:
        correct+=1
    cpt+=1
    pourcentage = correct/cpt*100
print(f"Pourcentage de résultat juste : {pourcentage}%")
    


Evaluation de du modèle sur des données d'évaluation (différentes des données réelles)



X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but

Pourcentage de résultat juste : 89.93288590604027%



X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names

