# Trafic prediction

## Let's discover data first

imports

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder



In [2]:
traffic_conversion_v2 = {'low':1,
                      "normal":2, 
                      "high": 3,
                      "heavy": 4}

In [3]:
df = pd.read_csv("sources/Traffic.csv")
df.describe(include='object')


Unnamed: 0,Time,Day of the week,Traffic Situation
count,2976,2976,2976
unique,96,7,4
top,12:00:00 AM,Tuesday,normal
freq,31,480,1669


In [4]:
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


### HeatMap

In [5]:

traffic_order = ['low', 'normal',"high","heavy"]  
day_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
heatmap_df = df.copy()

heatmap_df['Day of the week'] = pd.Categorical(heatmap_df['Day of the week'], categories=day_order, ordered=True)
heatmap_df['Traffic Situation'] = pd.Categorical(heatmap_df['Traffic Situation'], categories=traffic_order, ordered=True)
heatmap_df = heatmap_df.sort_values(['Day of the week', 'Traffic Situation'])

grouped = heatmap_df.groupby(['Traffic Situation', 'Day of the week']).size().unstack(fill_value=0)

fig = px.imshow(grouped,
                labels=dict(x="Day of the week", y="Traffic Situation", color="Count"),
                x=grouped.columns,
                y=grouped.index,
                color_continuous_scale='Viridis')

fig.show()

### Time evolution 

#### Month scope

In [6]:
month_df_v2 = df.copy()

month_df_v2['Traffic Situation'] = month_df_v2['Traffic Situation'].map(traffic_conversion_v2)

month_df_v2['Traffic Situation'] = pd.to_numeric(month_df_v2['Traffic Situation'], errors='coerce')

month_df_v2['Time'] = pd.to_datetime(month_df_v2['Time'])

month_grouped_v2 = month_df_v2.groupby('Date')['Traffic Situation'].mean().reset_index()

fig2 = px.bar(month_grouped_v2, x='Date', y='Traffic Situation', title='Average traffic daily')
fig2.show()

#### Average traffic by hour

In [7]:
hourly_df = df.copy()
hourly_df['Traffic Situation'] = hourly_df['Traffic Situation'].map(traffic_conversion_v2)
hourly_df['Time'] = pd.to_datetime(hourly_df['Time'])
hourly_df = hourly_df.groupby('Time')['Traffic Situation'].mean().reset_index()
fig = px.bar(hourly_df, x='Time', y='Traffic Situation', title='Average traffic hourly')
fig.show()

### Vehicles impacts

In [8]:
vehicle_df = df.copy()


daily_average = vehicle_df.groupby('Date').mean().reset_index()
vehicle_df["Traffic Situation"] = vehicle_df['Traffic Situation'].map(traffic_conversion_v2)
columns_to_average = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount', 'Total','Traffic Situation']
daily_average_values = vehicle_df.groupby('Date')[columns_to_average].mean().reset_index()

fig = px.bar(daily_average_values, x='Date', y=['CarCount', 'BikeCount', 'BusCount', 'TruckCount'],
              title='Average daily vehicle count',
              labels={'value': 'Mean', 'Date': 'Date', 'variable': 'Vehicule Type'},
              )


fig.show()

fig2 = px.line(daily_average_values, x='Date', y="Traffic Situation")
fig2.show()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



##### Focus on Trucks 

In [9]:
trucks_df = df.copy()
trucks_df['Time'] = pd.to_datetime(trucks_df['Time'])
trucks_df = trucks_df.groupby('Time')['TruckCount'].mean().reset_index()

fig = px.bar(trucks_df, x='Time', y='TruckCount', title='Trucks means per hour')
fig.show()

## Some correlations with the output

In [10]:
corr_df = df.copy()
corr_df["Traffic Situation"] = corr_df['Traffic Situation'].map(traffic_conversion_v2)
methods = ["kendall","pearson","spearman"]
for column in corr_df.columns:
    for method in methods:
        if column not in ["Unnamed: 0","Time","Traffic Situation","Date","Day of the week"]:
            new_df =corr_df[[column,"Traffic Situation"]]
            value = corr_df["Traffic Situation"].corr(corr_df[column],method=method)
            print(f"La corrélation entre le traffic et {column}  avec la méthode {method} est de {value} ")
            
        
   
print("=====================")     
corr_df_2 = df.copy()
corr_df_2['Day of the week'] = pd.Categorical(df['Day of the week'], ordered=True)
corr_df_2['Traffic Situation'] = pd.Categorical(df['Traffic Situation'], ordered=True)
corr_df_2['Date'] = pd.Categorical(corr_df_2['Date'],ordered=True)
corr_df_2['Time'] = pd.Categorical(corr_df_2['Time'],ordered=True)
columns = ["Time","Date","Day of the week"]
for column in columns:
    for method in methods:
        try:
            value = corr_df_2["Traffic Situation"].corr(corr_df_2[column],method=method)
            print(f"The correlation between traffic and {column}  using the  {method} method is {value} ")
        except:
            print(f"Method : {method} impossible")


La corrélation entre le traffic et CarCount  avec la méthode kendall est de 0.5539955223221965 
La corrélation entre le traffic et CarCount  avec la méthode pearson est de 0.7462804547623046 
La corrélation entre le traffic et CarCount  avec la méthode spearman est de 0.6911675414283132 
La corrélation entre le traffic et BikeCount  avec la méthode kendall est de 0.4496208394590606 
La corrélation entre le traffic et BikeCount  avec la méthode pearson est de 0.6092848399666767 
La corrélation entre le traffic et BikeCount  avec la méthode spearman est de 0.5609309081145483 
La corrélation entre le traffic et BusCount  avec la méthode kendall est de 0.5555610761332743 
La corrélation entre le traffic et BusCount  avec la méthode pearson est de 0.7339146242435511 
La corrélation entre le traffic et BusCount  avec la méthode spearman est de 0.6887718292772953 
La corrélation entre le traffic et TruckCount  avec la méthode kendall est de -0.20817539711711971 
La corrélation entre le traffi


The input array could not be properly checked for nan values. nan values will be ignored.



## Correlation with input (with eachother)

In [11]:
matrice_df = df.copy()

liste_input = ["Time","Date","Day of the week","CarCount","BikeCount","BusCount","TruckCount","Total"]
methods = ["kendall","pearson","spearman"]
dict_matrice = {}

for method in methods:
    dict_matrice[method] = np.zeros((8,8))
    for i,input_i in enumerate(liste_input):
        for j,input_j in enumerate(liste_input):
            try:
                dict_matrice[method][i][j] = matrice_df[input_i].corr(matrice_df[input_j],method=method)
            except:
                dict_matrice[method][i][j] = 0
for method,matrice in dict_matrice.items():
    fig = px.imshow(matrice,title=f"Correlation matrix for inputs with method: {method}",
                    labels=dict(x="Inputs", y="Inputs", color="Correlation"),
                    x=liste_input,
                    y=liste_input)
    fig.show()
    

## Prediction

##### Encoded data

In [12]:


predict_df = pd.read_csv("sources/Traffic.csv")

# Colonne to change in numeric format
categorical_columns = ["Time", "Day of the week"]

# Encoded 
label_encoder = LabelEncoder()
predict_df_encoded = predict_df.copy()  


predict_df_encoded['Time'] = pd.to_datetime(predict_df['Time'])

predict_df_encoded['Hour'] = predict_df_encoded['Time'].dt.hour  # extract hour
predict_df_encoded['Minute'] = predict_df_encoded['Time'].dt.minute  # Extract minute

# AM_PO
predict_df_encoded['AM_PM'] = predict_df_encoded['Time'].dt.strftime('%p')
predict_df_encoded['AM_PM'] = predict_df_encoded['AM_PM'].map({'AM': 0, 'PM': 1})


predict_df_encoded["Day of the week" + '_encoded'] = label_encoder.fit_transform(predict_df["Day of the week"])


selected_features = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total", "Day of the week_encoded","Hour", "Minute", "AM_PM","Date","Traffic Situation"]
# selected_features = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total", "Day of the week_encoded","Date","Traffic Situation"]
# selected_features = ["CarCount", "BikeCount", "BusCount", "TruckCount", "Total","Traffic Situation"]

X = predict_df_encoded[selected_features]
#Now, X add numeric caracteristic and encoded one for training

#### Model (softmax cross  entropy)

In [13]:
# Splitting the data into training, testing, and evaluation sets

train_data = X[:int(0.8 * len(X))] 
test_data = X[int(0.8 * len(X)):int(0.95 * len(X))]  
eval_data = X[int(0.95 * len(X)):]  

X_train = train_data.drop('Traffic Situation', axis=1).values
y_train_encoded = label_encoder.fit_transform(train_data['Traffic Situation'])

X_test = test_data.drop('Traffic Situation', axis=1).values
y_test_encoded = label_encoder.transform(test_data['Traffic Situation'])

y_train_one_hot = tf.keras.utils.to_categorical(y_train_encoded)
y_test_one_hot = tf.keras.utils.to_categorical(y_test_encoded)

# Normalize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a simple perceptron model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation='softmax', input_shape=(X_train_scaled.shape[1],)),
])

# Model compilation
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model training
history = model.fit(X_train_scaled, y_train_one_hot, epochs=200, batch_size=16, validation_data=(X_test_scaled, y_test_one_hot), verbose=2)

# Prediction on testing set
y_pred = model.predict(X_test_scaled)

# Calculate the difference between predictions and true values
diff = np.argmax(y_pred, axis=1) - np.argmax(y_test_one_hot, axis=1)

# Display the loss function over epochs
fig_loss = px.line(
    x=np.arange(1, len(history.history['loss']) + 1),
    y=history.history['loss'],
    labels={'x': 'Epochs', 'y': 'Loss'},
    title='Loss function over epochs'
)
fig_loss.show()

# Display the difference between predictions and true values over time
fig_diff = px.line(
    x=np.arange(len(diff)),
    y=diff,
    labels={'x': 'Sample Index', 'y': 'Difference (Prediction - Real)'},
    title='Difference between predictions and true values over time'
)
fig_diff.show()
loss, accuracy = model.evaluate(X_test_scaled, y_test_one_hot)






Epoch 1/200
149/149 - 1s - loss: 1.3253 - accuracy: 0.3727 - val_loss: 0.9474 - val_accuracy: 0.6398 - 774ms/epoch - 5ms/step
Epoch 2/200
149/149 - 0s - loss: 0.9854 - accuracy: 0.5782 - val_loss: 0.7978 - val_accuracy: 0.7181 - 166ms/epoch - 1ms/step
Epoch 3/200
149/149 - 0s - loss: 0.8306 - accuracy: 0.6987 - val_loss: 0.7285 - val_accuracy: 0.7360 - 169ms/epoch - 1ms/step
Epoch 4/200
149/149 - 0s - loss: 0.7474 - accuracy: 0.7261 - val_loss: 0.6820 - val_accuracy: 0.7606 - 198ms/epoch - 1ms/step
Epoch 5/200
149/149 - 0s - loss: 0.6927 - accuracy: 0.7370 - val_loss: 0.6479 - val_accuracy: 0.7808 - 177ms/epoch - 1ms/step
Epoch 6/200
149/149 - 0s - loss: 0.6521 - accuracy: 0.7542 - val_loss: 0.6196 - val_accuracy: 0.7897 - 151ms/epoch - 1ms/step
Epoch 7/200
149/149 - 0s - loss: 0.6190 - accuracy: 0.7613 - val_loss: 0.5953 - val_accuracy: 0.7964 - 139ms/epoch - 933us/step
Epoch 8/200
149/149 - 0s - loss: 0.5917 - accuracy: 0.7697 - val_loss: 0.5734 - val_accuracy: 0.8054 - 135ms/epoch -



In [14]:
def test_model(car,bike,bus,truck,total,day,hour,minute,am_pm,date):
    manual_test_data = pd.DataFrame({
    "CarCount": [car],
    "BikeCount": [bike],
    "BusCount": [bus],
    "TruckCount": [truck],
    "Total": [total],
    "Day of the week_encoded": [day],  
    "Hour": [hour],
    "Minute": [minute],
    "AM_PM": [am_pm],  
    "Date": [date]   
    })

    manual_test_data_scaled = scaler.transform(manual_test_data)

    predictions_manual = model.predict(manual_test_data_scaled,verbose=0)
    if np.argmax(predictions_manual,axis=1)[0] == 0:
        return("heavy")
    elif np.argmax(predictions_manual,axis=1)[0] == 1:
        return("high")
    elif np.argmax(predictions_manual,axis=1)[0] == 2:
        return("low")
    elif np.argmax(predictions_manual,axis=1)[0] == 3:
        return("normal")   

cpt = 0
correct = 0
print("Evaluation of the model on evaluation data (different from training and test data)")

for index, row in eval_data.iterrows():
    real = row["Traffic Situation"]
    theo = test_model(row["CarCount"],row["BikeCount"],row["BusCount"],row["TruckCount"],
                      row["Total"],row["Day of the week_encoded"],row["Hour"],row["Minute"],row["AM_PM"],
                      row["Date"])
    if real == theo:
        correct+=1
    cpt+=1
    pourcentage = correct/cpt*100
print(f"The accuracy rate is : {pourcentage}% with {correct} correct results out of {cpt} tests conducted.")

Evaluation of the model on evaluation data (different from training and test data)



X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but StandardScaler was fitted without feature names


X has feature names, but

The accuracy rate is : 89.26174496644296% with 133 correct results out of 149 tests conducted.



X has feature names, but StandardScaler was fitted without feature names

