In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [9]:
# Selection of the monthly per-road export to load.
road, direction, month = 1, "in", "02"

# Read that export, parsing the "datetime" column as timestamps.
df = pd.read_csv(
    filepath_or_buffer=f"../dataset/2020-{month}-road-{road}-{direction}.csv",
    parse_dates=["datetime"],
)
df


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime
0,2020-02-01 00:00:34,1,19,in,0,0,1,0,0,0,0,0
1,2020-02-01 00:00:34,1,26,in,1,0,0,1,71,71,0,0
2,2020-02-01 00:00:34,1,28,in,3,1,1,2,41,48,0,0
3,2020-02-01 00:00:34,1,29,in,3,3,3,0,75,108,0,0
4,2020-02-01 00:00:34,1,30,in,5,5,3,0,32,56,62,66
...,...,...,...,...,...,...,...,...,...,...,...,...
22694507,2020-02-29 23:59:14,1,939,in,1,0,0,1,20,20,0,0
22694508,2020-02-29 23:59:14,1,941,in,0,0,1,0,0,0,0,0
22694509,2020-02-29 23:59:14,1,959,in,1,1,0,0,47,47,60,60
22694510,2020-02-29 23:59:14,1,960,in,0,0,1,0,0,0,0,0


In [10]:
# Lookup table used by map_traffic_with_latlon, which reads its rd, km, lat and lon columns.
df_km127 = pd.read_csv(filepath_or_buffer="../dataset/latlon_km127.csv")


In [11]:
import joblib

# Load the pre-trained classifier matching the road/direction selected when `df` was loaded.
# The file name is built from `road` and `direction` so the model cannot silently drift
# out of sync with the data; for road=1, direction="in" it resolves to the same
# '../Models/RandomForestModelRoad1in.joblib' path as before.
# NOTE(review): assumes every model file follows the Road{road}{direction} naming — confirm.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
dtree_model = joblib.load(f'../Models/RandomForestModelRoad{road}{direction}.joblib')
dtree_model


RandomForestClassifier()

In [12]:
def map_traffic_with_latlon(df, latlon=None):
    """Attach 'lat' and 'lon' columns to ``df`` by looking up (road_number, km).

    Each row of ``df`` is matched against the lookup table on
    ``road_number == rd`` and ``km == km``. When several lookup rows share the
    same (rd, km) pair the first one is used; rows without any match get 0 for
    both coordinates.

    The lookup is done with a single left merge instead of re-filtering the
    whole lookup table for every row, so the cost no longer grows with
    len(df) * len(lookup).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'road_number' and 'km' columns. It is modified in place
        (the 'lat' and 'lon' columns are added or overwritten).
    latlon : pandas.DataFrame, optional
        Lookup table with 'rd', 'km', 'lat' and 'lon' columns. Defaults to the
        notebook-level ``df_km127`` frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'lat' and 'lon' columns set.
    """
    if latlon is None:
        latlon = df_km127

    # One row per (rd, km) key, keeping the first occurrence, so the left merge
    # below yields exactly one output row per row of df.
    lookup = (
        latlon[['rd', 'km', 'lat', 'lon']]
        .drop_duplicates(subset=['rd', 'km'], keep='first')
        .rename(columns={'rd': 'road_number'})
    )

    # A left merge keeps the row order of df; the indicator column records
    # which rows actually found a match in the lookup table.
    coords = df[['road_number', 'km']].merge(
        lookup, on=['road_number', 'km'], how='left', indicator=True)
    found = coords['_merge'].eq('both')

    # Assign positionally (to_numpy) because the merge result carries a fresh
    # RangeIndex that need not line up with df's own index.
    df['lat'] = coords['lat'].where(found, 0).to_numpy()
    df['lon'] = coords['lon'].where(found, 0).to_numpy()
    return df


In [13]:
# Column names for current_celldata.csv, supplied explicitly through `names=`.
header_list = ['datetime', 'road_number', 'km', 'direction', 'all_units', 'inflow_units',
               'outflow_unit', 'samecell_units', 'avg_speed', 'max_speed', 'avg_traveltime', 'max_traveltime']
df_test = pd.read_csv("../dataset/current_celldata.csv",
                      names=header_list, parse_dates=["datetime"])
# Keep only roads 1, 2 and 7. The explicit .copy() makes the filtered frame
# independent of the full read, so the in-place column assignment done by
# map_traffic_with_latlon does not raise SettingWithCopyWarning.
df_test = df_test[df_test['road_number'].isin([1, 2, 7])].copy()
df_test = map_traffic_with_latlon(df_test)
df_test


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,lat,lon
0,2022-03-01 17:01:46,1,19,in,0,0,1,0,0,0,0,0,13.879176,100.598832
1,2022-03-01 17:01:46,1,25,in,0,0,1,0,0,0,0,0,13.926152,100.626455
2,2022-03-01 17:01:46,1,25,out,5,4,1,1,24,64,67,90,13.926152,100.626455
3,2022-03-01 17:01:46,1,26,in,4,3,1,1,50,63,69,90,13.935066,100.625095
4,2022-03-01 17:01:46,1,26,out,4,3,5,1,50,63,69,90,13.935066,100.625095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4088,2022-03-01 17:01:46,7,121,in,0,0,1,0,0,0,0,0,12.951311,100.947536
4089,2022-03-01 17:01:46,7,122,in,2,2,0,0,31,32,58,59,12.950342,100.938541
4090,2022-03-01 17:01:46,7,122,out,1,1,0,0,89,89,29,29,12.950342,100.938541
4091,2022-03-01 17:01:46,7,123,in,0,0,1,0,0,0,0,0,12.947607,100.929908


In [14]:
# Earlier feature list, kept for reference only: it additionally contains
# 'avg_traveltime', which the active num_cols definition in the next cell omits.
# num_cols = ['all_units', 'inflow_units',
#             'avg_speed', 'max_speed',
#             'avg_traveltime', 'max_traveltime']
# num_cols


In [15]:
# Columns of the traffic frame that are passed to the model as features.
num_cols = [
    'all_units',
    'inflow_units',
    'avg_speed',
    'max_speed',
    'max_traveltime',
]
num_cols


['all_units', 'inflow_units', 'avg_speed', 'max_speed', 'max_traveltime']

In [16]:
# Feature matrix: every row of df, restricted to the num_cols columns in that order.
x_predict = df.loc[:, num_cols]
x_predict


Unnamed: 0,all_units,inflow_units,avg_speed,max_speed,max_traveltime
0,0,0,0,0,0
1,1,0,71,71,0
2,3,1,41,48,0
3,3,3,75,108,0
4,5,5,32,56,66
...,...,...,...,...,...
22694507,1,0,20,20,0
22694508,0,0,0,0,0
22694509,1,1,47,47,60
22694510,0,0,0,0,0


In [17]:
# Predict one label per row of x_predict with the loaded model.
y_predict = dtree_model.predict(X=x_predict)
y_predict


array([1, 0, 2, ..., 4, 1, 0], dtype=int64)

In [18]:
# Store the predicted labels on df as a new "cluster" column.
# Note: this modifies df in place; re-running the cell simply overwrites the column.
df["cluster"] = y_predict
df


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,cluster
0,2020-02-01 00:00:34,1,19,in,0,0,1,0,0,0,0,0,1
1,2020-02-01 00:00:34,1,26,in,1,0,0,1,71,71,0,0,0
2,2020-02-01 00:00:34,1,28,in,3,1,1,2,41,48,0,0,2
3,2020-02-01 00:00:34,1,29,in,3,3,3,0,75,108,0,0,0
4,2020-02-01 00:00:34,1,30,in,5,5,3,0,32,56,62,66,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22694507,2020-02-29 23:59:14,1,939,in,1,0,0,1,20,20,0,0,2
22694508,2020-02-29 23:59:14,1,941,in,0,0,1,0,0,0,0,0,1
22694509,2020-02-29 23:59:14,1,959,in,1,1,0,0,47,47,60,60,4
22694510,2020-02-29 23:59:14,1,960,in,0,0,1,0,0,0,0,0,1


In [19]:
# Column-wise maximum over the rows whose predicted cluster is 1.
df.loc[df["cluster"].eq(1)].max(axis=0)


datetime          2020-02-29 23:59:14
road_number                         1
km                                993
direction                          in
all_units                         104
inflow_units                       27
outflow_unit                       29
samecell_units                     84
avg_speed                          17
max_speed                         149
avg_traveltime                    989
max_traveltime                    989
cluster                             1
dtype: object

In [29]:
# Rows predicted as cluster 0 or 1 whose timestamp lies strictly between
# 2020-02-17 11:25 and 2020-02-17 11:35 (both bounds excluded).
df_acc = df.loc[
    df["cluster"].isin([0, 1])
    & df["datetime"].gt(pd.Timestamp(2020, 2, 17, 11, 25))
    & df["datetime"].lt(pd.Timestamp(2020, 2, 17, 11, 35))
]
df_acc


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,cluster
12755652,2020-02-17 11:25:49,1,19,in,0,0,2,0,0,0,0,0,1
12755653,2020-02-17 11:25:49,1,25,in,0,0,6,0,0,0,0,0,1
12755655,2020-02-17 11:25:49,1,27,in,4,2,1,2,16,48,0,0,1
12755668,2020-02-17 11:25:49,1,40,in,7,6,8,1,62,85,54,60,0
12755681,2020-02-17 11:25:49,1,53,in,0,0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12761399,2020-02-17 11:34:00,1,977,in,1,1,0,0,77,77,60,60,0
12761400,2020-02-17 11:34:00,1,978,in,1,1,1,0,69,69,56,56,0
12761402,2020-02-17 11:34:00,1,980,in,0,0,2,0,0,0,0,0,1
12761405,2020-02-17 11:34:00,1,984,in,0,0,1,0,0,0,0,0,1


In [33]:
# Rows of the selected window that belong to km 26.
df_acc.loc[df_acc['km'].eq(26)]


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,cluster
