In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# Parameters selecting which monthly traffic extract to load; they are
# interpolated into the CSV file name below.
road, direction, month = 2, "in", "05"

# Read the extract, parsing the "datetime" column into timestamps.
df = pd.read_csv(
    filepath_or_buffer=f"../dataset/2020-{month}-road-{road}-{direction}.csv",
    parse_dates=["datetime"],
)
df


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime
0,2020-05-01 00:00:05,2,1,in,1,0,0,1,39,39,0,0
1,2020-05-01 00:00:05,2,6,in,2,1,0,1,60,61,66,66
2,2020-05-01 00:00:05,2,7,in,0,0,1,0,0,0,0,0
3,2020-05-01 00:00:05,2,8,in,1,1,0,0,91,91,41,41
4,2020-05-01 00:00:05,2,9,in,2,1,0,1,68,74,52,52
...,...,...,...,...,...,...,...,...,...,...,...,...
16891732,2020-05-31 23:59:04,2,461,in,0,0,1,0,0,0,0,0
16891733,2020-05-31 23:59:04,2,466,in,1,1,0,0,56,56,63,63
16891734,2020-05-31 23:59:04,2,467,in,0,0,1,0,0,0,0,0
16891735,2020-05-31 23:59:04,2,493,in,0,0,1,0,0,0,0,0


In [3]:
# Coordinate lookup table; map_traffic_with_latlon reads its
# 'rd', 'km', 'lat' and 'lon' columns.
df_km127 = pd.read_csv(filepath_or_buffer="../dataset/latlon_km127.csv")

In [4]:
import joblib

# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load model artifacts that come from a trusted source.
# The path is a plain string literal: it contains no placeholders, so the
# f-string prefix the cell used before was unnecessary.
dtree_model = joblib.load('../Models/DecisionTreeModelRoad2in.joblib')
dtree_model

DecisionTreeClassifier()

In [5]:
def map_traffic_with_latlon(df, latlon=None):
    """Attach ``lat`` and ``lon`` columns to traffic rows via a (road, km) lookup.

    Every row of ``df`` is matched against the coordinate table on
    ``road_number == rd`` and ``km == km``. If several table rows match, the
    first one in table order is used. Rows without any match get ``0`` for
    both ``lat`` and ``lon``.

    The lookup is done with a single left merge instead of filtering the whole
    coordinate table several times per traffic row, so the cost no longer
    grows with (number of rows) x (size of the table).

    Parameters
    ----------
    df : pandas.DataFrame
        Traffic rows; must contain ``road_number`` and ``km`` columns.
        The frame is modified in place (``lat``/``lon`` are added or
        overwritten). It may be empty.
    latlon : pandas.DataFrame, optional
        Coordinate table with ``rd``, ``km``, ``lat`` and ``lon`` columns.
        When omitted, the module-level ``df_km127`` is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    table = df_km127 if latlon is None else latlon

    lookup = (
        table[['rd', 'km', 'lat', 'lon']]
        # Equality never matches a missing key, but a merge would pair NaN
        # with NaN, so rows with a missing key are removed from the lookup.
        .dropna(subset=['rd', 'km'])
        # Keep only the first coordinate per (rd, km); this also guarantees
        # the merge cannot multiply rows of df.
        .drop_duplicates(subset=['rd', 'km'], keep='first')
        .rename(columns={'rd': 'road_number'})
    )

    # A left merge keeps one output row per input row, in the input order.
    matched = df[['road_number', 'km']].merge(
        lookup, on=['road_number', 'km'], how='left', indicator=True)
    found = matched['_merge'] == 'both'

    # Use 0 only where no table row matched. Assign positionally
    # (to_numpy) because df may carry a non-default index.
    df['lat'] = matched['lat'].where(found, 0).to_numpy()
    df['lon'] = matched['lon'].where(found, 0).to_numpy()
    return df


In [6]:
# Column names for the snapshot file. Because names= is passed without an
# explicit header=, pandas treats the first line of the file as data.
header_list = ['datetime', 'road_number', 'km', 'direction', 'all_units', 'inflow_units',
               'outflow_unit', 'samecell_units', 'avg_speed', 'max_speed', 'avg_traveltime', 'max_traveltime']
df_test = pd.read_csv("../dataset/current_celldata.csv",
                      names=header_list, parse_dates=["datetime"])

# Keep only roads 1, 2 and 7. The explicit .copy() makes the filtered frame
# independent of the frame it was sliced from, so the in-place column
# assignment done inside map_traffic_with_latlon does not trigger a
# SettingWithCopyWarning or write into a temporary slice.
df_test = df_test[df_test['road_number'].isin([1, 2, 7])].copy()
df_test = map_traffic_with_latlon(df_test)
df_test


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,lat,lon
0,2022-03-01 17:01:46,1,19,in,0,0,1,0,0,0,0,0,13.879176,100.598832
1,2022-03-01 17:01:46,1,25,in,0,0,1,0,0,0,0,0,13.926152,100.626455
2,2022-03-01 17:01:46,1,25,out,5,4,1,1,24,64,67,90,13.926152,100.626455
3,2022-03-01 17:01:46,1,26,in,4,3,1,1,50,63,69,90,13.935066,100.625095
4,2022-03-01 17:01:46,1,26,out,4,3,5,1,50,63,69,90,13.935066,100.625095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4088,2022-03-01 17:01:46,7,121,in,0,0,1,0,0,0,0,0,12.951311,100.947536
4089,2022-03-01 17:01:46,7,122,in,2,2,0,0,31,32,58,59,12.950342,100.938541
4090,2022-03-01 17:01:46,7,122,out,1,1,0,0,89,89,29,29,12.950342,100.938541
4091,2022-03-01 17:01:46,7,123,in,0,0,1,0,0,0,0,0,12.947607,100.929908


In [7]:
# Columns selected from the traffic frame as inputs for the model.
num_cols = [
    'all_units',
    'inflow_units',
    'avg_speed',
    'max_speed',
    'avg_traveltime',
    'max_traveltime',
]
num_cols


['all_units',
 'inflow_units',
 'avg_speed',
 'max_speed',
 'avg_traveltime',
 'max_traveltime']

In [8]:
# Alternative feature list without 'avg_traveltime' (currently disabled):
# num_cols = ['all_units', 'inflow_units',
#             'avg_speed', 'max_speed',
#             'max_traveltime']
# num_cols


In [9]:
# Model input: every row of df, restricted to the columns listed in num_cols.
x_predict = df.loc[:, num_cols]
x_predict

Unnamed: 0,all_units,inflow_units,avg_speed,max_speed,avg_traveltime,max_traveltime
0,1,0,39,39,0,0
1,2,1,60,61,66,66
2,0,0,0,0,0,0
3,1,1,91,91,41,41
4,2,1,68,74,52,52
...,...,...,...,...,...,...
16891732,0,0,0,0,0,0
16891733,1,1,56,56,63,63
16891734,0,0,0,0,0,0
16891735,0,0,0,0,0,0


In [10]:
# Run the loaded model over the selected columns; yields one label per row.
y_predict = dtree_model.predict(X=x_predict)
y_predict

array([4, 1, 0, ..., 0, 0, 1], dtype=int64)

In [11]:
# Store the model's predicted label for each row in a new "cluster" column.
# NOTE: this modifies df in place; later cells filter on df["cluster"], so
# they only work after this cell has been run.
df["cluster"] = y_predict
df

Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,cluster
0,2020-05-01 00:00:05,2,1,in,1,0,0,1,39,39,0,0,4
1,2020-05-01 00:00:05,2,6,in,2,1,0,1,60,61,66,66,1
2,2020-05-01 00:00:05,2,7,in,0,0,1,0,0,0,0,0,0
3,2020-05-01 00:00:05,2,8,in,1,1,0,0,91,91,41,41,2
4,2020-05-01 00:00:05,2,9,in,2,1,0,1,68,74,52,52,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16891732,2020-05-31 23:59:04,2,461,in,0,0,1,0,0,0,0,0,0
16891733,2020-05-31 23:59:04,2,466,in,1,1,0,0,56,56,63,63,1
16891734,2020-05-31 23:59:04,2,467,in,0,0,1,0,0,0,0,0,0
16891735,2020-05-31 23:59:04,2,493,in,0,0,1,0,0,0,0,0,0


In [12]:
# Column-wise maximum of every field over the rows labelled as cluster 4.
is_cluster_4 = df["cluster"].eq(4)
df.loc[is_cluster_4].max(axis="index")

datetime          2020-05-31 23:59:04
road_number                         2
km                                508
direction                          in
all_units                          71
inflow_units                       24
outflow_unit                       28
samecell_units                     51
avg_speed                          47
max_speed                         144
avg_traveltime                    964
max_traveltime                    997
cluster                             4
dtype: object

In [22]:
# Rows labelled cluster 0 or 4 whose timestamp lies strictly between
# 2020-05-31 17:53 and 2020-05-31 18:10 (both bounds excluded).
in_target_cluster = df["cluster"].isin([0, 4])
window_start = pd.Timestamp(2020, 5, 31, 17, 53)
window_end = pd.Timestamp(2020, 5, 31, 18, 10)
in_window = (df["datetime"] > window_start) & (df["datetime"] < window_end)

df_acc = df[in_target_cluster & in_window]
df_acc


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,cluster
16757081,2020-05-31 17:53:22,2,1,in,0,0,2,0,0,0,0,0,0
16757083,2020-05-31 17:53:22,2,4,in,0,0,1,0,0,0,0,0,0
16757088,2020-05-31 17:53:22,2,10,in,6,2,3,4,26,44,60,61,4
16757094,2020-05-31 17:53:22,2,16,in,4,2,5,2,40,61,66,70,4
16757101,2020-05-31 17:53:22,2,23,in,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16763989,2020-05-31 18:09:02,2,461,in,0,0,1,0,0,0,0,0,0
16763992,2020-05-31 18:09:02,2,464,in,2,1,1,1,31,62,60,60,4
16763999,2020-05-31 18:09:02,2,479,in,0,0,1,0,0,0,0,0,0
16764004,2020-05-31 18:09:02,2,497,in,2,2,0,0,42,44,60,61,4


In [23]:
# df_acc[(df_acc["cluster"]==1) & (df_acc["all_units"]>0) & (df_acc["km"]==1)]
# Rows of the selected window at km 48 with a non-negative unit count.
has_unit_count = df_acc["all_units"].ge(0)
at_km_48 = df_acc["km"].eq(48)
df_acc.loc[has_unit_count & at_km_48]

Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,cluster
16759175,2020-05-31 17:58:48,2,48,in,0,0,3,0,0,0,0,0,0
