In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import export_text
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


In [3]:
# Road / direction selector; also reused further down when naming the saved model file.
road = 7
direction = "out"

# Build the dataset path from the selector above, then load it with
# the `datetime` column parsed into timestamps.
cluster_csv = f"../dataset/cluster/2020-02-road{road}-{direction}-5cluster.csv"
df = pd.read_csv(cluster_csv, parse_dates=["datetime"])
df

Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,inflow_units_N,outflow_units_N,avg_speed_N,cluster
0,2020-02-01 00:00:34,7,0,out,10,4,7,6,60,105,0,0,0.076923,0.159091,0.7500,0
1,2020-02-01 00:00:34,7,1,out,5,5,3,0,69,84,108,180,0.096154,0.068182,0.8625,2
2,2020-02-01 00:00:34,7,2,out,3,3,2,0,79,98,59,60,0.057692,0.045455,0.9875,2
3,2020-02-01 00:00:34,7,4,out,0,0,1,0,0,0,0,0,0.000000,0.022727,0.0000,1
4,2020-02-01 00:00:34,7,6,out,2,2,1,0,68,75,64,86,0.038462,0.022727,0.8500,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519315,2020-02-29 23:59:14,7,108,out,2,2,2,0,38,52,68,76,0.038462,0.045455,0.4750,4
4519316,2020-02-29 23:59:14,7,109,out,1,1,3,0,36,36,60,60,0.019231,0.068182,0.4500,4
4519317,2020-02-29 23:59:14,7,111,out,1,1,0,0,64,64,60,60,0.019231,0.000000,0.8000,2
4519318,2020-02-29 23:59:14,7,113,out,0,0,1,0,0,0,0,0,0.000000,0.022727,0.0000,1


In [4]:
# Columns selected as model inputs; the same list is used to slice both the
# training frame and the frame that is later passed to predict().
num_cols = [
    'all_units',
    'inflow_units',
    'avg_speed',
    'max_speed',
    'max_traveltime',
]
num_cols

['all_units', 'inflow_units', 'avg_speed', 'max_speed', 'max_traveltime']

In [5]:
# Feature matrix (the selected input columns) and target vector (the `cluster` column).
X = df[num_cols]
y = df["cluster"]

In [6]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Hyper-parameter search over tree depths 1..9 and both split criteria.
param_grid = {"max_depth": np.arange(1, 10),
              "criterion": ["entropy", "gini"]}
# Seed the base estimator so repeated searches score candidates identically.
# NOTE(review): the next cell rebinds `tree` to a default DecisionTreeClassifier,
# so the estimator selected here is not the one that gets evaluated and saved
# further down -- confirm whether that is intended.
tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid)
tree.fit(x_train, y_train)
tree.best_estimator_


In [7]:
# Fit a decision tree with default hyper-parameters on the training split.
# random_state is pinned so that re-running the notebook rebuilds the same tree
# (and therefore the same metrics and saved model) instead of depending on the
# estimator's internal random tie-breaking.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

DecisionTreeClassifier()

In [8]:
# Predicted cluster label for every row of the held-out split.
y_pred = tree.predict(
    x_test
)

In [9]:
# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.91      0.95      0.93    529410
           1       1.00      1.00      1.00    158386
           2       0.95      0.97      0.96    495543
           3       0.70      0.59      0.64    172501
           4       0.96      0.95      0.96    135536

    accuracy                           0.92   1491376
   macro avg       0.91      0.89      0.90   1491376
weighted avg       0.92      0.92      0.92   1491376



In [10]:
from sklearn.metrics import accuracy_score,f1_score

In [11]:
# Overall accuracy on the held-out split.
# (f1_score is imported but not called here; for a multiclass target it needs
# an explicit `average=` argument, e.g. average="macro".)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9194656478312645


In [12]:
import joblib

In [13]:
# Persist the fitted tree; the file name encodes the road and direction selected above.
model_filename = f"DecisionTreeModelRoad{road}{direction}.joblib"
joblib.dump(tree, model_filename)


['DecisionTreeModelRoad7out.joblib']

In [14]:
# Read the persisted model back from disk; the prediction cells below use this
# reloaded object rather than the in-memory `tree`.
model_test = joblib.load(
    f'DecisionTreeModelRoad{road}{direction}.joblib'
)


In [15]:
# Coordinate lookup table; map_traffic_with_latlon reads its `rd`, `km`,
# `lat` and `lon` columns.
df_km127 = pd.read_csv(
    "../dataset/latlon_km127.csv"
)

In [16]:
def map_traffic_with_latlon(df, latlon=None):
    """Add ``lat`` and ``lon`` columns to ``df`` by looking up (road, km) pairs.

    Every row's ``road_number`` / ``km`` pair is matched against the ``rd`` /
    ``km`` columns of the lookup table. Rows with a match receive that entry's
    ``lat`` / ``lon``; rows without a match receive ``0`` in both columns.
    When the lookup table lists the same (rd, km) pair more than once, the
    first occurrence is used.

    The lookup is done with a single left merge instead of re-filtering the
    whole lookup table several times for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``road_number`` and ``km`` columns. It is modified in
        place: ``lat`` and ``lon`` are added (or overwritten).
    latlon : pandas.DataFrame, optional
        Lookup table with ``rd``, ``km``, ``lat`` and ``lon`` columns.
        Defaults to the notebook-level ``df_km127``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if latlon is None:
        latlon = df_km127

    # One row per (rd, km); keep="first" matches picking the first hit per key.
    lookup = (
        latlon[['rd', 'km', 'lat', 'lon']]
        .drop_duplicates(subset=['rd', 'km'], keep='first')
        .rename(columns={'rd': 'road_number'})
    )

    # A left merge keeps the row order of `df`; the indicator column tells
    # matched rows apart from unmatched ones even if a matched lat/lon is NaN.
    matched = df[['road_number', 'km']].merge(
        lookup, on=['road_number', 'km'], how='left', indicator=True)
    found = (matched['_merge'] == 'both').to_numpy()

    for col in ('lat', 'lon'):
        # Assign positionally (to_numpy) because `matched` carries a fresh
        # RangeIndex while `df` may have an arbitrary index.
        df[col] = matched[col].where(found, 0).to_numpy()
    return df


In [17]:
# Column names supplied to read_csv via `names=`, so the first line of the
# file is read as data rather than as a header row.
header_list = ['datetime', 'road_number', 'km', 'direction', 'all_units', 'inflow_units',
               'outflow_unit', 'samecell_units', 'avg_speed', 'max_speed', 'avg_traveltime', 'max_traveltime']
df_current = pd.read_csv("../dataset/current_celldata.csv",
                         names=header_list, parse_dates=["datetime"])
# Keep only roads 1, 2 and 7. The explicit .copy() makes the filtered frame
# independent of the raw read, so the column assignments performed inside
# map_traffic_with_latlon do not write into a slice (SettingWithCopyWarning).
df_current = df_current[df_current['road_number'].isin([1, 2, 7])].copy()
df_current = map_traffic_with_latlon(df_current)
df_current


Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,lat,lon
0,2022-03-01 17:01:46,1,19,in,0,0,1,0,0,0,0,0,13.879176,100.598832
1,2022-03-01 17:01:46,1,25,in,0,0,1,0,0,0,0,0,13.926152,100.626455
2,2022-03-01 17:01:46,1,25,out,5,4,1,1,24,64,67,90,13.926152,100.626455
3,2022-03-01 17:01:46,1,26,in,4,3,1,1,50,63,69,90,13.935066,100.625095
4,2022-03-01 17:01:46,1,26,out,4,3,5,1,50,63,69,90,13.935066,100.625095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4088,2022-03-01 17:01:46,7,121,in,0,0,1,0,0,0,0,0,12.951311,100.947536
4089,2022-03-01 17:01:46,7,122,in,2,2,0,0,31,32,58,59,12.950342,100.938541
4090,2022-03-01 17:01:46,7,122,out,1,1,0,0,89,89,29,29,12.950342,100.938541
4091,2022-03-01 17:01:46,7,123,in,0,0,1,0,0,0,0,0,12.947607,100.929908


In [18]:
# Slice the current snapshot down to the same input columns the model was fitted on.
x_predict = df_current[
    num_cols
]
x_predict

Unnamed: 0,all_units,inflow_units,avg_speed,max_speed,max_traveltime
0,0,0,0,0,0
1,0,0,0,0,0
2,5,4,24,64,90
3,4,3,50,63,90
4,4,3,50,63,90
...,...,...,...,...,...
4088,0,0,0,0,0
4089,2,2,31,32,59
4090,1,1,89,89,29
4091,0,0,0,0,0


In [19]:
# Cluster label predicted by the reloaded model for every snapshot row.
# NOTE(review): the model was fitted on the road-7 / "out" dataset, while the
# snapshot also contains roads 1 and 2 and direction "in" -- confirm that
# applying it to those rows is intended.
y_predict = model_test.predict(
    x_predict
)
y_predict


array([1, 1, 4, ..., 2, 1, 1], dtype=int64)

In [20]:
# Store the predicted labels on the snapshot frame: adds (or, on a re-run,
# overwrites) the `cluster` column of df_current in place, then displays it.
df_current["cluster"] = y_predict
df_current

Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,lat,lon,cluster
0,2022-03-01 17:01:46,1,19,in,0,0,1,0,0,0,0,0,13.879176,100.598832,1
1,2022-03-01 17:01:46,1,25,in,0,0,1,0,0,0,0,0,13.926152,100.626455,1
2,2022-03-01 17:01:46,1,25,out,5,4,1,1,24,64,67,90,13.926152,100.626455,4
3,2022-03-01 17:01:46,1,26,in,4,3,1,1,50,63,69,90,13.935066,100.625095,0
4,2022-03-01 17:01:46,1,26,out,4,3,5,1,50,63,69,90,13.935066,100.625095,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4088,2022-03-01 17:01:46,7,121,in,0,0,1,0,0,0,0,0,12.951311,100.947536,1
4089,2022-03-01 17:01:46,7,122,in,2,2,0,0,31,32,58,59,12.950342,100.938541,4
4090,2022-03-01 17:01:46,7,122,out,1,1,0,0,89,89,29,29,12.950342,100.938541,2
4091,2022-03-01 17:01:46,7,123,in,0,0,1,0,0,0,0,0,12.947607,100.929908,1


In [21]:
# Column-wise minimum over the snapshot rows that were assigned to cluster 3.
cluster3_rows = df_current.loc[df_current["cluster"] == 3]
cluster3_rows.min(axis=0)


datetime          2022-03-01 17:01:46
road_number                         1
km                                  8
direction                          in
all_units                           4
inflow_units                        4
outflow_unit                        0
samecell_units                      0
avg_speed                          42
max_speed                          62
avg_traveltime                     38
max_traveltime                     57
lat                         12.979884
lon                         99.131279
cluster                             3
dtype: object