In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt


In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import export_text
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV


In [8]:
# Load the clustered cell-level traffic records; the `datetime` column is
# parsed into timestamps at read time.
cluster_csv = "../dataset/cluster/2020-02-road2-in-5cluster.csv"
df = pd.read_csv(cluster_csv, parse_dates=["datetime"])
df

Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,inflow_units_N,outflow_units_N,avg_speed_N,cluster
0,2020-02-01 00:00:34,2,1,in,1,1,2,0,54,54,61,61,0.04,0.076923,0.6750,1
1,2020-02-01 00:00:34,2,2,in,0,0,1,0,0,0,0,0,0.00,0.038462,0.0000,0
2,2020-02-01 00:00:34,2,3,in,4,2,0,2,64,87,60,60,0.08,0.000000,0.8000,2
3,2020-02-01 00:00:34,2,4,in,2,2,2,0,80,97,30,30,0.08,0.076923,1.0000,2
4,2020-02-01 00:00:34,2,5,in,2,2,0,0,50,52,75,90,0.08,0.000000,0.6250,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16496944,2020-02-29 23:59:14,2,468,in,1,1,0,0,44,44,120,120,0.04,0.000000,0.5500,4
16496945,2020-02-29 23:59:14,2,488,in,0,0,1,0,0,0,0,0,0.00,0.038462,0.0000,0
16496946,2020-02-29 23:59:14,2,489,in,1,1,1,0,56,56,59,59,0.04,0.038462,0.7000,1
16496947,2020-02-29 23:59:14,2,490,in,2,2,1,0,64,78,54,60,0.08,0.038462,0.8000,2


In [9]:
# Numeric columns used as model features.
num_cols = [
    'all_units',
    'inflow_units',
    'avg_speed',
    'max_speed',
    'max_traveltime',
]
num_cols

['all_units', 'inflow_units', 'avg_speed', 'max_speed', 'max_traveltime']

In [10]:
# Feature matrix (the columns listed in num_cols) and the cluster label to predict.
X, y = df[num_cols], df['cluster']

In [11]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [12]:
# Exhaustive search over tree depth (1..9) and split criterion, scored with
# GridSearchCV's default cross-validation settings.
# The search object has its own name (`grid_search`) so that it stays
# reachable after `tree` is bound to the final classifier in a later cell.
param_grid = {"max_depth": np.arange(1, 10),
              "criterion": ["entropy", "gini"]}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid)
grid_search.fit(x_train, y_train)
# Display the best estimator found by the search.
grid_search.best_estimator_


DecisionTreeClassifier(criterion='entropy', max_depth=9)

In [13]:
# Fit a decision tree with scikit-learn's default hyperparameters on the
# training split; `fit` returns the estimator itself.
# NOTE(review): the best parameters reported by the grid-search cell above are
# not applied here — confirm that training with defaults is intended.
tree = DecisionTreeClassifier().fit(x_train, y_train)
tree

DecisionTreeClassifier()

In [14]:
# Predict a cluster label for every row of the held-out split.
y_pred = tree.predict(X=x_test)

In [15]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1141630
           1       0.88      0.95      0.91   1732913
           2       0.93      0.97      0.95   1258683
           3       0.75      0.56      0.64    695808
           4       0.98      0.97      0.98    614960

    accuracy                           0.92   5443994
   macro avg       0.91      0.89      0.90   5443994
weighted avg       0.91      0.92      0.91   5443994



In [17]:
from sklearn.metrics import accuracy_score,f1_score

In [18]:
# Fraction of held-out rows whose predicted cluster equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
# f1_score is intentionally not reported here: on this multiclass target it
# needs an explicit `average=` argument.

0.9172673592219242


In [19]:
import joblib

In [20]:
# Persist the fitted classifier to disk; joblib.dump returns the list of
# file names it wrote.
joblib.dump(value=tree, filename="DecisionTreeModel.joblib")


['DecisionTreeModel.joblib']

In [None]:
# Reload the classifier written by the joblib.dump cell above so the
# persisted artifact (not the in-memory object) is what gets used below.
# The file name must match the one passed to joblib.dump.
# joblib files are pickle-based: only load files from a trusted source.
model_test = joblib.load("DecisionTreeModel.joblib")


In [None]:
# Coordinate lookup table; map_traffic_with_latlon reads its 'rd', 'km',
# 'lat' and 'lon' columns.
latlon_csv = "../dataset/latlon_km127.csv"
df_km127 = pd.read_csv(latlon_csv)

In [None]:
def map_traffic_with_latlon(df, latlon=None):
    """Add ``lat`` and ``lon`` columns to ``df`` from a coordinate lookup table.

    Each row of ``df`` is matched on ``(road_number, km)`` against the
    ``(rd, km)`` columns of the lookup table. When several lookup rows share
    the same key, the first one is used. Rows with no match get ``0`` for both
    ``lat`` and ``lon``.

    The lookup is done with a single left merge instead of filtering the whole
    lookup table several times per row, so the cost no longer grows with
    ``len(df) * len(latlon)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``road_number`` and ``km`` columns. It is modified in place
        (the ``lat`` / ``lon`` columns are added or overwritten).
    latlon : pandas.DataFrame, optional
        Lookup table with ``rd``, ``km``, ``lat`` and ``lon`` columns. Defaults
        to the notebook-level ``df_km127`` frame, resolved at call time.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``lat`` and ``lon`` filled in.
    """
    if latlon is None:
        # Backward-compatible default: the notebook-level lookup table.
        latlon = df_km127

    # One coordinate row per (rd, km): keep the first occurrence, which is the
    # row the previous `.values[0]` lookup would have picked.
    lookup = (
        latlon[['rd', 'km', 'lat', 'lon']]
        .drop_duplicates(subset=['rd', 'km'], keep='first')
        .rename(columns={'rd': 'road_number'})
    )

    # A left merge keeps one output row per input row, in the input order.
    # The indicator column distinguishes "no matching key" from "matched a row
    # whose coordinate is missing".
    merged = df[['road_number', 'km']].merge(
        lookup, how='left', on=['road_number', 'km'], indicator=True)
    found = merged['_merge'] == 'both'

    # Assign by position (to_numpy) because `merged` has a fresh RangeIndex
    # while `df` may carry an arbitrary index (e.g. after filtering).
    df['lat'] = merged['lat'].where(found, 0).to_numpy()
    df['lon'] = merged['lon'].where(found, 0).to_numpy()
    return df


In [None]:
# Column names supplied explicitly to read_csv via `names=`.
header_list = ['datetime', 'road_number', 'km', 'direction', 'all_units', 'inflow_units',
               'outflow_unit', 'samecell_units', 'avg_speed', 'max_speed', 'avg_traveltime', 'max_traveltime']
df_current = pd.read_csv("../dataset/current_celldata.csv",
                         names=header_list, parse_dates=["datetime"])
# Keep only roads 1, 2 and 7. `.copy()` makes the subset an independent frame,
# so the columns added to it afterwards do not raise SettingWithCopyWarning.
df_current = df_current[df_current['road_number'].isin([1, 2, 7])].copy()
# Attach lat/lon coordinates for each (road_number, km) pair.
df_current = map_traffic_with_latlon(df_current)
df_current


In [None]:
# Select the same feature columns (num_cols) that were used to build the
# training matrix.
x_predict = df_current.loc[:, num_cols]
x_predict

In [None]:
# Predict one label per row of x_predict with the model loaded from disk.
# NOTE(review): assumes model_test was trained on the num_cols features in the
# same column order — confirm against the file that was loaded.
y_predict = model_test.predict(x_predict)
y_predict


In [None]:
# Attach the predicted cluster as a new column. `assign` returns a new frame
# rather than writing into the existing one, which avoids
# SettingWithCopyWarning on the filtered frame and leaves the object shown by
# earlier cells untouched.
df_current = df_current.assign(cluster=y_predict)
df_current

In [None]:
# Show only the rows whose predicted cluster is 0.
in_cluster_0 = df_current["cluster"].eq(0)
df_current.loc[in_cluster_0]
