In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import export_text
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV


In [4]:
df = pd.read_csv(
    "../dataset/cluster/2020-02-road2-in-5cluster.csv", parse_dates=["datetime"])
df

Unnamed: 0,datetime,road_number,km,direction,all_units,inflow_units,outflow_unit,samecell_units,avg_speed,max_speed,avg_traveltime,max_traveltime,inflow_units_N,outflow_units_N,avg_speed_N,cluster
0,2020-02-01 00:00:34,2,1,in,1,1,2,0,54,54,61,61,0.04,0.076923,0.6750,1
1,2020-02-01 00:00:34,2,2,in,0,0,1,0,0,0,0,0,0.00,0.038462,0.0000,0
2,2020-02-01 00:00:34,2,3,in,4,2,0,2,64,87,60,60,0.08,0.000000,0.8000,2
3,2020-02-01 00:00:34,2,4,in,2,2,2,0,80,97,30,30,0.08,0.076923,1.0000,2
4,2020-02-01 00:00:34,2,5,in,2,2,0,0,50,52,75,90,0.08,0.000000,0.6250,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16496944,2020-02-29 23:59:14,2,468,in,1,1,0,0,44,44,120,120,0.04,0.000000,0.5500,4
16496945,2020-02-29 23:59:14,2,488,in,0,0,1,0,0,0,0,0,0.00,0.038462,0.0000,0
16496946,2020-02-29 23:59:14,2,489,in,1,1,1,0,56,56,59,59,0.04,0.038462,0.7000,1
16496947,2020-02-29 23:59:14,2,490,in,2,2,1,0,64,78,54,60,0.08,0.038462,0.8000,2


In [6]:
num_cols = ['all_units', 'inflow_units',
            'avg_speed', 'max_speed',
            'max_traveltime']
num_cols

['all_units', 'inflow_units', 'avg_speed', 'max_speed', 'max_traveltime']

In [7]:
X=df[num_cols]
y=df['cluster']

In [41]:
X

Unnamed: 0,all_units,inflow_units,avg_speed,max_speed,max_traveltime
0,1,1,54,54,61
1,0,0,0,0,0
2,4,2,64,87,60
3,2,2,80,97,30
4,2,2,50,52,90
...,...,...,...,...,...
16496944,1,1,44,44,120
16496945,0,0,0,0,0
16496946,1,1,56,56,59
16496947,2,2,64,78,60


In [39]:
a = [[1,54,54,61],[0,0,0,0,0],[4,2,64,87,60]]

0           1
1           0
2           2
3           2
4           1
           ..
16496944    4
16496945    0
16496946    1
16496947    2
16496948    1
Name: cluster, Length: 16496949, dtype: int64

In [40]:
y

0           1
1           0
2           2
3           2
4           1
           ..
16496944    4
16496945    0
16496946    1
16496947    2
16496948    1
Name: cluster, Length: 16496949, dtype: int64

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
param_grid = {"max_depth": np.arange(1, 10),
              "criterion": ["entropy", "gini"]}
tree = GridSearchCV(DecisionTreeClassifier(), param_grid)
tree.fit(x_train, y_train)
tree.best_estimator_


KeyboardInterrupt: 

In [10]:
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

DecisionTreeClassifier()

In [11]:
y_pred = tree.predict(x_test)

In [12]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1141630
           1       0.88      0.95      0.91   1732913
           2       0.93      0.97      0.95   1258683
           3       0.75      0.56      0.64    695808
           4       0.98      0.97      0.98    614960

    accuracy                           0.92   5443994
   macro avg       0.91      0.89      0.90   5443994
weighted avg       0.91      0.92      0.91   5443994



In [24]:
y_test

3315434     2
5115704     0
16399233    0
1299436     1
13149690    1
           ..
1150818     2
6237395     2
9208428     2
5248273     4
8646700     4
Name: cluster, Length: 5443994, dtype: int64

In [13]:
from sklearn.metrics import accuracy_score,f1_score

In [14]:
print(accuracy_score(y_test, y_pred))
# print(f1_score(y_test, y_pred))

0.9172785642305998


In [15]:
import joblib

In [16]:
joblib.dump(tree, "DecisionTreeModel.joblib")


['DecisionTreeModel.joblib']

In [18]:
model_test = joblib.load('DecisionTreeModel.joblib')


In [19]:
df_km127 = pd.read_csv("../dataset/latlon_km127.csv")

In [25]:
df_km127

Unnamed: 0.1,Unnamed: 0,rd,km,lat,lon
0,0,1,815,18.976891,99.911708
1,1,1,816,18.983738,99.906080
2,2,1,817,18.992156,99.903222
3,3,1,818,19.000103,99.899271
4,4,1,819,19.006324,99.901429
...,...,...,...,...,...
2096,3957,7,99996,13.752260,100.591030
2097,3958,7,99997,13.755100,100.582320
2098,3959,7,99998,13.755510,100.573050
2099,3960,7,99999,13.756100,100.563820


In [20]:
def map_traffic_with_latlon(df):
    df['lat'] = df.apply(lambda row: df_km127[(df_km127['rd'] == row['road_number']) & (df_km127['km'] == row['km'])]['lat'].values[0]
                         if len(df_km127[(df_km127['rd'] == row['road_number']) & (df_km127['km'] == row['km'])]['lat'].values) > 0 else 0, axis=1)
    df['lon'] = df.apply(lambda row: df_km127[(df_km127['rd'] == row['road_number']) & (df_km127['km'] == row['km'])]['lon'].values[0]
                         if len(df_km127[(df_km127['rd'] == row['road_number']) & (df_km127['km'] == row['km'])]['lon'].values) > 0 else 0, axis=1)
    return df


In [None]:
header_list = ['datetime', 'road_number', 'km', 'direction', 'all_units', 'inflow_units',
               'outflow_unit', 'samecell_units', 'avg_speed', 'max_speed', 'avg_traveltime', 'max_traveltime']
df_current = pd.read_csv("../dataset/current_celldata.csv",
                         names=header_list, parse_dates=["datetime"])
df_current = df_current[(df_current['road_number'] == 1) | (
    df_current['road_number'] == 2) | (df_current['road_number'] == 7)]
df_current = map_traffic_with_latlon(df_current)
df_current


In [None]:
x_predict = df_current[num_cols]
x_predict

In [26]:
y_predict = model_test.predict(x_predict)
y_predict


NameError: name 'x_predict' is not defined

In [None]:
df_current["cluster"] = y_predict
df_current

In [None]:
df_current[df_current["cluster"] == 0]
