In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import export_text
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


In [None]:
# Road number and travel direction modelled by this notebook; both values are
# interpolated into the dataset path here and into the saved model's filename.
road = 2
direction = "out"

cluster_csv = f"../dataset/cluster/2020-02-road{road}-{direction}-5cluster.csv"
df = pd.read_csv(cluster_csv, parse_dates=["datetime"])
df


In [None]:
# Column-wise maximum of every field, as a quick look at the upper range.
df.max(axis="index")

In [None]:
# Columns to include in the correlation matrix. `DataFrame.filter(items=...)`
# keeps only the listed names that exist in `df`; absent names are skipped
# without an error.
# NOTE(review): the training target further down is 'cluster', not 'label' —
# confirm that 'label' is the column intended here.
corr_fields = [
    'all_units',
    'inflow_units',
    'outflow_unit',
    'samecell_units',
    'avg_speed',
    'max_speed',
    'avg_traveltime',
    'max_traveltime',
    'label',
]
df_corr = df.filter(items=corr_fields, axis="columns")
corr = df_corr.corr()

In [None]:
# Correlation coefficients span [-1, 1]. A colour scale that starts at 0 would
# paint every negative correlation with the same colour as "no correlation",
# so use the full symmetric range with a diverging palette centred on zero:
# both the sign and the strength of each relationship stay visible.
sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap="coolwarm", annot=True)


In [None]:
# Feature columns used to train the classifier; the same list is reused later
# to select the inputs when scoring the current traffic data.
num_cols = [
    'all_units',
    'inflow_units',
    'avg_speed',
    'max_speed',
    'avg_traveltime',
    'max_traveltime',
]
num_cols

In [None]:
# Feature matrix (the columns listed in num_cols) and the target column.
X = df.loc[:, num_cols]
y = df.loc[:, 'cluster']

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Cross-validated search over tree depth (1..9) and split criterion.
# The base estimator is seeded: a decision tree permutes the candidate features
# at each split, so without a fixed random_state two runs can pick different
# trees (and therefore a different "best" parameter set).
param_grid = {"max_depth": np.arange(1, 10),
              "criterion": ["entropy", "gini"]}
tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid)
tree.fit(x_train, y_train)
# Displayed as the cell output: the estimator refitted with the best parameters.
tree.best_estimator_


In [None]:
# This rebinds `tree`: the GridSearchCV object from the previous cell is
# replaced by a fresh tree with default hyper-parameters, and it is this tree
# that is evaluated and saved below.
# NOTE(review): if the tuned model was meant to be used, take the search's
# `best_estimator_` before rebinding `tree` — confirm intent.
# random_state is fixed so repeated runs fit (and save) the same tree.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

In [None]:
# Predicted cluster for every row of the held-out split.
y_pred = tree.predict(X=x_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split, printed as text.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
from sklearn.metrics import accuracy_score,f1_score

In [None]:
# Overall accuracy on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)
# NOTE(review): the F1 call below is left disabled; with more than two cluster
# labels it would presumably need an explicit `average=` — confirm before enabling.
# print(f1_score(y_test, y_pred))

In [None]:
import joblib

In [None]:
# Persist the fitted tree; the filename encodes the road number and direction.
model_path = f"DecisionTreeModelRoad{road}{direction}.joblib"
joblib.dump(tree, model_path)


In [None]:
# Reload the model from the file written above; the predictions on the current
# traffic data further down use this reloaded copy.
saved_model_path = f'DecisionTreeModelRoad{road}{direction}.joblib'
model_test = joblib.load(saved_model_path)


In [None]:
# Coordinate lookup table; map_traffic_with_latlon reads its
# 'rd', 'km', 'lat' and 'lon' columns.
latlon_csv = "../dataset/latlon_km127.csv"
df_km127 = pd.read_csv(latlon_csv)

In [None]:
def map_traffic_with_latlon(df, latlon=None):
    """Add 'lat' and 'lon' columns to a traffic frame from a (road, km) lookup.

    Each row of ``df`` is matched on ``road_number == rd`` and ``km == km``
    against the lookup table. When several lookup rows share the same
    (rd, km) pair, the first one is used. Rows without a match get 0 for both
    coordinates.

    The lookup is done with a single left merge instead of re-filtering the
    whole lookup table several times for every traffic row, and it also works
    on an empty frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Traffic rows; must contain 'road_number' and 'km'. The frame is
        modified in place ('lat' and 'lon' are added or overwritten).
    latlon : pandas.DataFrame, optional
        Lookup table with columns 'rd', 'km', 'lat' and 'lon'. Defaults to the
        notebook-level ``df_km127``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the two coordinate columns set.
    """
    if latlon is None:
        latlon = df_km127

    # One row per (rd, km): keep the first occurrence so duplicates in the
    # lookup resolve to the first listed coordinate pair.
    lookup = latlon[['rd', 'km', 'lat', 'lon']].drop_duplicates(
        subset=['rd', 'km'], keep='first')

    # how='left' keeps one output row per traffic row, in the original order;
    # the indicator column tells matched rows apart from unmatched ones even
    # when a matched coordinate is itself missing.
    keys = df[['road_number', 'km']].rename(columns={'road_number': 'rd'})
    merged = keys.merge(lookup, on=['rd', 'km'], how='left', indicator=True)

    # merge() pairs missing keys with each other, whereas an equality test
    # never matches a missing value, so rows with a missing key count as
    # unmatched.
    found = (merged['_merge'] == 'both').to_numpy() \
        & keys.notna().all(axis=1).to_numpy()

    # Assign positionally (plain arrays) so the caller's index is irrelevant.
    df['lat'] = np.where(found, merged['lat'].to_numpy(), 0)
    df['lon'] = np.where(found, merged['lon'].to_numpy(), 0)
    return df


In [None]:
# Column names are supplied through `names=` for this CSV.
header_list = ['datetime', 'road_number', 'km', 'direction', 'all_units', 'inflow_units',
               'outflow_unit', 'samecell_units', 'avg_speed', 'max_speed', 'avg_traveltime', 'max_traveltime']
df_current = pd.read_csv("../dataset/current_celldata.csv",
                         names=header_list, parse_dates=["datetime"])
# Keep roads 1, 2 and 7 only. `.copy()` makes the filtered result an
# independent frame, so the column assignments that follow (lat/lon here,
# 'cluster' later) do not write into a slice of the raw frame
# (SettingWithCopyWarning).
df_current = df_current[df_current['road_number'].isin([1, 2, 7])].copy()
df_current = map_traffic_with_latlon(df_current)
df_current


In [None]:
# Select the feature columns listed in num_cols from the current traffic data.
x_predict = df_current.loc[:, num_cols]
x_predict

In [None]:
# Cluster predicted by the reloaded model for each current traffic row.
y_predict = model_test.predict(X=x_predict)
y_predict


In [None]:
# Store the predictions next to the rows they were computed from. This adds
# (or overwrites) the 'cluster' column on df_current in place.
df_current["cluster"] = y_predict
df_current

In [None]:
# Column-wise minimum over the rows whose predicted cluster is 3.
in_cluster_3 = df_current["cluster"] == 3
df_current[in_cluster_3].min(axis="index")
