In [352]:
#Import libraries
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score as ss
import pandas as pd 
import numpy as np
import plotly.express as px
import itertools as it 

In [353]:
#Raw street records, read from the Excel workbook
original_df = pd.read_excel('Street Data_BaDinh.xlsx')
#De-duplicate on the coordinate pair, keeping the first row seen for each pair
new_df = original_df.drop_duplicates(subset=['Longitude', 'Latitude'], keep='first')
new_df


Unnamed: 0,Index,District,Ward,Street Name 1,Street name 2,Street Name 3,Street Name 4,Longitude,Latitude,Traffic Level
0,10101,Ba Đình,Phúc Xá,An Xá,Nghĩa Dũng,,,21.046224,105.849289,1
1,10102,Ba Đình,Phúc Xá,An Xá,Tân Mỹ,,,21.042412,105.852127,1
2,10103,Ba Đình,Phúc Xá,An Xá,Phúc Xá,,,21.046067,105.849022,1
3,11901,Ba Đình,Điện Biên,Bà Huyện Thanh Quan,Chùa Một Cột,,,21.034974,105.835874,1
4,11902,Ba Đình,Điện Biên,Bà Huyện Thanh Quan,Lê Hồng Phong,,,21.033849,105.835607,2
...,...,...,...,...,...,...,...,...,...,...
350,11976,Ba Đình,Điện Biên,Tôn Thất Thiệp,Lý Nam Đế,,,21.034050,105.844725,2
362,11985,Ba Đình,Điện Biên,Trần Phú,Lý Nam Đế,,,21.030093,105.843887,2
363,11986,Ba Đình,Điện Biên,Trần Phú,Tống Duy Tân,,,21.029998,105.844335,2
364,11987,Ba Đình,Điện Biên,Trần Phú,Phùng Hưng,Hà Trung,,21.029973,105.844738,2


In [354]:
#Number of missing values in each column
print(new_df.isna().sum())
#Datatype of each column
new_df.dtypes

Index              0
District           0
Ward               0
Street Name 1      0
Street name 2      0
Street Name 3    165
Street Name 4    194
Longitude          0
Latitude           0
Traffic Level      0
dtype: int64


Index              int64
District          object
Ward              object
Street Name 1     object
Street name 2     object
Street Name 3     object
Street Name 4     object
Longitude        float64
Latitude         float64
Traffic Level      int64
dtype: object

In [355]:
#Bar chart of records per ward, coloured by traffic level.
#The level is cast to str before being passed as the colour
#(presumably so plotly treats it as a category rather than a number).
fig = px.bar(
    data_frame=new_df,
    x="Ward",
    color=new_df["Traffic Level"].astype(str),
)
fig.show()

In [356]:
#Count of records per ward (value_counts sorts the most frequent ward first)
Ward_Count_Total = new_df['Ward'].value_counts()
#The counts Series is indexed by ward name, so Ward_Count_Total[i] with an
#integer i only works through a deprecated positional fallback.
#tolist() returns the same counts, in the same order, without indexing.
CountsTotal = Ward_Count_Total.tolist()
px.bar(Ward_Count_Total, y = CountsTotal)

In [357]:
#Coordinates and Traffic Level
#Columns are selected by name rather than by position (7, 8, 9) so the
#selection does not silently change if the workbook's column order changes.
#NOTE(review): the "Latitude" column holds values around 105.8, which is
#outside the valid latitude range [-90, 90], while "Longitude" holds values
#around 21.0 - the two headers look swapped in the source workbook. Confirm
#before using these columns as map axes or with a geographic distance.
main = new_df[['Longitude', 'Latitude', 'Traffic Level']]
main.head(10)

Unnamed: 0,Longitude,Latitude,Traffic Level
0,21.046224,105.849289,1
1,21.042412,105.852127,1
2,21.046067,105.849022,1
3,21.034974,105.835874,1
4,21.033849,105.835607,2
5,21.035962,105.839265,1
6,21.03598,105.838074,1
7,21.036582,105.836578,1
8,21.037553,105.83108,1
9,21.037688,105.807348,3


In [358]:
#Coordinates for Traffic Level 1
#.copy() makes TF_1 an independent frame. A 'labs' column is assigned to it
#later in the notebook; doing that on a filtered slice of `main` raises
#pandas' SettingWithCopyWarning.
TF_1 = main.loc[main['Traffic Level'] == 1, ['Longitude', 'Latitude']].copy()
TF_1.head(10)

Unnamed: 0,Longitude,Latitude
0,21.046224,105.849289
1,21.042412,105.852127
2,21.046067,105.849022
3,21.034974,105.835874
5,21.035962,105.839265
6,21.03598,105.838074
7,21.036582,105.836578
8,21.037553,105.83108
20,21.042534,105.841838
21,21.043551,105.842065


In [359]:
#Coordinates for Traffic Level 2
#.copy() makes TF_2 an independent frame. A 'labs' column is assigned to it
#later in the notebook; doing that on a filtered slice of `main` raises
#pandas' SettingWithCopyWarning.
TF_2 = main.loc[main['Traffic Level'] == 2, ['Longitude', 'Latitude']].copy()
TF_2.head(10)

Unnamed: 0,Longitude,Latitude
4,21.033849,105.835607
10,21.036802,105.806949
14,21.030388,105.838591
15,21.029578,105.841557
16,21.030178,105.836997
18,21.0286,105.805231
19,21.02843,105.805509
25,21.034975,105.836843
26,21.033652,105.836641
33,21.040651,105.842084


In [360]:
#Coordinates for Traffic Level 3
#.copy() makes TF_3 an independent frame. A 'labs' column is assigned to it
#later in the notebook; doing that on a filtered slice of `main` raises
#pandas' SettingWithCopyWarning.
TF_3 = main.loc[main['Traffic Level'] == 3, ['Longitude', 'Latitude']].copy()
TF_3

Unnamed: 0,Longitude,Latitude
9,21.037688,105.807348
11,21.034335,105.805245
12,21.029744,105.802636
13,21.047508,105.806128
27,21.031582,105.836236
28,21.03039,105.836007
38,21.0456,105.843339
44,21.032382,105.809576
48,21.030593,105.815841
49,21.023419,105.819744


In [361]:
#DATA CLEAN

In [362]:
#Defining Models and Clusters
#Only the two coordinate columns are converted. The TF_* frames gain a 'labs'
#column further down, so converting the whole frame would feed the previous
#cluster labels into DBSCAN as a third feature whenever this cell is re-run.
cluster1 = TF_1[['Longitude', 'Latitude']].to_numpy()
cluster2 = TF_2[['Longitude', 'Latitude']].to_numpy()
cluster3 = TF_3[['Longitude', 'Latitude']].to_numpy()

#Training Model for Traffic Level 1
#eps is expressed in the raw units of the two coordinate columns.
model1 = DBSCAN(eps=0.005, min_samples=5).fit(cluster1)
TF_1['labs'] = model1.labels_
print((TF_1['labs']).value_counts())
#DBSCAN marks noise points with the label -1; they are passed to the
#silhouette score unchanged, i.e. as one more label.
score1 = ss(cluster1, TF_1['labs'])
#A bare expression in the middle of a cell is not displayed, so print it.
print("Silhouette score for Traffic Level 1:", score1)

after_fig1 = px.scatter(
    TF_1,
    y="Latitude",
    x="Longitude",
    color=TF_1['labs'].astype(str),
    labels={"color" : "Group"},
)
after_fig1.update_layout(
    title={
        'text' : "Area with low traffic",
        'x': 0.5,
        'xanchor': 'center',
    })
#Only the last expression of a cell is rendered automatically, and this
#figure is followed by more code, so it has to be shown explicitly.
after_fig1.show()

#Training Model for Traffic Level 2
#eps is expressed in the raw units of the two coordinate columns.
model2 = DBSCAN(eps=0.005, min_samples=5).fit(cluster2)
TF_2['labs'] = model2.labels_
#Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(TF_2['labs'].value_counts())
#score2 = ss(cluster2, TF_2['labs'])

after_fig2 = px.scatter(
    TF_2,
    y="Latitude",
    x="Longitude",
    color=TF_2['labs'].astype(str),
    #Same legend title as the Traffic Level 1 figure
    labels={"color" : "Group"},
)
after_fig2.update_layout(
    title={
        'text' : "Area with moderate traffic",
        'x': 0.5,
        'xanchor': 'center',
    })
#Only the last expression of a cell is rendered automatically, and this
#figure is followed by more code, so it has to be shown explicitly.
after_fig2.show()

#Training Model for Traffic Level 3
#eps is expressed in the raw units of the two coordinate columns.
model3 = DBSCAN(eps=0.005, min_samples=5).fit(cluster3)
TF_3['labs'] = model3.labels_
#Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(TF_3['labs'].value_counts())
#score3 = ss(cluster3, TF_3['labs'])

after_fig3 = px.scatter(
    TF_3,
    y="Latitude",
    x="Longitude",
    color=TF_3['labs'].astype(str),
    #Same legend title as the Traffic Level 1 figure
    labels={"color" : "Group"},
)
after_fig3.update_layout(
    title={
        'text' : "Area with highly-condensed traffic",
        'x': 0.5,
        'xanchor': 'center',
    })
#Shown explicitly so the figure renders even if code is appended to this cell.
after_fig3.show()

In [405]:
#Range of Parameters
#(an earlier hand-written grid covered 0.0005 to 0.004 in steps of 0.0001;
# it is superseded by the finer linspace grid below)
#eps candidates: 1000 evenly spaced values from 0.0001 to 0.01, both included
epsilon = np.linspace(start=0.0001, stop=0.01, num=1000).tolist()
#min_samples candidates: every integer from 3 to 10
min_samples = list(range(3, 11))
#Every (eps, min_samples) pair; eps varies slowest, min_samples fastest
combinations = [(eps_value, sample_count)
                for eps_value in epsilon
                for sample_count in min_samples]
len(combinations)

8000

In [427]:
#Case 2: Using multiple functions to get the best score
def _report_scores(combinations, points, frame, level, min_score, min_labels=4):
    """Fit DBSCAN for every parameter pair and print the promising ones.

    Shared implementation behind getScore1 / getScore2 / getScore3.

    Parameters
    ----------
    combinations : iterable of (eps, min_samples) pairs
        Each pair is passed to DBSCAN as ``eps`` and ``min_samples``.
    points : array-like
        Data handed to ``DBSCAN.fit`` and to the silhouette score.
    frame : DataFrame
        Its 'labs' column is overwritten with the labels of every fit, so
        after the call it holds the labels of the LAST pair tried, not of
        the best one.
    level : int
        Traffic level number inserted into the printed message.
    min_score : float
        A result is printed only when its silhouette score is strictly
        greater than this value.
    min_labels : int, default 4
        Minimum number of distinct labels for a result to be considered.
        The count is taken over all labels DBSCAN returned, so the noise
        label -1, when present, is counted as well.

    Returns
    -------
    None. Results are only printed.
    """
    #enumerate(start=1) gives the 1-based loop number used in the message
    for loop_count, (epsilon, min_samples) in enumerate(combinations, start=1):
        #Fit model
        fitted = DBSCAN(eps=epsilon, min_samples=min_samples).fit(points)
        #Get labels
        frame['labs'] = fitted.labels_
        num_clus = frame['labs'].nunique()
        #Results with fewer labels are never printed, so the silhouette
        #score is not computed for them.
        if num_clus < min_labels:
            continue
        #Get scores
        score = ss(points, frame['labs'])
        if score > min_score:
            print("Loop number:", loop_count,
                  "| Parameters:", epsilon, ",", min_samples,
                  "| Score for Trafic Level " + str(level) + ":", score,
                  "| Number of clusters:", num_clus)


def getScore1(combinations):
    """Print the pairs scoring above 0.4 with more than 3 labels for Traffic Level 1.

    Uses the module-level ``cluster1`` array and overwrites ``TF_1['labs']``.
    """
    _report_scores(combinations, cluster1, TF_1, level=1, min_score=0.4)


def getScore2(combinations):
    """Print the pairs scoring above 0.4568 with more than 3 labels for Traffic Level 2.

    Uses the module-level ``cluster2`` array and overwrites ``TF_2['labs']``.
    """
    _report_scores(combinations, cluster2, TF_2, level=2, min_score=0.4568)


def getScore3(combinations):
    """Print the pairs scoring above 0.5 with more than 3 labels for Traffic Level 3.

    Uses the module-level ``cluster3`` array and overwrites ``TF_3['labs']``.
    """
    _report_scores(combinations, cluster3, TF_3, level=3, min_score=0.5)

In [413]:
getScore1(combinations)

Loop number: 2157 | Parameters: 0.002765765765765766 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2165 | Parameters: 0.0027756756756756756 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2173 | Parameters: 0.0027855855855855854 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2181 | Parameters: 0.002795495495495495 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2189 | Parameters: 0.0028054054054054054 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2197 | Parameters: 0.002815315315315315 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2205 | Parameters: 0.002825225225225225 , 7 | Score for Trafic Level 1: 0.4016969495894111 | Number of clusters: 4
Loop number: 2213 | Parameters: 0.0028351351351351348 , 7 | Score for Trafic Level 1: 0.4016969495894

In [428]:
getScore2(combinations)

Loop number: 3802 | Parameters: 0.004807207207207207 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4
Loop number: 3810 | Parameters: 0.004817117117117117 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4
Loop number: 3818 | Parameters: 0.0048270270270270275 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4
Loop number: 3826 | Parameters: 0.004836936936936937 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4


In [418]:
getScore3(combinations)

Loop number: 4601 | Parameters: 0.005798198198198198 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4609 | Parameters: 0.005808108108108109 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4617 | Parameters: 0.005818018018018018 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4625 | Parameters: 0.005827927927927928 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4633 | Parameters: 0.005837837837837838 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4641 | Parameters: 0.005847747747747748 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4649 | Parameters: 0.005857657657657658 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4657 | Parameters: 0.005867567567567568 , 3 | Score for Trafic Level 3: 0.5171859719938379 

TF1
Loop number: 2317 | Parameters: 0.002963963963963964 , 7 | Score for Trafic Level 1: 0.41488584456394306 | Number of clusters: 4
Loop number: 2325 | Parameters: 0.0029738738738738736 , 7 | Score for Trafic Level 1: 0.41488584456394306 | Number of clusters: 4
Loop number: 2333 | Parameters: 0.002983783783783784 , 7 | Score for Trafic Level 1: 0.41488584456394306 | Number of clusters: 4

TF2
Loop number: 3802 | Parameters: 0.004807207207207207 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4
Loop number: 3810 | Parameters: 0.004817117117117117 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4
Loop number: 3818 | Parameters: 0.0048270270270270275 , 4 | Score for Trafic Level 2: 0.4569707840575685 | Number of clusters: 4
Loop number: 3826 | Parameters: 0.004836936936936937 , 4 | Score for Trafic Level 2: 0.4569707840575685 

TF3
Loop number: 4601 | Parameters: 0.005798198198198198 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4609 | Parameters: 0.005808108108108109 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4617 | Parameters: 0.005818018018018018 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4625 | Parameters: 0.005827927927927928 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4633 | Parameters: 0.005837837837837838 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4641 | Parameters: 0.005847747747747748 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4649 | Parameters: 0.005857657657657658 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4657 | Parameters: 0.005867567567567568 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4665 | Parameters: 0.005877477477477478 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4673 | Parameters: 0.005887387387387388 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4681 | Parameters: 0.005897297297297297 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4
Loop number: 4689 | Parameters: 0.005907207207207207 , 3 | Score for Trafic Level 3: 0.5171859719938379 | Number of clusters: 4