In [None]:
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

from ipywidgets import interactive

from collections import defaultdict

import hdbscan
import folium
import re


# Colour palette for cluster plots: 20 distinct hex colours repeated ten times,
# giving 200 entries that are indexed directly by cluster id.
cols = 10 * [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
    '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',
    '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080',
]

In [None]:
# Load the taxi data set; later cells read its LON, LAT and NAME columns.
df = pd.read_csv('Data/taxi_data.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# True if at least one row repeats the (LON, LAT) pair of an earlier row.
df.duplicated(subset=['LON', 'LAT']).values.any()

In [None]:
# True if any cell anywhere in the frame is missing.
df.isna().values.any()

In [None]:
# Clean the frame in place and report its shape on either side of the clean-up:
#   1. remove every row that contains a missing value;
#   2. keep only the first row of each distinct (LON, LAT) pair.
print(f'Before dropping NaNs and dupes\t:\tdf.shape = {df.shape}')

df.dropna(inplace=True)
df.drop_duplicates(
    subset=['LON', 'LAT'],
    keep='first',
    inplace=True,
)

print(f'After dropping NaNs and dupes\t:\tdf.shape = {df.shape}')

In [None]:
# Coordinates as a two-column float64 array: column 0 is LON, column 1 is LAT.
X = np.array(df[['LON', 'LAT']], dtype = 'float64')

In [None]:
# Quick look at the raw points: LON on the x-axis, LAT on the y-axis.
# The low alpha makes overlapping points show up as darker regions.
plt.scatter(X[:,0], X[:,1],alpha = 0.2,s=50)

In [None]:
# Interactive map centred on the mean coordinate, with one blue circle per row.
m = folium.Map(
    location=[df.LAT.mean(), df.LON.mean()],
    zoom_start=9,
    tiles='Stamen toner',
)

marker_colour = '#1787FE'

for _, row in df.iterrows():
    # Popup text is the row's NAME with everything except letters and spaces removed.
    popup_text = re.sub(r'[^a-zA-Z ]+', '', row.NAME)

    marker = folium.CircleMarker(
        location=[row.LAT, row.LON],
        radius=5,
        popup=popup_text,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
    )
    marker.add_to(m)

In [None]:
# Display the map built in the previous cell as this cell's output.
m

In [None]:
# Synthetic 2-D data: 1000 points around 10 centres (std 0.5), seeded so the
# same points are generated on every run. The true blob labels are discarded.
X_blobs, _ = make_blobs(n_samples=1000, centers = 10, n_features=2, cluster_std=0.5, random_state=4)

In [None]:
# Pre-computed cluster labels loaded from disk; the cells below use them as
# per-point labels for X_blobs.
class_predictions = np.load('Data/sample_clusters.npy')

In [None]:
# Sorted distinct cluster ids present in the loaded labels.
unique_clusters = np.unique(class_predictions)

In [None]:
# Draw every cluster of the blob data in its own colour, then show the
# silhouette score of the labelling as the cell output.
for unique_cluster in unique_clusters:
    # Use a dedicated name here: rebinding X would overwrite the notebook-level
    # X that holds the taxi coordinates built from df earlier on.
    cluster_points = X_blobs[class_predictions == unique_cluster]
    plt.scatter(cluster_points[:,0], cluster_points[:,1], alpha = 0.2, c = cols[unique_cluster])
silhouette_score(X_blobs, class_predictions)

In [None]:
# Replace the labels with a second pre-computed labelling loaded from disk;
# the cells below plot and score it against the same X_blobs.
class_predictions = np.load('Data/sample_clusters_improved.npy')

In [None]:
# Sorted distinct cluster ids present in the second labelling.
unique_clusters = np.unique(class_predictions)

In [None]:
# Same plot as for the first labelling: one colour per cluster, followed by
# the silhouette score of the labelling as the cell output.
for unique_cluster in unique_clusters:
    # Use a dedicated name here: rebinding X would overwrite the notebook-level
    # X that holds the taxi coordinates built from df earlier on.
    cluster_points = X_blobs[class_predictions == unique_cluster]
    plt.scatter(cluster_points[:,0], cluster_points[:,1], alpha = 0.2, c = cols[unique_cluster])
silhouette_score(X_blobs, class_predictions)

In [None]:
# Rebind X_blobs to a new synthetic set: 1000 points around 50 centres with a
# larger spread (std 1), again seeded for repeatability.
X_blobs, _ = make_blobs(n_samples=1000, centers=50, 
                        n_features=2, cluster_std=1, random_state=4)

In [None]:
# Run K-Means (3 clusters, one fixed-seed initialisation) once for every
# iteration cap from 1 to 20 and store each run's result under that cap, so
# the progress of the algorithm can be replayed step by step.
data = defaultdict(dict)
for x in range(1, 21):
    model = KMeans(n_clusters=3, random_state=17,
                   max_iter=x, n_init=1).fit(X_blobs)

    data[x]['class_predictions'] = model.predict(X_blobs)
    data[x]['centroids'] = model.cluster_centers_
    # Take the class ids from this run's own predictions, not from whatever
    # class_predictions an earlier cell happened to leave in the namespace.
    data[x]['unique_classes'] = np.unique(data[x]['class_predictions'])

In [None]:
def f(x):
    """Plot the K-Means result stored under key ``x`` of the module-level ``data``.

    ``data`` is filled by an earlier cell with one K-Means run per ``max_iter``
    value, so ``x`` selects how many iterations the plotted run was allowed.
    Points are drawn per class in that class's colour from ``cols``; centroids
    are drawn as large black triangles. Both axes are fixed to [-15, 15] so
    successive plots are directly comparable.
    """
    snapshot = data[x]
    labels = snapshot['class_predictions']
    centres = snapshot['centroids']
    class_ids = snapshot['unique_classes']

    for class_id in class_ids:
        members = X_blobs[labels == class_id]
        plt.scatter(members[:, 0], members[:, 1], alpha=0.3, c=cols[class_id])

    plt.scatter(centres[:, 0], centres[:, 1], s=200, c='#000000', marker='v')
    plt.ylim([-15, 15])
    plt.xlim([-15, 15])
    plt.title('How K-Means Clusters')

# Wrap f in an ipywidgets control; x is given the range (1, 20), matching the
# keys stored in data.
interactive_plot = interactive(f, x=(1, 20))
# The widget's last child is treated as its output area; give it a fixed height.
# NOTE(review): presumably to stop the layout jumping while the plot redraws.
output = interactive_plot.children[-1]
output.layout.height = '350px'
interactive_plot

In [None]:
# Cluster the taxi coordinates with K-Means and store each row's label in a
# column named after the chosen k. X is rebuilt from df first because earlier
# cells reuse the name X for other data.
k = 70
X = np.array(df[['LON', 'LAT']], dtype='float64')

model = KMeans(n_clusters=k, random_state=17)
model.fit(X)

class_predictions = model.predict(X)
df[f'CLUSTER_kmeans{k}'] = class_predictions

In [None]:
# Preview the frame again, now including the K-Means label column.
df.head()

In [None]:
def create_map(df, cluster_column):
    """Build a folium map with one circle marker per row, coloured by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'LAT' and 'LON' columns plus ``cluster_column``.
    cluster_column : str
        Name of the column holding integer cluster labels. The label -1 is
        drawn in black; every other label takes its colour from ``cols``.

    Returns
    -------
    folium.Map
        Map centred on the mean LAT/LON of ``df`` with all markers added.
    """
    m = folium.Map(location=[df.LAT.mean(), df.LON.mean()], zoom_start=9, tiles='Stamen Toner')

    for _, row in df.iterrows():

        # Cast to a plain int so the label is always a valid list index,
        # whatever numeric type iterrows hands back.
        cluster_label = int(row[cluster_column])

        if cluster_label == -1:
            cluster_colour = '#000000'
        else:
            # Wrap around the palette so labels >= len(cols) reuse colours
            # instead of raising IndexError.
            cluster_colour = cols[cluster_label % len(cols)]

        folium.CircleMarker(
            location=[row['LAT'], row['LON']],
            radius=5,
            popup=row[cluster_column],
            color=cluster_colour,
            fill=True,
            fill_color=cluster_colour
        ).add_to(m)
    return m

# Map and score the K-Means labelling. The column and output file names are
# derived from k, so they always agree with the K printed below and with the
# column name the clustering cell created.
m = create_map(df, f'CLUSTER_kmeans{k}')
print(f'K={k}')
print(f'Silhouette Score: {silhouette_score(X, class_predictions)}')

m.save(f'kmeans_{k}.html')

In [None]:
# Try every k from 2 to 99 and remember the one with the highest silhouette
# score. -1 is the lowest value the score can take, so any real score beats it.
# NOTE: the loop rebinds the notebook-level k, model and class_predictions.
best_silhouette, best_k = -1, 0

for k in tqdm(range(2, 100)):
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(X)
    class_predictions = model.predict(X)

    curr_silhouette = silhouette_score(X, class_predictions)
    if curr_silhouette > best_silhouette:
        best_silhouette, best_k = curr_silhouette, k

print(f'K={best_k}')
print(f'Silhouette Score: {best_silhouette}')

In [None]:
# Demo of the relabelling trick used later for outliers: every -1 is replaced
# by -(position + 2), so each one gets its own label that cannot collide with
# -1 or with any non-negative label. All other entries are left unchanged.
dummy = np.array([-1, -1, -1, 2, 3, 4, 5, -1])
positions = np.arange(len(dummy))
new = np.where(dummy == -1, (positions + 2) * dummy, dummy)

In [None]:
# Density-based clustering of the taxi coordinates with DBSCAN; the label of
# every row is stored in its own column next to the K-Means labels.
model = DBSCAN(eps=0.01, min_samples=5)
model.fit(X)

class_predictions = model.labels_
df['CLUSTERS_DBSCAN'] = class_predictions

In [None]:
m = create_map(df, 'CLUSTERS_DBSCAN')

# Points labelled -1 are outliers, not a cluster, so they are excluded from
# the cluster count and handled separately below.
outlier_mask = class_predictions == -1
clustered_mask = ~outlier_mask

print(f'Number of clusters found: {len(np.unique(class_predictions[clustered_mask]))}')
print(f'Number of outliers found: {len(class_predictions[outlier_mask])}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[clustered_mask], class_predictions[clustered_mask])}')

# Score again with every outlier as its own one-point cluster: each -1 becomes
# -(position + 2), which is unique and never equals -1 or a real cluster id.
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')

In [None]:
# Display the DBSCAN map built in the previous cell.
m

In [None]:
# Cluster the taxi coordinates with HDBSCAN and store each row's label in its
# own column; later cells treat the label -1 as "outlier".
model = hdbscan.HDBSCAN(
    min_cluster_size=5,
    min_samples=2,
    cluster_selection_epsilon=0.01,
)

class_predictions = model.fit_predict(X)
df['CLUSTERS_HDBSCAN'] = class_predictions


In [None]:
m = create_map(df, 'CLUSTERS_HDBSCAN')

# Points labelled -1 are outliers, not a cluster. Counting the distinct labels
# of the remaining points stays correct even when there are no outliers at
# all, where "number of distinct labels minus one" would be off by one.
outlier_mask = class_predictions == -1
clustered_mask = ~outlier_mask

print(f'Number of clusters found: {len(np.unique(class_predictions[clustered_mask]))}')
print(f'Number of outliers found: {len(class_predictions[outlier_mask])}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[clustered_mask], class_predictions[clustered_mask])}')

# Score again with every outlier as its own one-point cluster: each -1 becomes
# -(position + 2), which is unique and never equals -1 or a real cluster id.
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')

m

In [None]:
# Nearest-neighbour classifier with a single neighbour; fitted and used in the
# following cells to give HDBSCAN outliers a cluster label.
classifier = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Split the rows by HDBSCAN outcome: rows that received a cluster form the
# training set, rows labelled -1 are the ones still needing a label.
is_hdbscan_outlier = df.CLUSTERS_HDBSCAN == -1

df_train = df[~is_hdbscan_outlier]
df_predict = df[is_hdbscan_outlier]

In [None]:
# Training data: coordinates (LON, LAT) and HDBSCAN labels of the clustered rows.
X_train = np.array(df_train[['LON','LAT']], dtype = 'float64')
y_train = np.array(df_train['CLUSTERS_HDBSCAN'])

# Coordinates of the outlier rows, in df_predict's row order.
X_predict = np.array(df_predict[['LON','LAT']], dtype = 'float64')

In [None]:
# Fit the classifier on the rows HDBSCAN managed to cluster.
classifier.fit(X_train, y_train)

In [None]:
# One predicted cluster label per outlier row, in the same order as X_predict.
predictions = classifier.predict(X_predict)

In [None]:
# Start the hybrid labelling as a copy of the HDBSCAN labels (outliers still -1).
df['CLUSTERS_hybrid'] = df['CLUSTERS_HDBSCAN']


In [None]:
# Overwrite the outlier rows with the classifier's labels. The mask is the same
# one that produced df_predict, so predictions line up with the selected rows.
df.loc[df.CLUSTERS_HDBSCAN == -1, 'CLUSTERS_hybrid'] = predictions

In [None]:
# Map of the hybrid labelling (HDBSCAN clusters plus re-assigned outliers).
m = create_map(df, 'CLUSTERS_hybrid')

In [None]:
# Compare how cluster sizes are distributed under the two labellings:
# value_counts gives the number of rows per cluster, and the histogram shows
# how many clusters fall into each size bucket.
hybrid_cluster_sizes = df['CLUSTERS_hybrid'].value_counts()
kmeans_cluster_sizes = df['CLUSTER_kmeans70'].value_counts()

hybrid_cluster_sizes.plot.hist(bins=70, alpha=0.4, label='Hybrid')
kmeans_cluster_sizes.plot.hist(bins=70, alpha=0.4, label='K-Means (70)')

plt.legend()
plt.title('Comparing HDBSCAN and K-Means (70)')

In [None]:

# Score the hybrid labelling itself. Without this rebinding, class_predictions
# would still hold the raw HDBSCAN labels (outliers as -1) from the earlier
# cell, and the numbers below would not describe the map being saved.
class_predictions = df['CLUSTERS_hybrid'].values

print(f'Number of clusters found: {len(np.unique(class_predictions))}')
print(f'Silhouette: {silhouette_score(X, class_predictions)}')

m.save('hybrid.html')