# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [2]:
import pandas as pd
import numpy as np
import folium



In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster



Wall time: 1min 5s


KMeans(n_clusters=100, n_jobs=-1)

In [7]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [8]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [9]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [18]:
def show_cluster(cluster_number, train_data, Map):
    
    #make prediction
    cluster_pred = myKMeans.predict(train_data)
    #get data for one cluster center
    cluster_data =train_data[cluster_pred == cluster_number].to_numpy()
    
    #plot center
    for i in range(len(cluster_data)):
        folium.CircleMarker([cluster_data[i,0], cluster_data[i,1]], radius=5,                
                            color="yellow", 
                            fill_opacity=0.1
                           ).add_to(Map)
        folium.CircleMarker([cluster_data[i,2], cluster_data[i,3]], radius=5,                
                            color="blue", 
                            fill_opacity=0.1
                           ).add_to(Map)
        
        return Map

In [20]:
show_cluster(50, train_data=coordinates, Map=cluster_map)

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [34]:
def cluster_var(cluster_number, k):  
        
        numpy_coord = coordinates.to_numpy()
        
        for i in range(k):
            cluster_data = numpy_coord[cluster_number==i]
            np.var(cluster_data, axis=0)
            print(np.var(cluster_data, axis=0))

In [35]:
cluster_var(myKMeans.predict(coordinates.to_numpy()), 10)

[6.78184600e-05 4.69020270e-05 6.77488626e-05 8.42250111e-05]
[3.34704713e-05 2.62122595e-05 2.71045298e-05 2.87076559e-05]
[5.99371860e-05 1.31480493e-04 2.87858676e-04 2.70446642e-04]
[0.00028823 0.0002006  0.00014814 0.00011946]
[9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
[5.84022026e-05 8.62085753e-05 5.39068576e-05 8.28138390e-05]
[3.47860815e-05 2.52605997e-05 5.73631276e-05 6.06595573e-05]
[5.87590942e-04 9.56155678e-05 8.26924333e-04 1.30687762e-03]
[6.88594357e-05 7.52431911e-05 4.73101468e-04 2.18016609e-04]
[5.89589310e-05 7.00015559e-05 4.01237167e-05 3.35383254e-05]


In [36]:
cluster_var(myKMeans.predict(coordinates.to_numpy()), 100)

[6.78184600e-05 4.69020270e-05 6.77488626e-05 8.42250111e-05]
[3.34704713e-05 2.62122595e-05 2.71045298e-05 2.87076559e-05]
[5.99371860e-05 1.31480493e-04 2.87858676e-04 2.70446642e-04]
[0.00028823 0.0002006  0.00014814 0.00011946]
[9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
[5.84022026e-05 8.62085753e-05 5.39068576e-05 8.28138390e-05]
[3.47860815e-05 2.52605997e-05 5.73631276e-05 6.06595573e-05]
[5.87590942e-04 9.56155678e-05 8.26924333e-04 1.30687762e-03]
[6.88594357e-05 7.52431911e-05 4.73101468e-04 2.18016609e-04]
[5.89589310e-05 7.00015559e-05 4.01237167e-05 3.35383254e-05]
[0.00034927 0.00016017 0.00011939 0.00015423]
[0.00012589 0.00010388 0.00018936 0.00012268]
[5.47862265e-05 3.17199422e-05 4.93857902e-05 3.04200466e-05]
[0.00293911 0.00454012 0.00098063 0.00242336]
[1.07815740e-04 7.51692110e-05 1.24968486e-04 1.51340028e-04]
[3.01426472e-05 4.98549424e-05 3.66986267e-05 2.25923661e-05]
[0.00020453 0.00018061 0.00034477 0.00027656]
[3.08007389e-05 2.26118476