# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [None]:
import pandas as pd
import numpy as np
import folium


In [None]:
# Detect whether the notebook is executed inside Google Colab: the
# 'google.colab' module is only present in sys.modules there.
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
# base directory that is later combined with the data file name;
# outside of Colab the parent directory of the notebook is used
path='..'
if IN_COLAB:
  # in Colab the data is not available locally: clone the course repository
  # (IPython shell command) and use the cloned directory as base directory
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

In [None]:
# read the data set that was saved after wrangling and pre-processing in block I
csv_file = path + '/DATA/train_cleaned.csv'
train = pd.read_csv(csv_file)

In [None]:
# keep only the four columns that hold the pick-up and drop-off coordinates
coordinate_columns = [
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]
coordinates = train[coordinate_columns]
coordinates.head()

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
from sklearn.cluster import KMeans

In [None]:
# number of clusters (k) and the not-yet-fitted K-Means model
clusters = 100
# NOTE: n_jobs is intentionally not passed. The parameter was deprecated in
# scikit-learn 0.23 and removed in 1.0, where passing it raises a TypeError;
# K-Means parallelizes internally without it.
myKMeans = KMeans(n_clusters=clusters)

In [None]:
# fit the model on the first 100000 rides only, which keeps the training fast
training_subset = coordinates.to_numpy()[:100000, :]
myKMeans.fit(training_subset)

In [None]:
# cluster label assigned to every sample the model was fitted on
labels = myKMeans.labels_
# coordinates of the cluster centers found by the fit
centers = myKMeans.cluster_centers_

print(labels)

In [None]:
# map of all cluster centers -- green: start, red: end,
# black line: connection between start and end of the same center
cluster_map = folium.Map(location=[40.730610, -73.935242], zoom_start=12)
for center in centers[:clusters]:
    # columns 0/1 hold the start position, columns 2/3 the end position
    start = [center[0], center[1]]
    end = [center[2], center[3]]
    folium.CircleMarker(
        start, radius=3, color="green", fill_opacity=0.9
    ).add_to(cluster_map)
    folium.CircleMarker(
        end, radius=3, color="red", fill_opacity=0.9
    ).add_to(cluster_map)
    folium.PolyLine(
        [start, end], color="black", weight=2.5, opacity=1
    ).add_to(cluster_map)

In [None]:
# show the map with the cluster centers; as the last expression of the cell
# the map object is rendered by the notebook
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [None]:
# first 100000 rides as a plain NumPy array
sub_data = coordinates.to_numpy()[:100000, :]
# K-Means model with 100 clusters. n_jobs is intentionally not passed: the
# parameter was deprecated in scikit-learn 0.23 and removed in 1.0, where
# passing it raises a TypeError.
Means = KMeans(n_clusters=100)
# train the model on the subset
Means.fit(sub_data)
# cluster label of every ride in the subset
predy = Means.predict(sub_data)

In [None]:
print(predy)
# the same first 100000 rides, kept as a DataFrame; the explicit copy makes
# the frame independent of `coordinates`, so that adding a column to it later
# does not act on a slice (which triggers pandas' SettingWithCopyWarning)
sub_data_frame = coordinates[:100000].copy()
sub_data_frame.head()

# up to here: outside of the function

In [None]:
# attach the predicted cluster label to every ride
sub_data_frame['cluster_num'] = predy
sub_data_frame.head()

# pick the rides that belong to one chosen cluster
clust_number = 50
in_cluster = sub_data_frame['cluster_num'] == clust_number
test = sub_data_frame[in_cluster]

# number of rides in the chosen cluster
print(test.shape[0])
test = test.to_numpy()
test[0]

In [None]:
# add every pickup point of the selected cluster to the map as a small blue dot
laufvar = test.shape[0]

for row in test[:laufvar]:
    folium.CircleMarker(
        [row[0], row[1]], radius=1, color="blue", fill_opacity=0.9
    ).add_to(cluster_map)
        

In [None]:
# show the map again, now including the markers added for the selected
# cluster; as the last expression of the cell it is rendered by the notebook
cluster_map

In [None]:
def show_cluster(cluster_number, cluster_anz, train_data, myMAP):
    """Draw the members of one K-Means cluster onto a folium map.

    A new K-Means model with ``cluster_anz`` clusters is fitted on the first
    100000 rows of ``train_data``. The same rows are then mapped to clusters
    with ``predict()``, and every row assigned to ``cluster_number`` is drawn
    as a small black circle at the position given by its first two columns.

    NOTE: the model is fitted anew on every call, so ``cluster_number``
    refers to the labels of this freshly fitted model only.

    Parameters
    ----------
    cluster_number : int
        Label of the cluster whose members are drawn.
    cluster_anz : int
        Number of clusters (k) of the K-Means model.
    train_data : pandas.DataFrame
        Numeric ride coordinates. The first two columns are used as the
        marker position (latitude, longitude). The frame is not modified.
    myMAP : folium.Map
        Map the markers are added to; it is modified in place.

    Returns
    -------
    folium.Map
        The ``myMAP`` object that was passed in.
    """
    # only a subset of the data is used to keep the fit fast
    sub_data = train_data.to_numpy()[:100000, :]

    # n_jobs is intentionally not passed: the parameter was deprecated in
    # scikit-learn 0.23 and removed in 1.0, where it raises a TypeError
    model = KMeans(n_clusters=cluster_anz)
    model.fit(sub_data)
    assignment = model.predict(sub_data)

    # select the rows of the requested cluster directly on the array; this
    # uses the `cluster_number` argument (not a notebook global) and avoids
    # adding a column to a slice of the caller's DataFrame
    members = sub_data[assignment == cluster_number]

    for lat, lon in members[:, :2]:
        folium.CircleMarker(
            [lat, lon], radius=1, color="black", fill_opacity=0.9
        ).add_to(myMAP)

    return myMAP

In [None]:
# draw the members of cluster 2 of a 100-cluster model onto the existing map
show_cluster(
    cluster_number=2,
    cluster_anz=100,
    train_data=coordinates,
    myMAP=cluster_map,
)

In [None]:
# print() would only emit the object's repr; evaluating the map as the last
# expression of the cell lets the notebook render it
cluster_map

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.