##### This kernel uses the dataset from the New York City Taxi Trip Duration competition and is adapted from the kernel 'Dynamics of New York city - Animation' written by Omri Goldstein.
##### Introduction to 'Dynamics of New York city - Animation' : **[URL](https://www.kaggle.com/drgilermo/dynamics-of-new-york-city-animation/notebook)**

##### Thanks for sharing the kernel, Omri Goldstein.

In [None]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import io # 
import base64 # 
from imblearn.under_sampling import RandomUnderSampler
from subprocess import check_output
# Print the names of the files available in the competition input folder.
input_listing = check_output(['ls', '../input/nyc-taxi-trip-duration/'])
print(input_listing.decode('utf8'))

In [None]:
# Read train.csv straight out of train.zip using the zipfile module.
import os
import zipfile

file_pass = '../input/nyc-taxi-trip-duration/'
file_list = os.listdir(file_pass)

# Only load when the archive is actually present in the input folder;
# otherwise df is left untouched, as before.
if 'train.zip' in file_list:
    # Context managers close both the archive and the member handle once the
    # CSV has been parsed, instead of leaving them open for the whole session.
    with zipfile.ZipFile(os.path.join(file_pass, 'train.zip')) as zf:
        with zf.open('train.csv') as train_csv:
            df = pd.read_csv(train_csv)

In [None]:
# Alternative to the manual zipfile route: hand the archive path to pandas
# directly, so no explicit unzip step is needed.
# Reference: https://stackoverflow.com/questions/26942476/reading-csv-zipped-files-in-python
train_zip_path = '../input/nyc-taxi-trip-duration/train.zip'
df = pd.read_csv(train_zip_path)

In [None]:
# Display the first rows of the loaded ride data as a quick sanity check.
df.head()

In [None]:
# Remove rides to and from far away areas:
# keep a ride only if its pickup AND dropoff lie strictly inside the box below.

xlim = [-74.03, -73.77]
ylim = [40.63, 40.85]

lon_cols = ['pickup_longitude', 'dropoff_longitude']
lat_cols = ['pickup_latitude', 'dropoff_latitude']

# One boolean per ride and axis: True when both endpoints are within bounds.
lon_ok = ((df[lon_cols] > xlim[0]) & (df[lon_cols] < xlim[1])).all(axis=1)
lat_ok = ((df[lat_cols] > ylim[0]) & (df[lat_cols] < ylim[1])).all(axis=1)
df = df[lon_ok & lat_ok]

In [None]:
# Pool pickup and dropoff coordinates into one flat list per axis.
longitude = df.pickup_longitude.tolist() + df.dropoff_longitude.tolist()
latitude = df.pickup_latitude.tolist() + df.dropoff_latitude.tolist()

# Draw every point as a tiny, translucent dot on a square canvas.
plt.figure(figsize=(10, 10))
plt.plot(longitude, latitude, '.', markersize=0.05, alpha=0.4)
plt.show()

In [None]:
# One row per pooled pickup/dropoff point, with its longitude and latitude.
loc_df = pd.DataFrame(
    {'longitude': longitude, 'latitude': latitude},
    columns=['longitude', 'latitude'],
)

#### Cluster
Cluster New York City based on the pick-up and drop-off points of each taxi ride.

In [None]:
# Group every pickup/dropoff point into 15 location clusters.
# The model is fitted on the two coordinate columns only, passed as a plain
# array: re-running this cell after 'label' exists does not feed the label
# back in as a feature, and the fitted model stores no column names that
# later predict() inputs would have to match.
coords = loc_df[['longitude', 'latitude']].values
kmeans = KMeans(n_clusters=15,  # number of clusters
                random_state=2,
                # Number of times the k-means algorithm will be run with different centroid seeds.
                n_init=10).fit(coords)
loc_df['label'] = kmeans.labels_  # cluster assigned to each individual point

# Plot from a subsample to keep drawing cheap. The sample size is capped at
# the number of available rows so small inputs do not raise, and the seed
# makes the figure reproducible between runs.
loc_df = loc_df.sample(min(200000, len(loc_df)), random_state=2)
plt.figure(figsize=(10, 10))
# One plot call per cluster so each cluster gets its own colour; groupby
# splits the frame in a single pass, in ascending label order.
for _, members in loc_df.groupby('label'):
    plt.plot(members.longitude, members.latitude,
             '.', alpha=0.3, markersize=0.3)

plt.title('clusters of new york')
plt.show()

In [None]:
# Grey background of sampled points, with every cluster centre marked in red
# and annotated with its cluster number.
fig, ax = plt.subplots(figsize=(10, 10))
centers = kmeans.cluster_centers_
for label in loc_df.label.unique():
    members = loc_df[loc_df.label == label]
    center_x, center_y = centers[label, 0], centers[label, 1]
    ax.plot(members.longitude, members.latitude,
            '.', color='grey', markersize=0.1, alpha=0.4)
    ax.plot(center_x, center_y, 'o', color='r')
    ax.annotate(label, (center_x, center_y), color='blue', fontsize=20)
ax.set_title('cluster centers')
plt.show()

In [None]:
# Assign each ride's pickup and dropoff point to a k-means cluster.
# The coordinates are passed as plain arrays so the call does not depend on
# the column names the model saw at fit time (these frames use
# 'pickup_*'/'dropoff_*' names, which newer scikit-learn versions reject when
# they differ from the fitted feature names).
df['pickup_cluster'] = kmeans.predict(df[['pickup_longitude', 'pickup_latitude']].values)
df['dropoff_cluster'] = kmeans.predict(df[['dropoff_longitude', 'dropoff_latitude']].values)

# Hour of day of each pickup. The whole column is parsed in one vectorised
# call instead of running the dateutil parser once per row.
df['pickup_hour'] = pd.to_datetime(df.pickup_datetime).dt.hour

In [None]:
# One row per cluster: centre coordinates (x, y) plus the cluster's integer label.
centers = kmeans.cluster_centers_
clusters = pd.DataFrame(
    {'x': centers[:, 0], 'y': centers[:, 1], 'label': range(len(centers))},
    columns=['x', 'y', 'label'],
)

In [None]:
# Keep only a random 5000-row subsample of the location points.
loc_df = loc_df.sample(n=5000)

#### Neighborhood Analysis
Let's manually assign a neighborhood name to each cluster.

In [None]:
# Hand-labelled neighborhoods, keyed by a reference longitude for each one.
# Insertion order matters: it fixes the row/column order of the ride tables
# built from this mapping.
# NOTE(review): 'Uppe East Side' and 'Brooklyn-Williamsburgt' look misspelled;
# they are kept as-is because they are used as labels at runtime.
neighborhood = {
    -74.0019368351: 'Chelsea',
    -73.837549761: 'Queens',
    -73.7854240738: 'JFK',
    -73.9810421975: 'Midtown-North-West',
    -73.9862336241: 'East Village',
    -73.971273324: 'Midtown-North-East',
    -73.9866739677: 'Brooklyn-parkslope',
    -73.8690098118: 'LaGuardia',
    -73.9890572967: 'Midtown',
    -74.0081765545: 'Downtown',
    -73.9213024854: 'Queens-Astoria',
    -73.9470256923: 'Harlem',
    -73.9555565018: 'Uppe East Side',
    -73.9453487097: 'Brooklyn-Williamsburgt',
    -73.9745967889: 'Upper West Side',
}

In [None]:
# Empty ride-count table: one column per neighborhood, followed by a 'name'
# column that labels each row with a neighborhood.
neighborhood_names = list(neighborhood.values())
rides_df = pd.DataFrame(columns=neighborhood_names)
rides_df['name'] = neighborhood_names

# 1-nearest-neighbour classifier on longitude alone: a longitude is mapped to
# the neighborhood whose reference longitude is closest.
reference_longitudes = np.array([[lon] for lon in neighborhood])
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(reference_longitudes, neighborhood_names)

In [None]:
# Label both endpoints of every ride with a neighborhood predicted from its
# longitude. `.values.reshape(-1, 1)` turns the 1-D column into the 2-D
# single-feature array that predict() expects.
df['pickup_neighborhood'] = neigh.predict(df.pickup_longitude.values.reshape(-1, 1))
df['dropoff_neighborhood'] = neigh.predict(df.dropoff_longitude.values.reshape(-1, 1))

# Count rides for every (pickup, dropoff) pair in one pass over df instead of
# filtering the full frame once per pair. reindex() puts rows and columns in
# the table's own neighborhood order and fills pairs with no rides with 0.
row_names = rides_df['name'].tolist()
pair_counts = (pd.crosstab(df.pickup_neighborhood, df.dropoff_neighborhood)
               .reindex(index=row_names, columns=row_names, fill_value=0))

# Rows are pickup neighborhoods, columns are dropoff neighborhoods; the last
# column ('name') is the row label and is left untouched.
for col in rides_df.columns[:-1]:
    rides_df[col] = pair_counts[col].values

In [None]:
# Display the first rows of the pickup-by-dropoff ride-count table.
rides_df.head()

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected = True)

# rides_df holds one numeric column per dropoff neighborhood followed by a
# trailing 'name' column with the pickup neighborhood of each row. Only the
# numeric columns are heatmap data; the row labels come from 'name', so the
# string column is neither plotted as a value nor listed as a y label.
ride_cols = rides_df.columns[:-1]
trace = go.Heatmap(z = rides_df[ride_cols].values,
                   x = ride_cols,
                   y = rides_df['name'].tolist())

layout = dict(title = ' <b>Neighborhoods Interaction</b>',
              titlefont = dict(size = 30,
                               color = ('rgb(100, 100, 100)')),
              margin = dict(t=100, r=100, b=100, l=100),
              yaxis = dict(title = '<b> From </b>'),
              xaxis = dict(title = '<b> To </b>'))
data = [trace]
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'labelled-heatmap')

In [None]:
# Use the neighborhood name as the row index and remove it from the columns,
# leaving a purely numeric pickup-by-dropoff table.
rides_df = rides_df.set_index('name')

In [None]:
# Scatter of rides leaving vs. rides arriving for each neighborhood.
fig, ax = plt.subplots(figsize = (12, 12))

# Row sums count rides that start in a neighborhood, column sums count rides
# that end there. Both are computed once, not once per plotted point.
outbound = rides_df.sum(axis = 1)
inbound = rides_df.sum(axis = 0)

# Rows and columns list the neighborhoods in the same order, so the totals
# are paired by position; zip avoids integer lookups on a name-indexed Series.
for name, n_out, n_in in zip(rides_df.index, outbound, inbound):
    ax.plot(n_out, n_in, 'o', color = 'b')
    ax.annotate(name, (n_out, n_in), color = 'b', fontsize = 12)

# Diagonal reference line: points on it have equal inbound and outbound counts.
ax.plot([0,250000],[0,250000], color = 'r', linewidth = 1)
# Pass a real boolean: the string 'off' is not interpreted as False by
# current Matplotlib.
ax.grid(False)
ax.set_xlim([0,250000])
ax.set_ylim([0,250000])
ax.set_xlabel('Outbound Taxis')
ax.set_ylabel('Inbound Taxis')
ax.set_title('Inbound and Outbound rides for each cluster')

### Winter vs Summer

In [None]:
# Calendar month (1-12) of each pickup. The whole column is parsed in one
# vectorised call instead of running the dateutil parser once per row; this
# also works if the column already holds datetimes.
df['pickup_month'] = pd.to_datetime(df.pickup_datetime).dt.month

In [None]:
# Inbound vs. outbound rides per neighborhood, June (top) and January (bottom).
fig, ax = plt.subplots(2, figsize = (12, 12))

neighborhood_names = list(neighborhood.values())


def _monthly_rides(month):
    """Return the ride-count table for one calendar month.

    Rows are pickup neighborhoods and columns are dropoff neighborhoods, both
    in the order of ``neighborhood``; pairs with no rides are 0. The table
    holds numeric columns only and is indexed by neighborhood name.
    """
    month_df = df[df.pickup_month == month]
    # One pass over the month's rides instead of one full filter per pair.
    counts = pd.crosstab(month_df.pickup_neighborhood, month_df.dropoff_neighborhood)
    counts = counts.reindex(index = neighborhood_names,
                            columns = neighborhood_names,
                            fill_value = 0)
    counts.index.name = 'name'
    counts.columns.name = None
    return counts


def _plot_flows(axis, rides, title):
    """Scatter outbound (row sums) against inbound (column sums) on ``axis``."""
    # `rides` has no string label column, so these sums never mix text with
    # numbers, and each total is computed once rather than per plotted point.
    outbound = rides.sum(axis = 1)
    inbound = rides.sum(axis = 0)
    # Rows and columns share the same neighborhood order, so pair by position.
    for name, n_out, n_in in zip(rides.index, outbound, inbound):
        axis.plot(n_out, n_in, 'o', color = 'b')
        axis.annotate(name, (n_out, n_in), color = 'b', fontsize = 12)

    # Pass a real boolean: the string 'off' is not interpreted as False by
    # current Matplotlib.
    axis.grid(False)
    axis.set_xlabel('Outbound Taxis')
    axis.set_ylabel('Inbound Taxis')
    axis.set_title(title)
    axis.set_xlim([0,40000])
    axis.set_ylim([0,40000])
    # Diagonal reference line: equal inbound and outbound counts.
    axis.plot([0,40000],[0,40000])


_plot_flows(ax[0], _monthly_rides(6),
            'Inbound and Outbound rides for each cluster - June')

# As before, rides_df is left holding the January table after this cell.
rides_df = _monthly_rides(1)
_plot_flows(ax[1], rides_df,
            'Inbound and Outbound rides for each cluster - January')

As we can see, the patterns are almost identical regardless of the month: snowy January and humid, tourist-heavy June produce very similar taxi patterns.