# Neighborhood clustering and predicting pickups using ML

## Adding neighborhoods as a feature

In [2]:
import json
from shapely.geometry import Point, Polygon
import time
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns


#This function takes the longitude and latitude of a pickup or dropoff location and maps it to a neighborhood (or state, county, etc.).
#Take note that this function uses dask dataframes instead of pandas dataframes becuase they are much quicker with computations.


def neighborhood_name_adder(data, geo_data, column_longitude, column_latitude, new_column_name):
    """
    data : pandas data frame of taxi trips
    geo_data: GeoJson containing the shape of each neighborhood
    column_longitude: Name of the longitude column in the dataset
    column_latitude: Name of the latitude column in the dataset
    new_column_name: Name of the new column (i.e. `pickup_neighborhood`)
    """
    from matplotlib import path
    import numpy as np
       
    
    lat = data[column_latitude]      # selecting longitude from dask dataframe
    lon = data[column_longitude]     # selecting latitude from dask dataframe
    
    data[new_column_name] = np.zeros(len(data)) # iniitalizing new column
    
    # The loop below goes through every neighborhood in the GeoJson file and maps \
    # \the neighborhood name to the new column if a trip is within taht neighborhood

    
    for feature in geo_data['features']:     # looping through every neighborhood in GeoJson file, and mapping all trips
        coords = feature['geometry']['coordinates'][0]
        p = path.Path(coords)
        index = p.contains_points(list(zip(lon, lat)))
        data[new_column_name].loc[index] = [str(feature['properties']['neighborhood'])]*np.sum(index)

    
    return(data)


In [3]:
apr_2014 = pd.read_csv("Desktop\\Taxi Demand\\uber-raw-data-apr14.csv")
apr_2014.columns = [c.replace(" ", "") for c in apr_2014.columns] 
may_2014 = pd.read_csv("Desktop\\Taxi Demand\\uber-raw-data-may14.csv")
may_2014.columns = [c.replace(" ", "") for c in may_2014.columns] 
taxi_trips=pd.concat([apr_2014, may_2014], ignore_index=True)

In [4]:

# An example of how to run this code is below. A link to the csv file I use in this example 
# is posted in this repository, as is the GeoJson file `NY_neighborhoods.geojson`

taxi_trips = taxi_trips[(((taxi_trips.Lat >= 40.5774) & (taxi_trips.Lat <= 40.9176)) & ((taxi_trips.Lon >= -74.15) & (taxi_trips.Lon <= -73.7004)))]
print ("old data set columns are: ", taxi_trips.columns)

geo_data =  json.load(open('NY_neighborhoods.geojson'))   # GeoJson file



old data set columns are:  Index(['Date/Time', 'Lat', 'Lon', 'Base'], dtype='object')


In [5]:
taxi_trips_updated = neighborhood_name_adder(taxi_trips, geo_data, 'Lon', 'Lat', 'Pickup_neighborhood')
print ("updated data set columns are: ", taxi_trips_updated.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


updated data set columns are:  Index(['Date/Time', 'Lat', 'Lon', 'Base', 'Pickup_neighborhood'], dtype='object')


In [6]:
taxi_trips_updated

Unnamed: 0,Date/Time,Lat,Lon,Base,Pickup_neighborhood
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512,Upper East Side
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512,0
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512,East Village
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512,Midtown
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512,Midtown
...,...,...,...,...,...
1216946,5/31/2014 23:45:00,40.7309,-74.0014,B02764,Greenwich Village
1216947,5/31/2014 23:52:00,40.7528,-73.9798,B02764,Midtown
1216948,5/31/2014 23:55:00,40.7158,-73.9519,B02764,Williamsburg
1216949,5/31/2014 23:56:00,40.6961,-73.8997,B02764,Ridgewood


In [7]:
taxi_trips=taxi_trips_updated[taxi_trips_updated.Pickup_neighborhood!=0]
taxi_trips

Unnamed: 0,Date/Time,Lat,Lon,Base,Pickup_neighborhood
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512,Upper East Side
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512,East Village
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512,Midtown
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512,Midtown
6,4/1/2014 0:39:00,40.7223,-73.9887,B02512,Lower East Side
...,...,...,...,...,...
1216946,5/31/2014 23:45:00,40.7309,-74.0014,B02764,Greenwich Village
1216947,5/31/2014 23:52:00,40.7528,-73.9798,B02764,Midtown
1216948,5/31/2014 23:55:00,40.7158,-73.9519,B02764,Williamsburg
1216949,5/31/2014 23:56:00,40.6961,-73.8997,B02764,Ridgewood


In [8]:
taxi_trips["Date/Time"]=pd.to_datetime(taxi_trips["Date/Time"]).dt.strftime('%Y-%m-%d-%H')
taxi_trips.rename({'Date/Time':'Date'},axis=1,inplace=True)
taxi_trips.drop(['Base'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
taxi_trips

Unnamed: 0,Date,Lat,Lon,Pickup_neighborhood
0,2014-04-01-00,40.7690,-73.9549,Upper East Side
2,2014-04-01-00,40.7316,-73.9873,East Village
3,2014-04-01-00,40.7588,-73.9776,Midtown
4,2014-04-01-00,40.7594,-73.9722,Midtown
6,2014-04-01-00,40.7223,-73.9887,Lower East Side
...,...,...,...,...
1216946,2014-05-31-23,40.7309,-74.0014,Greenwich Village
1216947,2014-05-31-23,40.7528,-73.9798,Midtown
1216948,2014-05-31-23,40.7158,-73.9519,Williamsburg
1216949,2014-05-31-23,40.6961,-73.8997,Ridgewood


## Calculating number of trips in each neighborhood

In [10]:
df=taxi_trips.groupby(['Date','Pickup_neighborhood']).size().reset_index(name='trips')
df

Unnamed: 0,Date,Pickup_neighborhood,trips
0,2014-04-01-00,Battery Park City,3
1,2014-04-01-00,Bedford-Stuyvesant,2
2,2014-04-01-00,Borough Park,1
3,2014-04-01-00,Chelsea,13
4,2014-04-01-00,Chinatown,1
...,...,...,...
97447,2014-05-31-23,Washington Heights,5
97448,2014-05-31-23,West Village,172
97449,2014-05-31-23,Whitestone,1
97450,2014-05-31-23,Williamsburg,99


In [11]:
df.Pickup_neighborhood.unique().shape

(224,)

## Preprocessing

In [12]:
date=pd.to_datetime(df['Date'], format='%Y-%m-%d-%H')
df['Month']=date.dt.month
df['Day']=date.dt.day
df['Hour']=date.dt.hour
temp=pd.DatetimeIndex(date)
df['Weekday']=temp.weekday
# df['is_weekend'] = df.index.map(lambda x: 1 if x.Weekday > 4 else 0)
df

Unnamed: 0,Date,Pickup_neighborhood,trips,Month,Day,Hour,Weekday
0,2014-04-01-00,Battery Park City,3,4,1,0,1
1,2014-04-01-00,Bedford-Stuyvesant,2,4,1,0,1
2,2014-04-01-00,Borough Park,1,4,1,0,1
3,2014-04-01-00,Chelsea,13,4,1,0,1
4,2014-04-01-00,Chinatown,1,4,1,0,1
...,...,...,...,...,...,...,...
97447,2014-05-31-23,Washington Heights,5,5,31,23,5
97448,2014-05-31-23,West Village,172,5,31,23,5
97449,2014-05-31-23,Whitestone,1,5,31,23,5
97450,2014-05-31-23,Williamsburg,99,5,31,23,5


In [14]:
d = pd.get_dummies(df['Pickup_neighborhood'])

df_new = pd.concat([df, d], axis=1)      
df_new.drop(['Pickup_neighborhood'], inplace=True, axis=1)
df_new

Unnamed: 0,Date,trips,Month,Day,Hour,Weekday,Allerton,Alley Pond Park,Arrochar,Arverne,...,West Village,Westchester Square,Westerleigh,Whitestone,Williamsbridge,Williamsburg,Windsor Terrace,Woodhaven,Woodlawn,Woodside
0,2014-04-01-00,3,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014-04-01-00,2,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014-04-01-00,1,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2014-04-01-00,13,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014-04-01-00,1,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97447,2014-05-31-23,5,5,31,23,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97448,2014-05-31-23,172,5,31,23,5,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
97449,2014-05-31-23,1,5,31,23,5,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
97450,2014-05-31-23,99,5,31,23,5,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [15]:
y=df_new.trips

In [18]:
x=df_new
x.drop(['Date','trips'],axis=1,inplace=True)
x

Unnamed: 0,Month,Day,Hour,Weekday,Allerton,Alley Pond Park,Arrochar,Arverne,Astoria,Bath Beach,...,West Village,Westchester Square,Westerleigh,Whitestone,Williamsbridge,Williamsburg,Windsor Terrace,Woodhaven,Woodlawn,Woodside
0,4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97447,5,31,23,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97448,5,31,23,5,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
97449,5,31,23,5,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
97450,5,31,23,5,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## ML model deployment

### Random forest regressor

In [26]:
from sklearn import model_selection
x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,test_size=0.2,random_state=2)

from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()
rf

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [27]:
%%time
rf.fit(x_train,y_train)

Wall time: 7min 17s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [28]:
from sklearn.metrics import r2_score
y_pred_rf=rf.predict(x_test)
s1=r2_score(y_test,y_pred_rf)
s1

0.9278895208555953

### Gradient Boosting Regressor

In [33]:
from sklearn import ensemble
xbr=ensemble.GradientBoostingRegressor(alpha=0.92,max_depth=8)
xbr

GradientBoostingRegressor(alpha=0.92, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [34]:
%%time
xbr.fit(x_train,y_train)

Wall time: 2min 37s


GradientBoostingRegressor(alpha=0.92, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [35]:
y_pred_xbr=xbr.predict(x_test)
s2=r2_score(y_test,y_pred_xbr)
s2

0.9201605096331944

### Xgboost using grid search cv

In [38]:
import xgboost as xgb
XGB=xgb.XGBRegressor(colsample_bytree=0.7, learning_rate= 0.07, max_depth= 8, min_child_weight= 4, n_estimators= 500, nthread=4, silent= True, subsample= 0.7)

XGB.fit(x_train, y_train)
print(r2_score(y_train, XGB.predict(x_train)))
s3=r2_score(y_test, XGB.predict(x_test))
s3

0.9728990626500029


0.9399989252000918