### Motivation

Motivated by the decreasing pattern visible in the yearly distribution of taxi pickups in New York, we started to research the phenomenon. After finding out that the cause was the rise of other ride-hailing companies, we decided to study it in more depth.

### Dataset

The Data
The dataset contains, roughly, four groups of files:

Uber trip data from 2014 (April - September), separated by month, with detailed location information

There are six files of raw data on Uber pickups in New York City from April to September 2014. The files are separated by month and each has the following columns:

Date/Time : The date and time of the Uber pickup
Lat : The latitude of the Uber pickup
Lon : The longitude of the Uber pickup
Base : The TLC base company code affiliated with the Uber pickup

In [None]:
#print(os.listdir("../project/nyc-taxi-trip-duration"))

As we did with the main code, we will explain the notebook part by part instead of grouping it by topic, since we added new analyses as the project advanced.

Importing libraries

In [None]:
import datetime
import os
import random

import folium 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.ticker import PercentFormatter
from plotly.subplots import make_subplots
from sodapy import Socrata
from folium.plugins import *

# Bokeh names used by the weekday line plots. Imported after the star import
# above so that these explicit names are the ones that stay bound.
from bokeh import palettes
from bokeh.embed import file_html
from bokeh.io import output_file, save, show
from bokeh.models import ColumnDataSource, FactorRange, Legend
from bokeh.plotting import figure
from bokeh.resources import CDN

path='../project/'
sns.set(color_codes=True)
%matplotlib inline

Importing taxi dataset

In [None]:
#!/usr/bin/env python

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
# NOTE(review): this client is rebound a few lines below before any request
# is made, so it is never actually used.
client = Socrata("data.cityofnewyork.us", None)

# Authenticated client (needed for non-public datasets):
# NOTE(review): the application token is hard-coded in the source; consider
# loading it from the environment instead of committing it.
client = Socrata("data.cityofnewyork.us",
                 "Hi16i3SK4y2UNiONqpKz4IROh",
                 username="**************",
                 password="**************",
                 timeout=120)
# Dataset endpoint: https://data.cityofnewyork.us/resource/gkne-dk5s.json
# Request up to 3,000,000 rows, returned as JSON from the API and converted to
# a Python list of dictionaries by sodapy.
results = client.get("gkne-dk5s", limit=3000000) # presumably sized to match the Uber rows without NaN -- confirm

# Convert to pandas DataFrame
df_taxi = pd.DataFrame.from_records(results)

### Data engineering

In the following lines of code we adapt the data for the studies that follow, and filter the taxi and Uber dataset columns so that both contain the same information.

In [None]:
# Parse the pickup timestamp strings into pandas datetimes.
# NOTE(review): the format stops at minutes; confirm the API strings carry no
# seconds part, otherwise parsing with this exact format will fail.
df_taxi['pickup_datetime'] = pd.to_datetime(df_taxi['pickup_datetime'],format = '%Y-%m-%dT%H:%M')

In [None]:
# Keep only the columns at positions 1, 5 and 6.
# Presumably pickup_datetime, pickup_longitude and pickup_latitude (the names
# the following cells rely on) -- confirm against the API column order.
df_taxi = df_taxi.iloc[:,[1,5,6,]]

In [None]:
# Keep only the months covered by the Uber files (April to September).
pickup_month = df_taxi["pickup_datetime"].dt.month
df_taxi = df_taxi[(pickup_month > 3) & (pickup_month < 10)]


In [None]:
# Split the date/time column into calendar features.
# The column was already converted to datetimes in an earlier cell, so this
# second conversion leaves the values unchanged.
df_taxi['pickup_datetime'] = pd.to_datetime(df_taxi['pickup_datetime'].copy())        
#df_taxi['Year'] = df_taxi['pickup_datetime'].dt.year
df_taxi['Month'] = df_taxi['pickup_datetime'].dt.month
df_taxi['MonthDay'] = df_taxi['pickup_datetime'].dt.day
df_taxi['DayOfWeek'] = df_taxi['pickup_datetime'].dt.dayofweek
df_taxi['HourOfDay'] = df_taxi['pickup_datetime'].dt.hour
# Hour index within the week: DayOfWeek * 24 + HourOfDay, stored as float.
df_taxi["HourOfWeek"] = (df_taxi["DayOfWeek"]*24 )+df_taxi["HourOfDay"].astype(float)

In [None]:
# The raw timestamp is no longer needed once the calendar features exist.
df_taxi.drop(['pickup_datetime'],axis=1,inplace=True)

In [None]:
# Swap the first two columns so that latitude comes before longitude, then
# give the coordinates the same names the Uber files use.
df_taxi=df_taxi.iloc[:,[1,0,2,3,4,5,6]]
df_taxi.rename(columns={"pickup_latitude": "Lat", "pickup_longitude": "Lon"},inplace=True)
# Convert the coordinate columns to floats in one vectorised step instead of
# calling float() once per row.
df_taxi['Lon']=df_taxi['Lon'].astype(float)
df_taxi['Lat']=df_taxi['Lat'].astype(float)


In [None]:
# Create the Uber dataset from the six monthly files (April to September 2014).
uber_files = ['uber/uber-raw-data-apr14.csv',
              'uber/uber-raw-data-may14.csv',
              'uber/uber-raw-data-jun14.csv',
              'uber/uber-raw-data-jul14.csv',
              'uber/uber-raw-data-aug14.csv',
              'uber/uber-raw-data-sep14.csv']
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0). ignore_index gives every row a unique label, so later
# label-based operations such as drop() cannot hit rows of another month
# that happen to share the same per-file row number.
df_uber = pd.concat((pd.read_csv(path+name) for name in uber_files),
                    ignore_index=True)
df_uber = df_uber[df_uber['Lat'].notna()]


In [None]:
# Convert the Date/Time strings to pandas datetimes (month/day/year, 24-hour clock).
df_uber['Date/Time'] = pd.to_datetime(df_uber['Date/Time'],format = '%m/%d/%Y %H:%M:%S')

In [None]:
# Split the date/time column into the same calendar features as the taxi data.
# The column was already converted to datetimes in the previous cell, so this
# second conversion leaves the values unchanged.
df_uber['Date/Time'] = pd.to_datetime(df_uber['Date/Time'].copy())        
#df_uber['Year'] = df_uber['Date/Time'].dt.year
df_uber['Month'] = df_uber['Date/Time'].dt.month
df_uber['MonthDay'] = df_uber['Date/Time'].dt.day
df_uber['DayOfWeek'] = df_uber['Date/Time'].dt.dayofweek
df_uber['HourOfDay'] = df_uber['Date/Time'].dt.hour
# Hour index within the week: DayOfWeek * 24 + HourOfDay, stored as float.
df_uber["HourOfWeek"] = (df_uber["DayOfWeek"]*24 )+df_uber["HourOfDay"].astype(float)

In [None]:
# Drop the raw timestamp and the base code so only coordinates and the derived
# calendar columns remain.
df_uber.drop(['Date/Time','Base'],axis=1,inplace=True)


In [None]:
# Randomly downsample Uber to the same number of rows as the taxi data.
# NOTE(review): no random_state is given, so the sample differs between runs;
# this also assumes df_uber has at least len(df_taxi) rows -- confirm.
df_uber=df_uber.sample(len(df_taxi))

### Taxi-Uber Analysis

As we did in the main code, the following analysis contains many time distributions. Therefore, we decided to create a plot function in order to keep the same format across all of them.

In [None]:
def Grouped_plot(x_taxi,x_uber,Index_list,label_plot,version):
    """Plot the taxi and Uber pickup distributions as bars and show the figure.

    Parameters
    ----------
    x_taxi, x_uber : sequence of float
        Bar heights (fractions of the total) for taxi and Uber. Both are
        expected to have the same length.
    Index_list : list
        Tick labels, one per group of bars. Only used when ``version == 1``.
    label_plot : str
        X-axis label; also used to build the figure title.
    version : int
        ``1`` draws yellow/black bars side by side with one labelled tick per
        group. Any other value draws the two series overlaid on the same
        positions with the default colours and ticks, which suits long
        series.
    """
    taxi_heights = list(x_taxi)
    uber_heights = list(x_uber)
    positions = np.arange(len(taxi_heights))

    plt.figure(figsize = [12, 6])

    if version == 1:
        bar_width = 0.3
        plt.bar(positions, taxi_heights, width=bar_width,
                label='Taxi', alpha=0.5, color='yellow')
        plt.bar(positions + bar_width, uber_heights, width=bar_width,
                label='Uber', alpha=0.5, color='black')
        # The two bars of a group span [p - w/2, p + 3w/2], so the middle of
        # the group is p + w/2 (p + w would be the centre of the Uber bar).
        plt.xticks(positions + bar_width / 2, Index_list)
    else:
        plt.bar(positions, taxi_heights, label='Taxi', alpha=0.5)
        plt.bar(positions, uber_heights, label='Uber', alpha=0.5)

    # The heights are fractions of 1, so render the axis ticks as percentages
    # to agree with the "%" in the axis label.
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.ylabel('Proportion %  ')
    plt.xlabel(label_plot, fontweight='bold')
    plt.title(label_plot+' PickUps distribution')

    # Create legend & show graphic
    plt.legend()
    plt.show()

The histograms, with Uber in black and taxis in yellow, make it easy to compare the differences in the time distributions and to read off the insights underlying these patterns.

In [None]:
# Share of pickups per hour of the day, for each service.
taxi_hour_counts = df_taxi.groupby("HourOfDay")["HourOfDay"].count()
df_dayhour_taxi = (taxi_hour_counts / taxi_hour_counts.sum()).to_frame()
uber_hour_counts = df_uber.groupby("HourOfDay")["HourOfDay"].count()
df_dayhour_uber = (uber_hour_counts / uber_hour_counts.sum()).to_frame()

# Side-by-side table of both distributions, aligned by position.
df_dayhour = pd.DataFrame({
    'Taxi': df_dayhour_taxi['HourOfDay'].tolist(),
    'Uber': df_dayhour_uber['HourOfDay'].tolist(),
})

In [None]:
# Side-by-side bars (version 1) of the hourly taxi and Uber shares.
Grouped_plot(df_dayhour['Taxi'].values,df_dayhour['Uber'].values,
             df_dayhour.index.to_list(),'Hour of the Day',1)

In [None]:
# Share of pickups per day of the week, for each service.
taxi_weekday_counts = df_taxi.groupby("DayOfWeek")["DayOfWeek"].count()
df_weekday_taxi = (taxi_weekday_counts / taxi_weekday_counts.sum()).to_frame()
uber_weekday_counts = df_uber.groupby("DayOfWeek")["DayOfWeek"].count()
df_weekday_uber = (uber_weekday_counts / uber_weekday_counts.sum()).to_frame()

# Side-by-side table of both distributions, labelled with the day names.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
df_weekday = pd.DataFrame(
    {'Taxi': df_weekday_taxi['DayOfWeek'].tolist(),
     'Uber': df_weekday_uber['DayOfWeek'].tolist()},
    index=weekday_names,
)

In [None]:

# Side-by-side bars (version 1) of the taxi and Uber shares per weekday.
Grouped_plot(df_weekday['Taxi'].values,df_weekday['Uber'].values,
             df_weekday.index.to_list(),'Week Days',1)

In [None]:
# Share of pickups per hour of the week, for each service.
taxi_weekhour_counts = df_taxi.groupby("HourOfWeek")["HourOfWeek"].count()
df_weekhour_taxi = (taxi_weekhour_counts / taxi_weekhour_counts.sum()).to_frame()
uber_weekhour_counts = df_uber.groupby("HourOfWeek")["HourOfWeek"].count()
df_weekhour_uber = (uber_weekhour_counts / uber_weekhour_counts.sum()).to_frame()

# Side-by-side table of both distributions, aligned by position.
df_weekhour = pd.DataFrame({
    'Taxi': df_weekhour_taxi['HourOfWeek'].tolist(),
    'Uber': df_weekhour_uber['HourOfWeek'].tolist(),
})

In [None]:
# Overlaid bars (version 2) of the taxi and Uber shares per hour of the week.
# Taxi goes first and Uber second, matching the Grouped_plot signature, so
# that each series gets the right legend label.
Grouped_plot(df_weekhour['Taxi'].values,df_weekhour['Uber'].values,
             df_weekhour.index.to_list(),'Week Hours',2)

In [None]:
# Share of pickups per month, for each service.
df_month_taxi = df_taxi.groupby("Month")["Month"].count()
df_month_taxi=pd.DataFrame(df_month_taxi/df_month_taxi.sum())
df_month_uber = df_uber.groupby("Month")["Month"].count()
df_month_uber=pd.DataFrame(df_month_uber/df_month_uber.sum())

# The index values are shown as tick labels on the month plots, so they must
# be spelled correctly ('August', not 'Augost').
df_month = pd.DataFrame({'Taxi':  df_month_taxi['Month'].to_list(),
                   'Uber': df_month_uber['Month'].to_list()},
                       index=[ 'April', 'May', 'June', 'July',
                              'August', 'September'])

In [None]:
# Side-by-side bars (version 1) of the taxi and Uber shares per month.
Grouped_plot(df_month['Taxi'].values,df_month['Uber'].values,
             df_month.index.to_list(),'Month',1)

In order to try a more interactive visualization, we tried to do something similar in plotly. However, the result was not as good as the previous simple figures.

In [None]:
# Three stacked panels: taxi only, Uber only, and both together.
rows = 3
cols = 1
# One title per panel; previously the second and third titles were fused into
# a single string, which left the comparison panel untitled.
fig = make_subplots( rows=rows, cols=cols,
                     subplot_titles=['Taxi Hour Distribution',
                                     'Uber Hour Distribution',
                                     'Comparison'],
                     vertical_spacing=0.1, horizontal_spacing=0.10)

#hour
hour_labels = df_dayhour.index.to_list()
fig.add_trace(go.Bar(x=hour_labels,
                       y=df_dayhour_taxi['HourOfDay']), row=1, col=1)

fig.add_trace(go.Bar(x=hour_labels,
                       y=df_dayhour_uber['HourOfDay']), row=2, col=1)

fig.add_trace(go.Bar(x=hour_labels,
                       y=df_dayhour_taxi['HourOfDay']), row=3, col=1)

fig.add_trace(go.Bar(x=hour_labels,
                       y=df_dayhour_uber['HourOfDay']), row=3, col=1)

fig.update_layout(height=1000, width=900, showlegend=False)
fig.show()

In [None]:
# Three stacked panels: taxi only, Uber only, and both together.
rows = 3
cols = 1
# One title per panel, naming the weekday distribution shown here (the
# previous titles were copied from the hourly figure and fused two into one).
fig = make_subplots( rows=rows, cols=cols,
                     subplot_titles=['Taxi Week Day Distribution',
                                     'Uber Week Day Distribution',
                                     'Comparison'],
                     vertical_spacing=0.1, horizontal_spacing=0.10)

#weekdays
weekday_labels = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                   'Friday', 'Saturday', 'Sunday' ]
fig.add_trace(go.Bar(x=weekday_labels,
                       y=df_weekday_taxi['DayOfWeek']), row=1, col=1)

fig.add_trace(go.Bar(x=weekday_labels,
                       y=df_weekday_uber['DayOfWeek']), row=2, col=1)

fig.add_trace(go.Bar(x=weekday_labels,
                       y=df_weekday_taxi['DayOfWeek']), row=3, col=1)

fig.add_trace(go.Bar(x=weekday_labels,
                       y=df_weekday_uber['DayOfWeek']), row=3, col=1)

fig.update_layout(height=1000, width=900, showlegend=False)
fig.show()





In [None]:

# Three stacked panels: taxi only, Uber only, and both together.
rows = 3
cols = 1
# One title per panel, naming the monthly distribution shown here (the
# previous titles were copied from the hourly figure and fused two into one).
fig = make_subplots( rows=rows, cols=cols,
                     subplot_titles=['Taxi Month Distribution',
                                     'Uber Month Distribution',
                                     'Comparison'],
                     vertical_spacing=0.1, horizontal_spacing=0.10)

#months
month_labels = df_month.index.to_list()
fig.add_trace(go.Bar(x=month_labels,
                       y=df_month_taxi['Month']), row=1, col=1)

fig.add_trace(go.Bar(x=month_labels,
                       y=df_month_uber['Month']), row=2, col=1)

fig.add_trace(go.Bar(x=month_labels,
                       y=df_month_taxi['Month']), row=3, col=1)

fig.add_trace(go.Bar(x=month_labels,
                       y=df_month_uber['Month']), row=3, col=1)

fig.update_layout(height=1000, width=900, showlegend=False)
fig.show()


As we did in the main code, the hourly distribution per weekday was turned into a Bokeh plot where it is easy to select the day we want to visualize. This makes it easier to compare, day by day, the behaviour of the different transport methods.

In [None]:
#output_notebook() # open the bokeh viz on the notebook.

# Build the taxi hour-by-weekday table here so this cell no longer depends on
# names that are not defined anywhere above it. It follows the same recipe as
# the Uber table: pickups counted per (hour of day, day of week), each weekday
# column normalised to sum to 1, columns renamed to the day names.
weekday_list=["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"]
df_grouped_taxi = (df_taxi.groupby(["HourOfDay", "DayOfWeek"])
                   .size()
                   .unstack("DayOfWeek"))
df_grouped_taxi = df_grouped_taxi.div(df_grouped_taxi.sum(axis=0), axis=1)
df_grouped_taxi.columns=weekday_list
# Bokeh needs the hour as a regular column, not only as the index.
df_grouped_taxi['Hours']=df_grouped_taxi.index.values

## it is a standard way to convert your df to bokeh
source_taxi = ColumnDataSource(df_grouped_taxi)

In [None]:
# Figure for the taxi weekday lines. The x range is categorical, with one
# factor per row of df_grouped_taxi (index + 1, as strings).
# NOTE(review): plot_height/plot_width are the pre-3.0 Bokeh keyword names;
# confirm against the installed Bokeh version.
p_taxi = figure(x_range=FactorRange(factors=list(map(str, (df_grouped_taxi.index+1).values.tolist()))), 
           plot_height=400, plot_width=800, title='Hourly Week Days Distribution Taxi',
           x_axis_label='Hour of the Day', y_axis_label='Proportioned Frequency')

In [None]:
# One colour per weekday, taken from the Category20 palette of that size.
color = palettes.Category20[len(weekday_list)]
bar ={} # line renderer per weekday, keyed by day name
items = []
# Draw one line per weekday column and collect (label, [renderer]) pairs for
# the legend. Every line starts muted.
for indx,i in enumerate(weekday_list):
    # NOTE(review): `width` looks like a leftover from a bar-chart version of
    # this plot; confirm Bokeh accepts it for line glyphs.
    bar[i] = p_taxi.line(x='Hours', 
                    y=i, 
                    source=source_taxi, 
                    width=0.9,
                    muted=True, 
                    muted_alpha=0.005,
                    color=color[indx])
    items.append((i, [bar[i]]))
    
# Legend placed to the left of the plot; clicking an entry mutes or unmutes its line.
legend = Legend(items=items)
p_taxi.add_layout(legend, 'left')
p_taxi.legend.click_policy = 'mute'
p_taxi.legend.label_text_font_size='7pt'
show(p_taxi)

In [None]:
# Save the taxi plot as an HTML file with the resources inlined.
output_file('plot_taxi.html', mode='inline')
save(p_taxi)

In [None]:
# Render the taxi plot as an HTML document that uses CDN resources and write
# it to plot_taxi.html in the working directory.
# NOTE(review): this is the same file name the previous cell saves to, so
# that output is overwritten -- confirm which variant is wanted.
html = file_html(p_taxi, CDN, "my plot")
with open("./plot_taxi.html","w+") as f:
    f.write(html)

In [None]:
# Add a helper column to count on, then count Uber pickups per
# (hour of day, day of week): one row per hour, one column per weekday.
df_uber['index']=df_uber.index
df_grouped_uber = pd.pivot_table(df_uber, index = "HourOfDay", 
                            columns = "DayOfWeek",values = 'index' ,aggfunc = 'count')

# Day names used as column labels and legend entries.
weekday_list=["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"]

In [None]:
# Normalise every weekday column by its own total so that it sums to 1.
df_grouped_uber = df_grouped_uber.div(df_grouped_uber.sum(axis=0), axis=1)

# Create Hour of the day column as the pivot_table Pandas function converts the HourOfTheDay column to an index.
# We need Hour of the day as a separate column to be handed to Bokeh.
df_grouped_uber.columns=weekday_list
df_grouped_uber['Hours']=df_grouped_uber.index.values
display(df_grouped_uber.shape)

In [None]:
#output_notebook() # open the bokeh viz on the notebook.
# Wrap the Uber table in a ColumnDataSource so that glyphs can refer to its
# columns by name.
source_uber = ColumnDataSource(df_grouped_uber)

In [None]:
# Figure for the Uber weekday lines. The x range is categorical, with one
# factor per row of df_grouped_uber (index + 1, as strings).
# NOTE(review): plot_height/plot_width are the pre-3.0 Bokeh keyword names;
# confirm against the installed Bokeh version.
p_uber = figure(x_range=FactorRange(factors=list(map(str, (df_grouped_uber.index+1).values.tolist()))), 
           plot_height=400, plot_width=800, title='Hourly Week Days Distribution Uber',
           x_axis_label='Hour of the Day', y_axis_label='Proportioned Frequency')

In [None]:
# One colour per weekday, taken from the Category20 palette of that size.
color = palettes.Category20[len(weekday_list)]
bar ={} # line renderer per weekday, keyed by day name
items = []
# Draw one line per weekday column and collect (label, [renderer]) pairs for
# the legend. Every line starts muted.
for indx,i in enumerate(weekday_list):
    # NOTE(review): `width` looks like a leftover from a bar-chart version of
    # this plot; confirm Bokeh accepts it for line glyphs.
    bar[i] = p_uber.line(x='Hours', 
                    y=i, 
                    source=source_uber, 
                    width=0.9,
                    muted=True, 
                    muted_alpha=0.005,
                    color=color[indx])
    items.append((i, [bar[i]]))
    
# Legend placed to the left of the plot; clicking an entry mutes or unmutes its line.
legend = Legend(items=items)
p_uber.add_layout(legend, 'left')
p_uber.legend.click_policy = 'mute'
p_uber.legend.label_text_font_size='7pt'
show(p_uber)

In [None]:
from bokeh.embed import server_document
# Build an embed script for the public Bokeh slider demo application.
# NOTE(review): `script` is not referenced again in the visible part of this
# file -- confirm whether this cell is still needed.
script = server_document("https://demo.bokeh.org/slider")

In [None]:

# Save the Uber plot as an HTML file with the resources inlined.
output_file('plot_uber.html', mode='inline')
save(p_uber)

In [None]:
# Render the Uber plot as an HTML document string that uses CDN resources.
html = file_html(p_uber, CDN, "my plot")

In [None]:
# Write the current `html` string to plot_uber.html in the working directory.
# NOTE(review): this is the same file name used by the output_file/save cell
# for the Uber plot, so that output is overwritten -- confirm which is wanted.
with open("./plot_uber.html","w+") as f:
    f.write(html)

Later, we decided to try different visualization methods in order to show the geographical patterns of the different transport companies.

Firstly, we plotted the pickup locations on a folium heatmap, so that we can see the range and the concentration of the spatial distribution of each dataset. Then we did the same with a 2D histogram rendered in matplotlib. However, as in the main code, the spatial distribution was finally visualized as a plotly heatmap, which gave a sharp and detailed image. To achieve that, the DTU HPC cluster was used because of the computation required.

In [None]:
def generateBaseMap(default_location=[40.767937,-73.982155], default_zoom_start=10):
    """Return a folium base map drawn with the Stamen Toner tiles.

    Parameters
    ----------
    default_location : list of float
        ``[latitude, longitude]`` of the map centre. The default is the
        centre used for the New York City maps in this notebook.
    default_zoom_start : int
        Initial zoom level of the map.

    Returns
    -------
    folium.Map
        A new map with the scale control enabled.
    """
    # The default list is only read here, never mutated.
    base_map = folium.Map(location=default_location, control_scale=True, 
                          zoom_start=default_zoom_start, tiles="Stamen toner")
    return base_map

In [None]:
def to_coordinates(row):
    """Return the latitude and longitude of *row* as a ``[Lat, Lon]`` list.

    *row* is anything that can be indexed with the keys ``"Lat"`` and
    ``"Lon"``, for example a DataFrame row.
    """
    latitude, longitude = row["Lat"], row["Lon"]
    return [latitude, longitude]

In [None]:
# Add a 'Coor' column holding the [Lat, Lon] pair of every pickup.
df_taxi['Coor']=df_taxi.apply(to_coordinates,axis=1)
df_uber['Coor']=df_uber.apply(to_coordinates,axis=1)
# Number of pickups per distinct (Lat, Lon) pair, as [Lat, Lon, count] rows.
data_HM_taxi=df_taxi[['Lat', 'Lon', 'Coor']].groupby(['Lat', 'Lon']).count().reset_index().values.tolist()
data_HM_uber=df_uber[['Lat', 'Lon', 'Coor']].groupby(['Lat', 'Lon']).count().reset_index().values.tolist()

In [None]:
# Mock row for the heatmap intensity: take the largest per-location pickup
# count of the taxi data and put one synthetic [90.0, 40.0, max_val] row at
# the front of the Uber list.
# NOTE(review): the HeatMap cells below are fed df_uber['Coor'], not
# data_HM_uber, so this row has no visible effect there -- confirm.
max_val=max(df_taxi[['Lat', 'Lon', 'Coor']].groupby(['Lat', 'Lon'])
            .count().reset_index()['Coor'].values)
data_HM_uber.insert(0, [float(90),float(40),float(max_val)])

In [None]:
# Heatmap of all taxi pickup coordinates on a fresh base map.
base_map_taxi = generateBaseMap()

HeatMap(data=df_taxi['Coor'], 
        radius=10, max_zoom=12).add_to(base_map_taxi)
# Last expression of the cell, so the notebook displays the map.
base_map_taxi

In [None]:
# Heatmap of all Uber pickup coordinates on a fresh base map.
base_map_uber = generateBaseMap()
HeatMap(data=df_uber['Coor'], 
        radius=10, max_zoom=12,
        ).add_to(base_map_uber)
# Last expression of the cell, so the notebook displays the map.
base_map_uber

In [None]:
#euclidian distances introduce errors. Therefore, we bin with haversine functions.
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Both points are given as longitude/latitude in decimal degrees. The
    distance is computed with the haversine formula on a sphere of radius
    6371 km (use 3956 instead to get miles).
    """
    earth_radius_km = 6371

    # Work in radians from here on.
    lon1, lat1, lon2, lat2 = [radians(angle) for angle in (lon1, lat1, lon2, lat2)]

    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1

    # Haversine of the central angle between the two points.
    hav = sin(delta_lat/2)**2 + cos(lat1) * cos(lat2) * sin(delta_lon/2)**2
    central_angle = 2 * asin(sqrt(hav))
    return central_angle * earth_radius_km


# Remove pickups that fall outside the bounding box used for the maps.
# A boolean mask is used instead of drop() by label, so the result stays
# correct even if index labels repeat.
outside_box = ((df_uber['Lon'] < -74.08) | (df_uber['Lon'] > -73.84)
               | (df_uber['Lat'] > 40.86) | (df_uber['Lat'] < 40.60))
df_uber_drop=df_uber[~outside_box]

# Bin estimation: measure the sides of the bounding box with haversine
# distances (in km). (10/2) * (sum of two opposite sides) is ten times their
# mean length, i.e. ten bins per kilometre.
lat_min = df_uber_drop['Lat'].min()
lat_max = df_uber_drop['Lat'].max()
lon_min = df_uber_drop['Lon'].min()
lon_max = df_uber_drop['Lon'].max()
# North-south extent (both points share a longitude) -> latitude bins.
yharvesine=(10/2)*(haversine(lon_min, lat_min, lon_min, lat_max)+haversine(lon_max, lat_min, lon_max, lat_max))
# East-west extent (both points share a latitude) -> longitude bins.
xharvesine=(10/2)*(haversine(lon_min, lat_max, lon_max, lat_max)+haversine(lon_min, lat_min, lon_max, lat_min))

# The first histogram axis is latitude, so it takes the north-south bin count
# and the second (longitude) takes the east-west one. The returned edge
# arrays are named to match.
count, lat, lon = np.histogram2d(df_uber_drop.Lat, df_uber_drop.Lon, bins = [int(yharvesine),int(xharvesine)])

plt.figure(figsize=(10,10))
ax= plt.imshow(count, cmap='magma',vmax=500, origin='lower')

# Switch the grid off explicitly; the former `b` keyword is not available in
# current matplotlib releases.
plt.grid(False)


In [None]:
# Remove pickups that fall outside the bounding box used for the maps.
# A boolean mask is used instead of drop() by label, so the result stays
# correct even if index labels repeat.
outside_box = ((df_taxi['Lon'] < -74.08) | (df_taxi['Lon'] > -73.84)
               | (df_taxi['Lat'] > 40.86) | (df_taxi['Lat'] < 40.60))
df_taxi_drop=df_taxi[~outside_box]

# Bin estimation: reuses lat_min/lat_max/lon_min/lon_max computed from the
# Uber data, so both histograms share the same number of bins.
# North-south extent (both points share a longitude) -> latitude bins.
yharvesine=(10/2)*(haversine(lon_min, lat_min, lon_min, lat_max)+haversine(lon_max, lat_min, lon_max, lat_max))
# East-west extent (both points share a latitude) -> longitude bins.
xharvesine=(10/2)*(haversine(lon_min, lat_max, lon_max, lat_max)+haversine(lon_min, lat_min, lon_max, lat_min))

# The first histogram axis is latitude, so it takes the north-south bin count
# and the second (longitude) takes the east-west one. The returned edge
# arrays are named to match.
count, lat, lon = np.histogram2d(df_taxi_drop.Lat, df_taxi_drop.Lon, bins = [int(yharvesine),int(xharvesine)])

plt.figure(figsize=(10,10))
# Scale the colour ceiling by the ratio of Uber rows to taxi rows left after
# filtering, so both images use a comparable intensity scale.
ax= plt.imshow(count, cmap='magma',vmax=(500/(df_uber_drop.shape[0]/df_taxi_drop.shape[0])), origin='lower') 

# Switch the grid off explicitly; the former `b` keyword is not available in
# current matplotlib releases.
plt.grid(False)


In [None]:

prep_data_uber = df_uber[['Lat', 'Lon','HourOfWeek']]
prep_data_uber = prep_data_uber.dropna(axis=0, subset=['Lat', 'Lon','HourOfWeek'])
# One frame per hour of the week, each a list of [Lat, Lon] pairs. A week has
# 7 * 24 = 168 hours (0..167); the previous range stopped at 165 and dropped
# the last two hours. Grouping once replaces a full row-by-row scan of the
# table for every single hour.
uber_coords_by_hour = {hour: rows[['Lat', 'Lon']].values.tolist()
                       for hour, rows in prep_data_uber.groupby('HourOfWeek')}
Uber_timeline = [uber_coords_by_hour.get(hour, []) for hour in range(7 * 24)]

In [None]:
# Animated heatmap of the Uber pickups, with one frame per entry of Uber_timeline.
base_map = folium.Map([40.767937,-73.982155], tiles='Stamen Toner', zoom_start=10)
HeatMapWithTime(Uber_timeline, auto_play=True,
                             radius=6).add_to(base_map)
display(base_map)

In [None]:
prep_data_taxi = df_taxi[['Lat', 'Lon','HourOfWeek']]
prep_data_taxi = prep_data_taxi.dropna(axis=0, subset=['Lat', 'Lon','HourOfWeek'])
# One frame per hour of the week, each a list of [Lat, Lon] pairs. A week has
# 7 * 24 = 168 hours (0..167); the previous range stopped at 165 and dropped
# the last two hours. Grouping once replaces a full row-by-row scan of the
# table for every single hour.
taxi_coords_by_hour = {hour: rows[['Lat', 'Lon']].values.tolist()
                       for hour, rows in prep_data_taxi.groupby('HourOfWeek')}
taxi_timeline = [taxi_coords_by_hour.get(hour, []) for hour in range(7 * 24)]

In [None]:
# Animated heatmap of the taxi pickups, with one frame per entry of taxi_timeline.
base_map = folium.Map([40.767937,-73.982155], tiles='Stamen Toner', zoom_start=10)
HeatMapWithTime(taxi_timeline, auto_play=True,
                             radius=6).add_to(base_map)
display(base_map)

In this section we tried to predict, from a given location and time, whether a person was going to request a taxi or an Uber. However, after balancing the data and preprocessing the values, the grid is entirely blue, meaning that the model predicts Uber everywhere. Since this result is not precise, it was not added to the website.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:

# Keep the columns at positions 0, 1, 4 and 5 of each filtered dataset
# (presumably Lat, Lon, DayOfWeek and HourOfDay -- confirm against the column
# order) and balance the classes by sampling as many Uber rows as there are
# taxi rows.
df_taxi_drop=df_taxi_drop.reset_index(drop=True).iloc[:,[0,1,4,5]]
df_uber_drop=df_uber_drop.sample(df_taxi_drop.shape[0]).reset_index(drop=True).iloc[:,[0,1,4,5]]

In [None]:
# Work with a random subset of 100 rows per service.
# NOTE(review): no random_state is given, so the subset differs between runs.
sample=100
df_taxi_drop=df_taxi_drop.sample(sample)
df_uber_drop=df_uber_drop.sample(sample)

In [None]:
# Class label used by the classifier: 1 for taxi rows, 0 for Uber rows.
df_taxi_drop.loc[:,'Category']=np.ones(df_taxi_drop.shape[0],dtype=int)
df_uber_drop.loc[:,'Category']=np.zeros(df_uber_drop.shape[0],dtype=int)

In [None]:

# Stack taxi and Uber rows into one training table. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_knn=pd.concat([df_taxi_drop, df_uber_drop], ignore_index=True)
# Feature order is [Lon, Lat, DayOfWeek, HourOfDay] so that it matches the
# prediction grid, whose rows are built as [x, y, d, h] with x taken from the
# longitude edges and y from the latitude edges. Training on [Lat, Lon, ...]
# while predicting on [Lon, Lat, ...] would swap the two coordinates.
x_train=df_knn[["Lon", "Lat", "DayOfWeek", "HourOfDay"]].to_numpy()
y_train=df_knn["Category"].to_numpy()

In [None]:
# Create a 50 x 50 grid with numpy histogram2d; only the bin edges are used.
_, xedges, yedges = np.histogram2d(df_knn["Lon"], df_knn["Lat"], bins=[50,50])
# x,y coordinates of the centre of every bin: the midpoint of two consecutive
# edges. (Subtracting a whole bin width from the right edges, as before,
# gives the left edges rather than the centres.)
x_coord = (xedges[:-1] + xedges[1:]) / 2
y_coord = (yedges[:-1] + yedges[1:]) / 2

In [None]:
# Every combination of weekday (0-6), hour (0-23) and grid cell, as
# [x, y, day, hour] rows of plain floats. Built in a single pass; growing an
# array with np.append copied the whole array once per added row.
X_points = [[float(x), float(y), float(d), float(h)]
            for d in range(7)
            for h in range(24)
            for x in x_coord
            for y in y_coord]

In [None]:
def assigncolor(category):
    """Return the marker colour for a predicted class label.

    ``1`` (taxi) maps to ``"yellow"`` and ``0`` (Uber) maps to ``"blue"``.
    Any other value yields ``None``.
    """
    if category == 1.0:
        return "yellow"  # taxi
    return "blue" if category == 0.0 else None  # uber, or unknown label

In [None]:
# Fit a 5-nearest-neighbour classifier with distance-weighted votes.
model = KNeighborsClassifier(n_neighbors=5, weights='distance')
model.fit(x_train, y_train)
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): `model` is rebound here, so the KNN fitted above is discarded
# and every later prediction comes from this forest of a single depth-1 tree
# -- confirm which classifier is intended.
model = RandomForestClassifier(n_estimators=1,max_depth=1)
model.fit(x_train, y_train)

In [None]:
# Accuracy measured on the same rows the model was fitted on (training
# accuracy), not on held-out data.
predicted = model.predict(x_train)
from sklearn.metrics import accuracy_score
accuracy_score(y_train, predicted)

In [None]:
# Predict the class of every grid point for one weekday/hour combination.
Day=6
Hour=19
X_points_pd=pd.DataFrame(X_points)
# Columns 2 and 3 of the grid hold the weekday and the hour; keep matching rows.
X_points_pd=X_points_pd[(X_points_pd[2]==Day) & (X_points_pd[3]==Hour)].to_numpy()
predicted = model.predict(X_points_pd)

In [None]:
# Draw the predicted class of every grid point as a coloured circle on a
# folium map.
base_map = generateBaseMap()
for grid_point, label in zip(X_points_pd, predicted):
    # Grid rows start with the x and y coordinates; folium expects [y, x].
    x, y = grid_point[0], grid_point[1]
    color = assigncolor(label)
    folium.Circle([y, x], radius=70, fill_opacity=1.0,
                  fill_color=color, color=color).add_to(base_map)
base_map