In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from random import choice
from datetime import datetime

In [2]:
# Load the raw ride records; the export is pipe-delimited, hence sep='|'.
df = pd.read_csv('data/data_hackathon_v04.csv', sep='|')

In [3]:
# Show the last rows as a quick check on the parsed columns.
df.tail()

Unnamed: 0,ride_id,created_at,pickup_lat,pickup_lng,dropoff_lat,dropoff_lng,user_id,driver_id,ride_distance,canceled_by_client,canceled_by_driver,ride_to_suburb
1117295,ffffdeb3057444619af8725b67e4ee75n,2019-01-13 08:25:37.110000000,49.839718,23.994425,49.828442,24.071106,3400450035003800380039003000360037004100430034...,890e0202a6d34b239d6fb01b31ed2f7f,6.95,0,0,0
1117296,fffff4173872433b8a4bd28e636ebe91n,2018-12-29 01:53:57.100000000,49.84314,24.028856,49.821728,23.980867,3200380033003300300038003000380032004500380030...,22f45ddc006044b9b2b6e5bc19e72560,5.26,0,0,0
1117297,fffff4bbb6c84bd39a39cddaa3c95eebn,2019-02-16 00:54:49.863000000,49.877899,23.950491,49.839718,23.994425,3300380034003100420035003300390036003800310038...,,7.9,0,0,0
1117298,fffffe4f6da945abaf640947ae2119dcn,2018-11-23 17:45:50.253000000,49.841377,24.02529,49.825935,24.012459,6227435C786564775C7839345C6E5C7839335C7830665C...,,2.8,0,0,0
1117299,ffffff3a4fc94061ac3a0d16d8a83ca4n,2018-10-07 15:47:53.013000000,49.830559,24.029322,49.800385,24.050665,6227735C7864325C78643051655C7831355C7838325C74...,,5.26,0,0,0


In [4]:
# Count missing values per column.
df.isna().sum()

ride_id                    0
created_at                 0
pickup_lat                 0
pickup_lng                 0
dropoff_lat                0
dropoff_lng                0
user_id                    0
driver_id             121509
ride_distance              0
canceled_by_client         0
canceled_by_driver         0
ride_to_suburb             0
dtype: int64

In [5]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117300 entries, 0 to 1117299
Data columns (total 12 columns):
ride_id               1117300 non-null object
created_at            1117300 non-null object
pickup_lat            1117300 non-null float64
pickup_lng            1117300 non-null float64
dropoff_lat           1117300 non-null float64
dropoff_lng           1117300 non-null float64
user_id               1117300 non-null object
driver_id             995791 non-null object
ride_distance         1117300 non-null float64
canceled_by_client    1117300 non-null int64
canceled_by_driver    1117300 non-null int64
ride_to_suburb        1117300 non-null int64
dtypes: float64(5), int64(3), object(4)
memory usage: 102.3+ MB


In [6]:
from h3 import h3
import folium 

In [7]:
# Drop rows whose coordinates lie more than 3 standard deviations from the
# column mean. The filters run one after another, so each column's mean/std
# is computed on the rows that survived the previous filters.
for coordinate in ('pickup_lat', 'pickup_lng', 'dropoff_lat', 'dropoff_lng'):
    column = df[coordinate]
    within_three_sigma = np.abs(column - column.mean()) <= (3 * column.std())
    df = df[within_three_sigma]

In [8]:
# Parse the timestamp column, then derive calendar features from it.
df['created_at'] = pd.to_datetime(df['created_at'])
calendar_features = (
    ('month', 'month'),
    ('month_day', 'day'),
    ('week_day', 'weekday'),
    ('hour', 'hour'),
)
for feature_name, dt_attribute in calendar_features:
    df[feature_name] = getattr(df['created_at'].dt, dt_attribute)

In [9]:
# Mean coordinate over pickups and dropoffs combined; used to pick the
# central hexagon of the grid.
mean_lat = pd.concat([df['pickup_lat'], df['dropoff_lat']]).mean()
mean_lng = pd.concat([df['pickup_lng'], df['dropoff_lng']]).mean()

In [10]:
from h3 import h3
import folium 

In [11]:
# Fill colour for each rank returned by ranking(): rank 1 covers the lowest
# share of the maximum value, rank 6 the highest.
color_mapper = {
    1: 'black',
    2: 'blue',
    3: 'green',
    4: 'yellow',
    5: '#e26d00',
    6: 'red'
}

def ranking(value, max_value):
    '''
    Map value to a rank from 1 to 6 based on its share of max_value.

    Upper bounds (inclusive) as a fraction of max_value:
    5% -> 1, 15% -> 2, 30% -> 3, 45% -> 4, 60% -> 5; anything above 60% -> 6.
    Returns None when no comparison holds (e.g. value is NaN).
    '''
    upper_bounds = ((0.05, 1), (0.15, 2), (0.3, 3), (0.45, 4), (0.60, 5))
    for fraction, rank in upper_bounds:
        if value <= fraction * max_value:
            return rank
    if value > 0.60 * max_value:
        return 6
    return None

In [12]:
# H3 cell (resolution argument 7) containing the mean ride coordinate; its
# centre is used as the map centre in the folium cells below.
h3_address = h3.geo_to_h3(mean_lat, mean_lng, 7)
hex_center_coordinates = h3.h3_to_geo(h3_address)

In [13]:
# Cells in the k-ring of size 4 around the central cell, plus the boundary
# polygon of each one for drawing.
h3_addresses = h3.k_ring(h3_address, 4)
hexes = list(map(h3.h3_to_geo_boundary, h3_addresses))

Split the city into hexagons with a radius of about 1.22 km (H3 resolution 7)

In [15]:
# Base map centred on the central hexagon, with the hexagon grid outlined.
m = folium.Map(location=hex_center_coordinates)
for hex_boundary in hexes:
    folium.Polygon(hex_boundary).add_to(m)

# Overlay a random subset of pickup points. The sample is seeded so the map
# is reproducible on re-run, and capped at the frame length so the cell does
# not raise when fewer than 1000 rows are left after filtering.
pickup_sample = df.sample(min(1000, len(df)), random_state=42)
for pickup_lat, pickup_lng in zip(pickup_sample['pickup_lat'], pickup_sample['pickup_lng']):
    folium.Circle((pickup_lat, pickup_lng), radius=2).add_to(m)
m

In [16]:
import math

class Hexagon:
    '''
    One H3 cell, with its center, boundary vertices and radius cached.

    Points are (lat, lng) pairs in degrees. All distances are expressed in
    "degrees of latitude": the longitude difference is scaled by the cosine
    of the mean latitude so that one unit is the same ground length in every
    direction (multiply by ~111 to get kilometres).
    '''
    def __init__(self, index):
        self.index = index
        self.center = h3.h3_to_geo(self.index)
        self.vertices = h3.h3_to_geo_boundary(self.index)
        # Center-to-vertex distance, taken from the first boundary vertex.
        self.radius = self.distance(self.center, self.vertices[0])

    def distance(self, point1, point2):
        '''
        Calculate the distance between two (lat, lng) points.

        Uses the equirectangular approximation: a degree of longitude is
        shorter than a degree of latitude by a factor of cos(latitude), so
        the longitude difference is scaled before applying Pythagoras.
        A plain Euclidean distance on raw degrees overstates east-west
        distances away from the equator.
        '''
        delta_lat = point1[0] - point2[0]
        mean_lat_rad = math.radians((point1[0] + point2[0]) / 2)
        delta_lng = (point1[1] - point2[1]) * math.cos(mean_lat_rad)
        return math.hypot(delta_lat, delta_lng)

    def distance_to_center(self, point):
        '''
        Calculate distance between point and hexagon center
        '''
        return self.distance(point, self.center)

    def distance_to_hexagon(self, hexagon):
        '''
        Calculate distance between this hexagon's center and another hexagon's center
        '''
        return self.distance_to_center(hexagon.center)

In [17]:
def h3_index_belonging(point, polygons):
    '''
    Return the `index` of the polygon whose center is nearest to `point`.

    Each polygon must provide `distance_to_center(point)` and `index`.
    When several polygons are equally near, the first one in `polygons` wins.
    '''
    nearest = min(polygons, key=lambda polygon: polygon.distance_to_center(point))
    return nearest.index

Label every ride with its pickup and dropoff district

In [18]:
# One Hexagon per grid cell, built once and shared by both labelling helpers.
polygons = list(map(Hexagon, h3_addresses))


def get_pickup_district(row):
    '''Return the index of the hexagon whose center is nearest to the row's pickup point.'''
    pickup_point = (row['pickup_lat'], row['pickup_lng'])
    return h3_index_belonging(pickup_point, polygons)


def get_dropoff_district(row):
    '''Return the index of the hexagon whose center is nearest to the row's dropoff point.'''
    dropoff_point = (row['dropoff_lat'], row['dropoff_lng'])
    return h3_index_belonging(dropoff_point, polygons)


# Row-wise labelling: every ride is compared against every hexagon center.
df.loc[:, 'pickup_district'] = df.apply(get_pickup_district, axis=1)
df.loc[:, 'dropoff_district'] = df.apply(get_dropoff_district, axis=1)

Number of orders per pickup district

In [42]:
# Count rides per pickup district in a single pass over the column, instead
# of re-filtering the whole frame once per district.
# sort=False: the counts are only looked up by key, so ordering by frequency
# would be wasted work. int() keeps the values plain Python ints.
pickup_counts = df['pickup_district'].value_counts(sort=False)
districts = {district: int(count) for district, count in pickup_counts.items()}

In [None]:
# Colour each district by its order count relative to the busiest district;
# the popup shows the raw count.
max_val = max(districts.values())

m = folium.Map(location=hex_center_coordinates)

for district_index, order_count in districts.items():
    boundary = h3.h3_to_geo_boundary(district_index)
    fill = color_mapper[ranking(order_count, max_val)]
    folium.Polygon(boundary, fill_color=fill, popup=str(order_count)).add_to(m)
m

In [33]:
# Load the predictions table; the first CSV column becomes the row index.
# Later cells look rows up by a timestamp string and columns by H3 index.
predictions = pd.read_csv('data/predictions.csv', index_col=0)

In [39]:
# Hour to visualise, kept as a string because it is used below as a row
# label for `predictions.loc[...]`.
dt = str(datetime(year=2019, month=2, day=24, hour=22))



def get_today_data(date):
    '''
    Yield every label of `predictions.index` whose parsed timestamp has the
    same `.week` and the same `.day` as `date`.
    '''
    for index_date in predictions.index:
        parsed = pd.to_datetime(index_date)
        if parsed.week != date.week or parsed.day != date.day:
            continue
        yield index_date

m = folium.Map(location=hex_center_coordinates)

# Rows of `predictions` that match the selected date.
today_index = list(get_today_data(pd.to_datetime(dt)))
districts_data = predictions.loc[today_index, :]
# Largest value anywhere in the table; the colour scale is relative to it.
max_ = predictions.max().max()
print(dt)

# Row for the selected hour, fetched once and reused for colour and popup.
hour_row = predictions.loc[dt, :]
for key in hour_row.keys():
    demand = hour_row[key]
    folium.Polygon(h3.h3_to_geo_boundary(key),
                   fill_color=color_mapper[ranking(demand, max_)],
                   popup=str(int(demand))).add_to(m)
m

2019-02-24 22:00:00


In [40]:
def predict_for_hour(dt, district, predictions):
    '''
    Look up the value predicted for one district at one timestamp.

    Params:
    dt - timestamp; `str(dt)` is used as the row label
    district - column label
    predictions - table indexed by timestamp string, one column per district
    '''
    row_label = str(dt)
    return predictions.loc[row_label, district]

In [41]:
from datetime import timedelta

def districts_demand(from_district, districts, dt, speed_kmh=40, km_per_degree=111):
    '''
    Rank the predicted demand in each candidate district at the hour the
    driver would arrive there.

    Params:
    from_district - district where driver is
    districts - districts where customers want to go
    dt - current datetime
    speed_kmh - assumed average driving speed, km/h
    km_per_degree - kilometres per unit of Hexagon distance

    Returns a list with one (rank, predicted_value) tuple per entry of
    `districts`, in the same order. The rank comes from `ranking` against the
    global maximum `max_`; the value is read from the global `predictions`.
    '''
    origin = Hexagon(from_district)
    demand = []
    for district in districts:
        distance_deg = origin.distance_to_hexagon(Hexagon(district))
        travel_minutes = distance_deg * km_per_degree / speed_kmh * 60
        arrival = dt + timedelta(minutes=travel_minutes)
        # Predictions are looked up per hour, so truncate the arrival time to the hour.
        hour_key = str(arrival.replace(minute=0, second=0, microsecond=0))
        expected = predictions.loc[hour_key, district]
        demand.append((ranking(expected, max_), int(expected)))
    return demand
    
# Example: a driver in district 871e76891ffffff at 23:50 on 2019-02-24, with two candidate destination districts.
districts_demand('871e76891ffffff', ['871e7688affffff', '871e76884ffffff'], datetime(year=2019, month=2, day=24, hour=23, minute=50))

[(2, 17), (1, 1)]