In [1]:
%pylab inline
import pickle
import timeit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
from IPython.display import Image
from scipy import stats

#### taxi data was downloaded from
www.nyc.gov/html/tlc/html/about/trip_record_data.shtml

In [2]:
# Load the raw yellow-taxi trip records (file name indicates May 2016).
taxi_may_16 = pd.read_csv("./data/yellow_tripdata_2016-05.csv")

In [3]:
# (rows, columns) of the raw frame.
print(taxi_may_16.shape)

In [4]:
taxi_may_16.head()

In [5]:
# Per-column dtype / non-null summary.
taxi_may_16.info()

In [6]:
# True if at least one cell in the frame is missing.
taxi_may_16.isnull().values.any()

In [7]:
taxi_may_16.describe()

In [6]:
# Check which Python type the drop-off timestamp column holds after loading.
print(type(taxi_may_16.iloc[0].tpep_dropoff_datetime))

##### Trips with zero duration, zero distance, or no passengers will be ignored

In [9]:
# Report, one criterion at a time, the shape of the subsets that will be discarded:
# no passengers, zero distance, identical pickup and drop-off timestamps.
no_passengers = taxi_may_16.passenger_count == 0
no_distance = taxi_may_16.trip_distance == 0
no_duration = taxi_may_16.tpep_pickup_datetime == taxi_may_16.tpep_dropoff_datetime
for bad_rows in (no_passengers, no_distance, no_duration):
    print(taxi_may_16[bad_rows].shape)

##### New York City boundaries: [-74.25559, -73.70001] degrees longitude and [40.49612, 40.91553] degrees latitude. Keep only trips that start within these boundaries

In [4]:
# Shape of the subset whose pickup point lies inside the NYC bounding box
# (both edges inclusive, which is the default of Series.between).
lon_in_box = taxi_may_16.pickup_longitude.between(-74.25559, -73.70001)
lat_in_box = taxi_may_16.pickup_latitude.between(40.49612, 40.91553)
print(taxi_may_16[lon_in_box & lat_in_box].shape)

In [4]:
taxi_may_16.drop(taxi_may_16[taxi_may_16.passenger_count == 0].index, inplace=True)
taxi_may_16.drop(taxi_may_16[taxi_may_16.trip_distance == 0].index, inplace=True)
taxi_may_16.drop(taxi_may_16[taxi_may_16.tpep_pickup_datetime == taxi_may_16.tpep_dropoff_datetime].index, inplace=True)

In [7]:
taxi_may_16.shape

In [8]:
taxi_may_16.drop(taxi_may_16[taxi_may_16.pickup_longitude < -74.25559].index, inplace=True)
taxi_may_16.drop(taxi_may_16[taxi_may_16.pickup_longitude > -73.70001].index, inplace=True)
taxi_may_16.drop(taxi_may_16[taxi_may_16.pickup_latitude < 40.49612].index, inplace=True)
taxi_may_16.drop(taxi_may_16[taxi_may_16.pickup_latitude > 40.91553].index, inplace=True)

In [12]:
taxi_may_16.shape

In [77]:
regions = pd.read_csv("./data/regions.csv", delimiter=";")

In [14]:
regions.head()

In [15]:
regions.shape

In [18]:
print(type(regions.iloc[0].west), type(taxi_may_16.iloc[0].pickup_latitude))

##### Let's cover the city with a rectangular 50x50 grid (2500 cells, called regions). Each trip is assigned exactly one region number, based on the geographical coordinates of its start position

##### too slow variant

In [42]:
# Look up the region of the first trip directly in the regions table:
# a cell matches when west <= lon < east and south <= lat < north.
latt = taxi_may_16.iloc[0].pickup_latitude
longi = taxi_may_16.iloc[0].pickup_longitude
reg = regions[(regions.west <= longi) & (longi < regions.east) & (regions.south <= latt) & (latt < regions.north)].region.values
# Function-call form: the statement form `print reg` is a SyntaxError on
# Python 3 and every other cell already uses print(...).
print(reg)

In [70]:
def check_region(taxi_data, regions):
    """Return the region id of the grid cell that contains a trip's pickup point.

    Parameters
    ----------
    taxi_data : object exposing ``pickup_longitude`` and ``pickup_latitude``
        A single trip (e.g. one row of the trips frame).
    regions : DataFrame with ``west``, ``east``, ``south``, ``north``, ``region``
        One row per grid cell; a cell matches when
        ``west <= lon < east`` and ``south <= lat < north``.

    Returns
    -------
    The ``region`` value of the first matching cell.

    Raises
    ------
    IndexError
        If no cell contains the point.
    """
    lon = taxi_data.pickup_longitude
    lat = taxi_data.pickup_latitude

    within_lon = (regions.west <= lon) & (lon < regions.east)
    within_lat = (regions.south <= lat) & (lat < regions.north)

    matching_cells = regions[within_lon & within_lat]
    return matching_cells.region.values[0]

# Row-by-row lookup for the first 2000 trips (the slow variant).
result = []
for i in range(2000):
    result.append(check_region(taxi_may_16.iloc[i], regions))

In [15]:
#region = taxi_may_16.loc[:3000000].apply(lambda x: regions[(regions.west <= x.pickup_longitude) & 
#                                                            (x.pickup_longitude < regions.east) & 
#                                                            (regions.south <= x.pickup_latitude) & 
#                                                            (x.pickup_latitude < regions.north)].region.values[0], axis=1)
#taxi_may_16["region"] = region

#### much faster

In [20]:
# Bounding box of the grid, in degrees: longitude range, then latitude range.
NY_long_west, NY_long_east = -74.25559, -73.70001
NY_latt_south, NY_latt_north = 40.49612, 40.91553

In [18]:
def region_counter(longitude, latitude, cells_per_axis=50):
    """Map pickup coordinates to the 1-based id of the grid cell containing them.

    The bounding box (NY_long_west..NY_long_east, NY_latt_south..NY_latt_north)
    is split into ``cells_per_axis`` x ``cells_per_axis`` equal cells. Ids run
    south to north within a longitude column, then west to east, so
    ``id = column * cells_per_axis + row + 1`` with zero-based column and row.

    Parameters
    ----------
    longitude, latitude : float, numpy array or pandas Series
        Coordinates in degrees; array-like inputs are processed element-wise.
    cells_per_axis : int, default 50
        Number of cells along each axis.

    Returns
    -------
    Float(s) with the same shape as the inputs, holding ids in
    ``1 .. cells_per_axis**2``.

    Notes
    -----
    Cells are half-open (``west <= lon < east``, ``south <= lat < north``),
    the same convention the regions-table lookups in this notebook use.
    Coordinates outside the box are clamped to the nearest edge cell, so
    callers should filter out-of-box points beforehand.
    """
    lon_span = NY_long_east - NY_long_west
    lat_span = NY_latt_north - NY_latt_south

    # Zero-based column / row. floor() keeps a point lying exactly on a cell's
    # west or south edge inside that cell; the previous ceil() sent points on the
    # box's west/south edge to index 0 and therefore to ids <= 0.
    lon_idx = np.floor((longitude - NY_long_west) * cells_per_axis / lon_span)
    lat_idx = np.floor((latitude - NY_latt_south) * cells_per_axis / lat_span)

    # Points exactly on the east / north edge of the box would land one cell past
    # the grid; clamp so they belong to the last column / row.
    last = cells_per_axis - 1
    lon_idx = np.minimum(np.maximum(lon_idx, 0), last)
    lat_idx = np.minimum(np.maximum(lat_idx, 0), last)

    return lon_idx * cells_per_axis + lat_idx + 1

In [98]:
# Sanity check on a point just inside the south-west corner of the bounding box.
print(region_counter(-74.255585, 40.496125))

In [29]:
%%time
# Vectorised call: the whole coordinate columns are passed at once.
s = region_counter(taxi_may_16.pickup_longitude, taxi_may_16.pickup_latitude)

In [30]:
# Store the computed region id as a new column of the trips frame.
taxi_may_16["region"] = s

In [31]:
taxi_may_16.head()

In [68]:
taxi_may_16.shape

##### round hours - discard minutes and seconds

In [69]:
taxi_may_16["hour_statistic"] = pd.DatetimeIndex(taxi_may_16.tpep_pickup_datetime).map(lambda x: x.replace(minute=0, second=0))

In [70]:
taxi_may_16.shape

In [71]:
taxi_may_16.head()

In [73]:
taxi_may_16.loc[1000].hour_statistic

In [108]:
taxi_may_16.columns

In [114]:
# Columns written to the intermediate file, in output order.
export_columns = [
    "VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count",
    "trip_distance", "pickup_longitude", "pickup_latitude", "RatecodeID",
    "store_and_fwd_flag", "dropoff_longitude", 'dropoff_latitude', 'payment_type',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount', 'region', 'hour_statistic',
]
# Tab-separated dump of the cleaned frame with the region and hour columns added.
taxi_may_16.to_csv("./data/edit_yellow_taxi_may_2016.csv", sep='\t', columns=export_columns)

##### make 2d statistics

In [2]:
#taxi_may_16_edit = pd.read_csv("./data/edit_yellow_taxi_may_2016.csv", sep='\t')
# Only the columns needed for the statistics below are read back.
stat_columns = [
    "VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count",
    "pickup_longitude", "pickup_latitude", "region", "hour_statistic",
]
taxi_may_16_edit = pd.read_csv("./data/edit_yellow_taxi_may_2016.csv", sep='\t', usecols=stat_columns)

In [3]:
taxi_may_16_edit.shape

In [4]:
taxi_may_16_edit.hour_statistic = pd.DatetimeIndex(taxi_may_16_edit.hour_statistic)

In [5]:
taxi_may_16_edit.head()

In [6]:
taxi_may_16_edit[-5:]

In [7]:
taxi_may_16_edit.info()

In [8]:
taxi_may_16_edit.hour_statistic[0].value

In [12]:
print("amount of unique regions in May month: ", np.unique(taxi_may_16_edit.region).shape[0])

In [7]:
time_bins = np.sort(np.unique(taxi_may_16_edit.hour_statistic)).astype(np.int64)
time_bins.shape # 31 day * 24 hour

##### add last 00:00 hours of may 31 2016

In [6]:
# Close the last bin: append one more edge 3600000000000 ns (one hour) after the
# last bin start. Re-running this cell appends another edge each time.
time_bins = np.append(time_bins, time_bins[-1] + 3600000000000)

##### started trips and passenger amount statistics

In [8]:
# Bin edges 1..2501 give one bin per region id 1..2500; the time axis uses the
# hourly edges built above.
region_edges = np.arange(1, 2502)
pickup_hour_ns = taxi_may_16_edit.hour_statistic.astype(np.int64)

# Trips started per (region, hour). The 'count' statistic does not read
# `values`, so pass None rather than the string "None".
region_bined_stat = stats.binned_statistic_2d(taxi_may_16_edit.region, pickup_hour_ns,
                                            None, 'count', bins=[region_edges, time_bins])
print(region_bined_stat.statistic.shape)
# Passengers picked up per (region, hour).
passanger_bined_stat = stats.binned_statistic_2d(taxi_may_16_edit.region, pickup_hour_ns,
                                            taxi_may_16_edit.passenger_count, 'sum', bins=[region_edges, time_bins])

In [26]:
# Hour of day (0-23) of each pickup; the vectorised .dt accessor replaces a
# per-row Python lambda.
taxi_may_16_edit["day_hours"] = taxi_may_16_edit["hour_statistic"].dt.hour
taxi_may_16_edit.head()

In [14]:
# Fixed hour-of-day bin edges 0, 1, ..., 24. Deriving the edges from the hours
# present in the data would silently merge bins if some hour had no trips.
hour_bins = np.arange(25)
print(hour_bins)

##### leave only hours without minutes and seconds and calculate started trips statistics

In [12]:
# Trips started per (region, hour of day). The 'count' statistic does not read
# `values`, so pass None rather than the string "None".
reg_hour_stat = stats.binned_statistic_2d(taxi_may_16_edit.region, taxi_may_16_edit.day_hours,
                                            None, 'count', bins=[np.arange(1, 2502), hour_bins])
# NOTE(review): this persists the per-calendar-hour table (region_bined_stat),
# not the hour-of-day table computed just above; confirm that is intended.
with open('region_bined_stat.pkl', 'wb') as f:
    pickle.dump(region_bined_stat.statistic, f)

##### plot calculated statistics result in unchanged scale and cube root scale

In [36]:
# Heatmap of trip counts: rows are regions, columns are calendar hours.
plt.figure(figsize=(20, 20))
seaborn.heatmap(region_bined_stat.statistic, cmap='coolwarm')
# A heatmap has no labelled artists, so plt.legend(<str>) cannot show this
# caption; put it in the title instead.
plt.title("hour-NY region taxi counter", fontsize=24)
plt.xlabel('hours in May 2016', fontsize=24)
plt.ylabel('NY regions', fontsize=24);

In [89]:
# Same trip-count heatmap on a cube-root colour scale.
plt.figure(figsize=(20, 20))
seaborn.heatmap(np.cbrt(region_bined_stat.statistic), cmap='coolwarm')
# A heatmap has no labelled artists, so plt.legend(<str>) cannot show this
# caption; put it in the title instead.
plt.title("hour-NY region taxi counter", fontsize=24)
plt.xlabel('hours in May 2016', fontsize=24)
plt.ylabel('NY regions', fontsize=24);

##### passenger statistics

In [12]:
# Heatmap of summed passenger counts: rows are regions, columns are calendar hours.
plt.figure(figsize=(20, 20))
seaborn.heatmap(passanger_bined_stat.statistic, cmap='coolwarm')
# A heatmap has no labelled artists, so plt.legend(<str>) cannot show this
# caption; put it in the title instead.
plt.title("hour-NY region taxi counter", fontsize=24)
plt.xlabel('hours in May 2016', fontsize=24)
plt.ylabel('NY regions', fontsize=24);

##### Cumulative hour-of-day statistics for the month of May, in the unchanged and cube-root scale

In [14]:
# Heatmap of trip counts: rows are regions, columns are the hour-of-day bins.
plt.figure(figsize=(20, 20))
seaborn.heatmap(reg_hour_stat.statistic, cmap='coolwarm')
# A heatmap has no labelled artists, so plt.legend(<str>) cannot show this
# caption; put it in the title instead.
plt.title("24 hour region taxi counter", fontsize=24)
# The columns are hour-of-day bins, not calendar hours of the month.
plt.xlabel('hour of day', fontsize=24)
plt.ylabel('NY regions', fontsize=24);

In [88]:
# Same hour-of-day heatmap on a cube-root colour scale.
plt.figure(figsize=(20, 20))
seaborn.heatmap(np.cbrt(reg_hour_stat.statistic), cmap='coolwarm')
# A heatmap has no labelled artists, so plt.legend(<str>) cannot show this
# caption; put it in the title instead.
plt.title("24 hour region taxi counter", fontsize=24)
# The columns are hour-of-day bins, not calendar hours of the month.
plt.xlabel('hour of day', fontsize=24)
plt.ylabel('NY regions', fontsize=24);

##### Cumulative map showing the total number of trips per region. The cube root is applied to reduce the spread of the values

In [74]:
# Total trips per region over all hour-of-day bins, arranged in rows of 50
# and rotated by 90 degrees; the cube root is applied before plotting.
trips_per_region = reg_hour_stat.statistic.sum(axis=1)
region_grid = np.rot90(trips_per_region.reshape(-1, 50))
plt.figure(figsize=(20, 20))
seaborn.heatmap(np.cbrt(region_grid), cmap='coolwarm')

In [42]:
# See http://www.mapdevelopers.com/geocode_bounding_box.php
Image("./data/new york.png")

##### several central regions heatmap for 1-st May 2016

In [65]:
# Annotated heatmap of rows 1225..1234 of the count table for the first 24 hourly bins.
plt.figure(figsize=(24,10))
seaborn.heatmap(region_bined_stat.statistic[1225:1235, :24], annot=True, cmap='coolwarm')
# A heatmap has no labelled artists, so plt.legend(<str>) cannot show this
# caption; put it in the title instead.
plt.title("hour-NY region taxi counter", fontsize=24)
plt.xlabel('hours in May 2016', fontsize=24)
plt.ylabel('NY regions', fontsize=24);

##### Empire State Building: 40.748441° N, 73.985664° W (longitude -73.985664)

In [78]:
print("Empire State building region: ", regions[(regions.west <= -73.985664) & (-73.985664 < regions.east) & 
                                                (regions.south <= 40.748441) & (40.748441 < regions.north)].region.values)

In [79]:
ESB_reg = 1231

##### Number of trips as a function of time, amount_of_trips = f(t), for the region that contains the Empire State Building

In [82]:
# Hourly trip counts for the Empire State Building region
# (row ESB_reg-1, because region ids are 1-based).
esb_trips = region_bined_stat.statistic[ESB_reg-1,:]
plt.figure(figsize=(15,6))
# Draw the curve once (it was plotted twice) and take the x range from the
# data instead of a hard-coded 744.
plt.plot(np.arange(esb_trips.shape[0]), esb_trips)
plt.xlim([0,745])
plt.ylabel("amount of trips")
plt.xlabel("hour")
plt.title("Empire State Building region");

##### number of cells with zero amount of trips

In [87]:
# Total number of (region, hour) cells and how many of them hold zero trips.
counts = region_bined_stat.statistic
total_cells = np.prod(counts.shape)
print("all regions: ", total_cells)
print("zero regions: ", total_cells - np.count_nonzero(counts))

In [13]:
#del bined_statistic
# Remove the name `taxi_may_16_edit` from the namespace (presumably to free memory).
del taxi_may_16_edit