# NEW YORK CITY TAXI FARES (NYCTF)

## Project imports, folders and variables

### General imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from math import *

#pyplot
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

#dealing with folder paths
import os
import sys

#serialisation and compression
import pickle

#refreshing modules imports
from importlib import reload

#display images
import IPython
from IPython.display import Image

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Folder paths

In [3]:
# Project root (NYCTF). The notebook lives in "NYCTF/python/notebooks/exploratory" or
# "NYCTF/python/notebooks/reports", i.e. three levels below the root.
# The path is made absolute so that it holds no ".." segment and keeps pointing at the
# same folder even if the working directory changes later in the session.
project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir))

# Folder of the project's own modules, added to the module search path.
src_dir = os.path.join(project_dir, 'python', 'nytf')
if src_dir not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(src_dir)

# Data folders.
data_dir = os.path.join(project_dir, 'data')
pickle_dir = os.path.join(data_dir, 'processed', 'pickle')

### Global variables

In [10]:
# Unit in which distances are expressed throughout the notebook.
# A top-level assignment is already global to the notebook; a `global` statement
# at module level has no effect, so none is used here.
dist_unit = 'km'

### Modules imports

In [75]:
# Project modules. Each one is reloaded right after its import so that edits made
# to the module files are picked up without restarting the kernel.
import nytf_data_preparation as dp
reload(dp)
import nytf_geo as geo
reload(geo)
import nytf_geo_extractor as geo_extractor
reload(geo_extractor);  # trailing ';' keeps the module repr out of the cell output

### Local variables

In [15]:
# --- Data source switches ----------------------------------------------------

# True: use the data stored using pickle.
# False: (re)generate the data from scratch.
b_use_pickle_files = True

# True: the loaded data are already prepared.
# False: the loaded data are raw.
b_data_prepared = False

# Only matters when b_use_pickle_files is False.
# ["pickup_datetime"]: parse the dates while reading the csv file.
# False: parse them later (manually).
readcsv_parse_dates = False

# Set to True to display the illustrations of the maths behind the distances.
b_display_math_demo = False

# --- Column types ------------------------------------------------------------

# float16 was tried for fare_amount, but then the mean and std cannot be calculated.
col_types = dict.fromkeys(
    ('fare_amount',
     'pickup_longitude',
     'pickup_latitude',
     'dropoff_longitude',
     'dropoff_latitude'),
    'float32')
col_types['passenger_count'] = 'uint8'

# --- Geography ---------------------------------------------------------------

# [latitude min, latitude max, longitude min, longitude max]
# Needs some more exploration and thinking. For now: extrema in the training set
# for both pickup and dropoff locations.
minmax_coordinates = [
    40.568973, 41.709555,    # latitude min, max
    -74.263242, -72.986532,  # longitude min, max
]

# Radius of the Earth, meant to be expressed in dist_unit.
# NOTE(review): 6371 is the value in km; it is not derived from dist_unit, so it
# has to be changed by hand if another unit is chosen.
r_earth = 6371

## Data loading & preparation

### Get number of lines in the csv files (if loading from csv)

In [14]:
%%time

# Line-counting method handed to dp.row_count: 'unixwc' or 'readlines'.
# 'readlines' is the faster one for short files, 'unixwc' for long files.
method = 'readlines'

if b_use_pickle_files:
    # Nothing to count: the csv files are not read in this configuration.
    print('Training and test datasets will not come from .csv files.')
else:
    train_row_count = dp.row_count(os.path.join(data_dir, 'raw/train.csv'), method)
    test_row_count = dp.row_count(os.path.join(data_dir, 'raw/test.csv'), method)
    print('Number of lines in train file: ', train_row_count, sep='')
    print('Number of lines in test file: ', test_row_count, sep='')

Training and test datasets will not come from .csv files.
CPU times: user 66 µs, sys: 34 µs, total: 100 µs
Wall time: 82 µs


### Load the data (from csv or pickle files)

In [17]:
%%time
# Load the train/test dataframes, either from the pickle cache (pickle_dir) or from
# the raw csv files.
# NOTE: pickle.load can execute arbitrary code. Only load pickle files that were
# written by this project.
# NOTE(review): when b_data_prepared is True only df_train_prep / df_test_prep are
# defined here; df_train / df_test are not.
if b_use_pickle_files:
    if b_data_prepared:
        with open(os.path.join(pickle_dir, 'train_prep.pickle'), 'rb') as f:
            df_train_prep = pickle.load(f)
        with open(os.path.join(pickle_dir, 'test_prep.pickle'), 'rb') as f:
            df_test_prep = pickle.load(f)
    else:
        with open(os.path.join(pickle_dir, 'train.pickle'), 'rb') as f:
            df_train = pickle.load(f)
        with open(os.path.join(pickle_dir, 'test.pickle'), 'rb') as f:
            df_test = pickle.load(f)
else:
    df_train = pd.read_csv(os.path.join(data_dir, 'raw/train.csv'), parse_dates=readcsv_parse_dates)
    df_test = pd.read_csv(os.path.join(data_dir, 'raw/test.csv'), parse_dates=readcsv_parse_dates)

    # Cache the raw dataframes so that the next run can skip the csv parsing.
    # The cache folder may not exist yet (e.g. on a fresh checkout): create it first.
    os.makedirs(pickle_dir, exist_ok=True)
    with open(os.path.join(pickle_dir, 'train.pickle'), 'wb') as f:
        pickle.dump(df_train, f)
    with open(os.path.join(pickle_dir, 'test.pickle'), 'wb') as f:
        pickle.dump(df_test, f)

CPU times: user 16.4 s, sys: 9.35 s, total: 25.7 s
Wall time: 28.8 s


### Data preparation

In [18]:
%%time
# Prepare the test set, unless the loaded data are already prepared.
if not b_data_prepared:
    df_test_prep = dp.prepare_data(
        df_test,
        'test',  # NOTE(review): lower-case here, 'Train' for the training set; confirm how dp.prepare_data compares it
        col_types,
        not readcsv_parse_dates,  # presumably: parse the dates now if read_csv did not; confirm in dp.prepare_data
        True,                     # presumably: save the result as a pickle file; confirm in dp.prepare_data
        pickle_dir,
        'test_prep.pickle')
    # DataFrame.info() prints the summary itself and returns None, so it must not be
    # wrapped in print() (that only appends a stray "None" line to the output).
    df_test_prep.info()
else:
    print('Data already prepared.')

test
Step 1/7 complete.
Step 2/7 skipped. Not a training set.
Step 3/7 skipped. Not a training set.
Step 4/7 complete. Dates have been parsed.
Step 5/7 skipped. Not a training set.
Step 6/7 complete. Indexes reset.
Step 7/7 complete. Prepared dataframe has been saved in a pickle file.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
key                  9914 non-null object
pickup_datetime      9914 non-null datetime64[ns, UTC]
pickup_longitude     9914 non-null float32
pickup_latitude      9914 non-null float32
dropoff_longitude    9914 non-null float32
dropoff_latitude     9914 non-null float32
passenger_count      9914 non-null uint8
dtypes: datetime64[ns, UTC](1), float32(4), object(1), uint8(1)
memory usage: 319.6+ KB
None
CPU times: user 26.1 ms, sys: 2.88 ms, total: 29 ms
Wall time: 31.9 ms


In [19]:
%%time
# Prepare the training set, unless the loaded data are already prepared.
if not b_data_prepared:
    df_train_prep = dp.prepare_data(
        df_train,
        'Train',  # NOTE(review): capitalised here, 'test' for the test set; confirm how dp.prepare_data compares it
        col_types,
        not readcsv_parse_dates,  # presumably: parse the dates now if read_csv did not; confirm in dp.prepare_data
        True,                     # presumably: save the result as a pickle file; confirm in dp.prepare_data
        pickle_dir,
        'train_prep.pickle')
    # DataFrame.info() prints the summary itself and returns None, so it must not be
    # wrapped in print() (that only appends a stray "None" line to the output).
    df_train_prep.info()
else:
    print('Data already prepared.')

Train
Step 1/7 complete.
Step 2/7 complete. Incomplete rows have been removed.
Step 3/7 complete. Types changed to more relevant ones.
Step 4/7 complete. Dates have been parsed.
Step 5/7 complete. Records with negative or >=$1000 fares have been removed.
Step 6/7 complete. Indexes reset.
Step 7/7 complete. Prepared dataframe has been saved in a pickle file.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55419634 entries, 0 to 55419633
Data columns (total 8 columns):
key                  object
fare_amount          float32
pickup_datetime      datetime64[ns, UTC]
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      uint8
dtypes: datetime64[ns, UTC](1), float32(5), object(1), uint8(1)
memory usage: 1.9+ GB
None
CPU times: user 1min 57s, sys: 24.7 s, total: 2min 21s
Wall time: 2min 21s


## Geo (cleaning + first ideas)

In [20]:
# Working frames for the geo section. These are shallow copies (deep=False): new
# DataFrame objects that do not duplicate the underlying data.
df_train_prep_geo = df_train_prep.copy(deep=False)
df_test_prep_geo = df_test_prep.copy(deep=False)

In [21]:
# Number of rows in df_train.
df_train.shape[0]

55423856

### Cleaning

In [22]:
%%time
# Remove outlying coordinates (bounds passed as minmax_coordinates), then renumber
# the remaining rows from 0.
df_train_prep_geo = (
    geo.remove_outlying_coordinates(df_train_prep_geo, minmax_coordinates)
    .reset_index(drop=True)
)

CPU times: user 4.44 s, sys: 3.25 s, total: 7.7 s
Wall time: 7.7 s


In [23]:
# Number of training rows left after the coordinate cleaning.
df_train_prep_geo.shape[0]

54234152

In [24]:
%%time
# Same cleaning as for the training set: remove outlying coordinates, then renumber
# the remaining rows from 0.
df_test_prep_geo = (
    geo.remove_outlying_coordinates(df_test_prep_geo, minmax_coordinates)
    .reset_index(drop=True)
)

CPU times: user 10.5 ms, sys: 0 ns, total: 10.5 ms
Wall time: 9.76 ms


### Adding a 'distance' feature

The first method is the flying ("as the crow flies") distance.
The second method consists in rotating the map of Manhattan and then using the L1-distance.
The third method consists in using Google Maps (or similar); the itinerary may, however, not be the one the taxi driver actually took.

In [26]:
# Optional illustration, shown only when the maths demo is switched on.
if b_display_math_demo:
    sphere_image_path = os.path.join(data_dir, 'processed/images/sphere.jpg')
    display(Image(sphere_image_path))

#### Flying distance

In [27]:
# Optional illustration, shown only when the maths demo is switched on.
if b_display_math_demo:
    flying_distance_image_path = os.path.join(data_dir, 'processed/images/flying_distance.JPG')
    display(Image(flying_distance_image_path))

In [67]:
%%time
# Add the flying distance to the training set.
df_train_prep_geo = geo.add_flying_distance(
    df_train_prep_geo,
    'deg',      # presumably the unit of the coordinate columns; confirm in nytf_geo
    r_earth,
    dist_unit,
)

CPU times: user 1min 43s, sys: 3.9 s, total: 1min 47s
Wall time: 1min 47s


In [66]:
%%time
# Add the flying distance to the test set.
df_test_prep_geo = geo.add_flying_distance(
    df_test_prep_geo,
    'deg',      # presumably the unit of the coordinate columns; confirm in nytf_geo
    r_earth,
    dist_unit,
)

CPU times: user 34.1 ms, sys: 457 µs, total: 34.6 ms
Wall time: 33.4 ms


#### L1-distance

In [69]:
# Calculating the angle by which the Manhattan street grid is rotated from the north-south axis.
# https://www.nytimes.com/2006/07/02/nyregion/thecity/02grid.html states 29°. Let us check this.
if b_display_math_demo:
    street_angle_image_path = os.path.join(data_dir, 'processed/images/Manhattan_street_angle.png')
    display(Image(street_angle_image_path))

# Two side lengths used to measure the angle.
# NOTE(review): presumably read off the Manhattan_street_angle image; units not stated.
l_man, h_man = 197, 360

# Rotation angle of the coordinate frame (rad), not of the map (which is the opposite).
ang_man = -atan(l_man / h_man)

print('Angle: ', ang_man, ' rad = ', ang_man*180/pi, '°.', sep='')

Angle: -0.5007080588399464 rad = -28.68845853971702°.


In [70]:
# Optional illustrations, shown in order only when the maths demo is switched on.
if b_display_math_demo:
    for l1_demo_image in ('processed/images/L1_distance_1.JPG',
                          'processed/images/L1_distance_2.JPG',
                          'processed/images/L1_distance_3.JPG'):
        display(Image(os.path.join(data_dir, l1_demo_image)))

In [76]:
%%time
# Add the L1 distance to the training set.
df_train_prep_geo = geo.add_L1_distance(
    df_train_prep_geo,
    'deg',      # presumably the unit of the coordinate columns; confirm in nytf_geo
    r_earth,
    dist_unit,
    ang_man,    # plane_rot_angle, must be given in rad
)

CPU times: user 18min 42s, sys: 5min 37s, total: 24min 20s
Wall time: 18min 21s


In [77]:
%%time
# Add the L1 distance to the test set.
df_test_prep_geo = geo.add_L1_distance(
    df_test_prep_geo,
    'deg',      # presumably the unit of the coordinate columns; confirm in nytf_geo
    r_earth,
    dist_unit,
    ang_man,    # plane_rot_angle, must be given in rad
)

CPU times: user 228 ms, sys: 56.1 ms, total: 284 ms
Wall time: 204 ms


In [78]:
# Checkpoint: write both geo dataframes to disk with pickle.
for checkpoint_path, checkpoint_df in (
        ('processed/pickle/train_prep_geo.pickle', df_train_prep_geo),
        ('processed/pickle/test_prep_geo.pickle', df_test_prep_geo)):
    with open(os.path.join(data_dir, checkpoint_path), 'wb') as f:
        pickle.dump(checkpoint_df, f)

In [79]:
# Read both geo dataframes back from their pickle files.
train_geo_pickle_path = os.path.join(data_dir, 'processed/pickle/train_prep_geo.pickle')
test_geo_pickle_path = os.path.join(data_dir, 'processed/pickle/test_prep_geo.pickle')

with open(train_geo_pickle_path, 'rb') as f:
    df_train_prep_geo = pickle.load(f)
with open(test_geo_pickle_path, 'rb') as f:
    df_test_prep_geo = pickle.load(f)

In [80]:
# Preview of the first 10 rows of the training set.
df_train_prep_geo.iloc[:10]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,flying_distance_km,L1_distance_km
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:00+00:00,-73.844315,40.721317,-73.841614,40.712276,1,1.030742,1.274083
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:00+00:00,-74.016045,40.711304,-73.979271,40.782005,1,8.45,9.440086
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00+00:00,-73.982735,40.761269,-73.991241,40.750561,2,1.389632,1.445443
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:00+00:00,-73.987129,40.733143,-73.99157,40.758091,1,2.799211,3.616691
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968094,40.768009,-73.956657,40.783764,1,1.999081,2.002912
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:00+00:00,-74.000961,40.731628,-73.972893,40.758232,1,3.787118,4.38443
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00+00:00,-73.980003,40.751663,-73.973801,40.764843,1,1.55586,1.781619
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00+00:00,-73.951302,40.774139,-73.990097,40.751049,1,4.1555,5.454634
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00+00:00,-74.006462,40.726711,-73.99308,40.731628,1,1.253181,1.747667
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00+00:00,-73.980659,40.733871,-73.991539,40.758137,2,2.84959,3.298166


### Geographical clusters?

In [23]:
#One cluster would be, for example, JFK airport, or a "neighbourhood" in NYC (not necessarily an official one: zones can be customised/calculated)

## Time (first ideas)

In [24]:
#Time clusters / geo-time clusters / time-related features?

#weekday/weekend, night, rush hour, before/after 201X when they changed the fares...: all those that impact the fare directly through surcharges
#these should not be geo-dependent

#some other time-related parameters/clusters might be geo-dependent (i.e. impact the traffic situation)

#---
#Estimate the duration of the trip?

#---
#Open data...?

#Weather data, traffic data...?