# Yellow Taxi Trip Data (New York City, June 2022)

Map of pick-up and drop-off zones or location IDs:

https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

In [2]:
import pandas as pd

In [3]:
# #NOTE: We might need to install "pyarrow" engine for the following code to work... Use "pip install pyarrow"
# parquet_file = "yellow_tripdata_2022-06.parquet"
# df = pd.read_parquet(parquet_file, engine='auto')

In [4]:
# Load the trip records and give the four coordinate columns descriptive
# pick-up (PU) / drop-off (DO) names in a single rename pass.
coordinate_names = {
    'latPU': 'PULatitude',
    'lonPU': 'PULongitude',
    'latDO': 'DOLatitude',
    'lonDO': 'DOLongitude',
}
df = pd.read_csv("yellow_tripdata_2022-06.csv").rename(columns=coordinate_names)

In [5]:
# Preview the first rows; .loc slices by index label and includes the end label 20.
df.loc[:20]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,PULatitude,PULongitude,DOLatitude,DOLongitude
0,1,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.0,70,48,1,32.0,3.0,0.5,2.0,6.55,0.3,44.35,40.761212,-73.865136,40.777048,-73.967596
1,1,2022-06-01 00:10:17,2022-06-01 00:34:46,1.0,12.4,70,48,1,36.0,3.0,0.5,9.0,6.55,0.3,55.35,40.761212,-73.865136,40.777048,-73.967596
2,2,2022-06-01 09:18:54,2022-06-01 09:59:52,1.0,10.62,70,48,1,34.5,0.0,0.5,5.65,6.55,0.3,50.0,40.761212,-73.865136,40.777048,-73.967596
3,2,2022-06-01 10:57:55,2022-06-01 12:01:37,1.0,12.06,70,48,2,47.0,0.0,0.5,0.0,6.55,0.3,56.85,40.761212,-73.865136,40.777048,-73.967596
4,2,2022-06-01 12:10:46,2022-06-01 12:52:32,1.0,10.97,70,48,1,37.0,0.0,0.5,9.37,6.55,0.3,56.22,40.761212,-73.865136,40.777048,-73.967596
5,2,2022-06-01 12:55:59,2022-06-01 13:31:29,1.0,9.46,70,48,1,33.0,0.0,0.5,8.57,6.55,0.3,51.42,40.761212,-73.865136,40.777048,-73.967596
6,2,2022-06-01 13:38:40,2022-06-01 14:21:48,1.0,9.29,70,48,1,36.0,0.0,0.5,9.17,6.55,0.3,56.27,40.761212,-73.865136,40.777048,-73.967596
7,1,2022-06-01 13:16:28,2022-06-01 13:48:52,0.0,9.1,70,48,1,30.5,2.5,0.5,10.05,6.55,0.3,50.4,40.761212,-73.865136,40.777048,-73.967596
8,2,2022-06-01 15:42:56,2022-06-01 16:32:21,1.0,8.99,70,48,2,36.5,0.0,0.5,0.0,6.55,0.3,47.6,40.761212,-73.865136,40.777048,-73.967596
9,2,2022-06-01 16:45:28,2022-06-01 17:41:15,2.0,11.49,70,48,2,40.0,1.0,0.5,0.0,6.55,0.3,52.1,40.761212,-73.865136,40.777048,-73.967596


In [6]:
# Total number of trip records loaded.
len(df)

3496580

In [7]:
# Count of missing values in each column.
df.isna().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
PULatitude               0
PULongitude              0
DOLatitude               0
DOLongitude              0
dtype: int64

In [8]:
# Number of distinct values in each column.
df.nunique()

VendorID                       3
tpep_pickup_datetime     1709403
tpep_dropoff_datetime    1707818
passenger_count               10
trip_distance               3818
PULocationID                 258
DOLocationID                 258
payment_type                   5
fare_amount                 7375
extra                         53
mta_tax                       11
tip_amount                  3203
tolls_amount                 689
improvement_surcharge          3
total_amount               12618
PULatitude                   210
PULongitude                  210
DOLatitude                   210
DOLongitude                  210
dtype: int64

In [9]:
# Number of trips per pick-up location ID, most frequent first.
df["PULocationID"].value_counts()

237    170644
132    164191
236    148638
161    139664
142    117339
        ...  
199         2
27          2
176         2
251         1
109         1
Name: PULocationID, Length: 258, dtype: int64

# Pre-processing:

In [10]:
def getTime(x):
    """Return the time part of a "<date> <time>" timestamp.

    ``x`` is converted with ``str`` and split on single spaces; the second
    piece is returned. An ``IndexError`` is raised when there is no space.
    """
    pieces = str(x).split(" ")
    return pieces[1]
def getDate(x):
    """Return the date part of a "<date> <time>" timestamp.

    ``x`` is converted with ``str``; everything before the first space is
    returned (the whole string when it contains no space).
    """
    date_part, _, _ = str(x).partition(" ")
    return date_part

# Replace each combined timestamp column with a separate date column and a
# separate time column (pick-up first, then drop-off, so the new columns are
# appended in that order).
for source_col, date_col, time_col in (
    ('tpep_pickup_datetime', 'tpep_pickup_date', 'tpep_pickup_time'),
    ('tpep_dropoff_datetime', 'tpep_dropoff_date', 'tpep_dropoff_time'),
):
    # pop() removes the column from df and hands back its values.
    date_time = df.pop(source_col)
    df[date_col] = date_time.apply(getDate)
    df[time_col] = date_time.apply(getTime)

In [11]:
# Optional export of the pre-processed frame (left disabled):
#df.to_csv("ARX.csv", index=False)
# Show the first rows to check the new date/time columns.
df.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,...,improvement_surcharge,total_amount,PULatitude,PULongitude,DOLatitude,DOLongitude,tpep_pickup_date,tpep_pickup_time,tpep_dropoff_date,tpep_dropoff_time
0,1,1.0,11.0,70,48,1,32.0,3.0,0.5,2.0,...,0.3,44.35,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,00:25:41,2022-06-01,00:48:22
1,1,1.0,12.4,70,48,1,36.0,3.0,0.5,9.0,...,0.3,55.35,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,00:10:17,2022-06-01,00:34:46
2,2,1.0,10.62,70,48,1,34.5,0.0,0.5,5.65,...,0.3,50.0,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,09:18:54,2022-06-01,09:59:52
3,2,1.0,12.06,70,48,2,47.0,0.0,0.5,0.0,...,0.3,56.85,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,10:57:55,2022-06-01,12:01:37
4,2,1.0,10.97,70,48,1,37.0,0.0,0.5,9.37,...,0.3,56.22,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,12:10:46,2022-06-01,12:52:32


# Plotting trips within a given time interval:

In [12]:
import numpy as np
import math

def isfloat(num):
    """Return True when ``float(num)`` succeeds, otherwise False.

    Values that ``float`` rejects because of their content (e.g. ``"abc"``)
    and values it rejects because of their type (e.g. ``None`` or a list)
    both yield False, so the function can be used as a plain predicate
    without crashing on unexpected input.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported type.
        return False
    return True

def strToList(st):
    """Parse the text form of a (possibly nested) list back into a list.

    ``'[]'`` gives ``[]``. A flat list such as ``'[1.5, abc]'`` is split on
    ``", "``; items accepted by ``isfloat`` become floats and everything
    else is kept as a string. Deeper nesting is handled by splitting the
    outer list into its sub-list strings and parsing each one recursively.
    """
    if st == '[]':
        return []

    # Extra nesting levels = number of leading '[' characters minus one.
    factor = len(st) - len(st.lstrip('[')) - 1

    if factor == 0:
        # Flat list: take the text between the first '[' and the next ']'.
        inner = st.split("[")[1].split("]")[0]
        values = []
        for token in inner.split(", "):
            values.append(float(token) if isfloat(token) else token)
        return values

    # Sub-lists are separated by their closing brackets followed by ", ".
    closing = "]" * factor
    pieces = []
    for piece in st[1:-1].split(closing + ", "):
        # The split removed the closing brackets from every piece except the
        # last one; put them back where they are missing.
        if piece[len(piece) - 1] != ']':
            piece = piece + closing
        pieces.append(piece)

    return [strToList(piece) for piece in pieces]

def lessThanOrEqualToDate(date1, date2):
    """Return True when ``date1`` is on or before ``date2``.

    Both arguments are "-"-separated strings whose first three fields are
    read as integer year, month and day.
    """
    fields1 = date1.split("-")
    first = (int(fields1[0]), int(fields1[1]), int(fields1[2]))
    fields2 = date2.split("-")
    second = (int(fields2[0]), int(fields2[1]), int(fields2[2]))

    # Tuples compare field by field: year, then month, then day.
    return first <= second

def lessThanOrEqualToTime(time1, time2):
    """Return True when ``time1`` is at or before ``time2``.

    Both arguments are ":"-separated strings whose first three fields are
    read as integer hours, minutes and seconds.
    """
    fields1 = time1.split(":")
    first = (int(fields1[0]), int(fields1[1]), int(fields1[2]))
    fields2 = time2.split(":")
    second = (int(fields2[0]), int(fields2[1]), int(fields2[2]))

    # Tuples compare field by field: hours, then minutes, then seconds.
    return first <= second
    
def getPoint(point1,point2,d,curveDir):
    """Return a point offset sideways from the midpoint of a segment.

    The offset is perpendicular to the segment ``point1 -> point2`` and has
    length ``abs(d)``. Two opposite offsets are possible; the one whose 2-D
    cross product with ``point1 - point2`` has the sign given by
    ``curveDir`` (``1`` or ``-1``) is used, otherwise the opposite one. The
    result is the midpoint minus the chosen offset.

    Args:
        point1: First end point; items 0 and 1 are read as x and y.
        point2: Second end point; items 0 and 1 are read as x and y.
        d: Distance of the returned point from the segment midpoint.
        curveDir: ``1`` or ``-1``, selecting the side of the segment.

    Returns:
        ``[a, b]``, the coordinates of the offset point.

    When the cross product is zero (identical end points, or ``d == 0``)
    no side can be determined and the second candidate offset is used
    instead of failing on a NaN sign.
    """
    x1, y1 = point1[0], point1[1]
    x2, y2 = point2[0], point2[1]

    if y1 != y2:
        # (1, h) is perpendicular to the segment; scale it to length d.
        h = (x1 - x2) / (y2 - y1)
        q1 = math.sqrt((d * d) / ((h * h) + 1))
        p1 = h * q1
    else:
        # Segment parallel to the x axis: the perpendicular is the y axis.
        q1 = 0
        p1 = -1 * d

    # The second candidate is the first one mirrored through the midpoint.
    q2 = -1 * q1
    p2 = -1 * p1

    # Scalar 2-D cross product of (q1, p1) with (point1 - point2), written
    # out by hand: np.cross on 2-element vectors is deprecated in NumPy 2.
    cross = q1 * (y1 - y2) - p1 * (x1 - x2)
    if cross > 0:
        side = 1
    elif cross < 0:
        side = -1
    else:
        side = 0

    if side == curveDir:
        q, p = q1, p1
    else:
        q, p = q2, p2

    a = ((x1 + x2) / 2) - q
    b = ((y1 + y2) / 2) - p

    return [a, b]

def getCurve(point1, point2, dFrac, curve, curveDir, addPosition, threshold):
    """Recursively bend the segment ``point1 -> point2`` into a polyline.

    While the two end points are further apart than ``threshold``, a new
    point is placed beside the segment midpoint (see ``getPoint``) at a
    distance of ``dFrac`` times the segment length, and both halves are
    refined the same way.

    Args:
        point1: Start point ``[x, y]``.
        point2: End point ``[x, y]``.
        dFrac: Sideways offset as a fraction of the current segment length.
        curve: Returned unchanged when the segment is already short enough;
            callers pass ``[point1, point2]``. It is never modified.
        curveDir: ``1`` or ``-1``; passed to ``getPoint`` to pick the side.
        addPosition: Kept for backward compatibility only; it has no effect
            on the result.
        threshold: Segments of this length or shorter are not subdivided.

    Returns:
        List of points from ``point1`` to ``point2`` along the curve.
    """
    dist = np.linalg.norm(np.array(point1) - np.array(point2))
    if dist <= threshold:
        return curve

    new_point = getPoint(point1, point2, dFrac * dist, curveDir)

    left_half = getCurve(point1, new_point, dFrac, [point1, new_point], curveDir, 'left', threshold)
    right_half = getCurve(new_point, point2, dFrac, [new_point, point2], curveDir, 'right', threshold)

    # new_point ends the left half and starts the right half; keep it once.
    return left_half[:-1] + right_half

In [35]:
# Time-of-day bounds (HH:MM:SS): a trip is kept when its pick-up time is not
# before start_time and its drop-off time is not after end_time.
start_time = '00:00:00'
end_time = '00:10:00'
# Date bounds (YYYY-MM-DD): pick-up date not before start_date and drop-off
# date not after end_date.
start_date = '2022-06-05'
end_date = '2022-06-05'

In [36]:
# Filtering the dataframe: keep the trips that start and end inside the
# selected date range and time-of-day window.
#
# The four date/time columns are walked in lockstep, which avoids one label
# lookup per cell (slow on millions of rows) and does not depend on df having
# a default 0..n-1 index.
# NOTE(review): the time bounds are checked independently of the dates, so
# with a multi-day date range the time window applies to every day.
indicator = [
    # True marks a row that falls outside the window and must be removed.
    not (
        lessThanOrEqualToDate(start_date, pickup_date)
        and lessThanOrEqualToDate(dropoff_date, end_date)
        and lessThanOrEqualToTime(start_time, pickup_time)
        and lessThanOrEqualToTime(dropoff_time, end_time)
    )
    for pickup_date, pickup_time, dropoff_date, dropoff_time in zip(
        df['tpep_pickup_date'],
        df['tpep_pickup_time'],
        df['tpep_dropoff_date'],
        df['tpep_dropoff_time'],
    )
]

# Select by boolean mask (positional) and keep the original row labels in a
# new 'index' column, as reset_index() does by default.
outside_window = pd.Series(indicator, index=df.index, dtype=bool)
filtered_df = df[~outside_window].reset_index()

In [37]:
# Filtered dataframe: the trips that fall inside the selected window.
filtered_df

Unnamed: 0,index,VendorID,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,...,improvement_surcharge,total_amount,PULatitude,PULongitude,DOLatitude,DOLongitude,tpep_pickup_date,tpep_pickup_time,tpep_dropoff_date,tpep_dropoff_time
0,15192,2,1.0,0.55,48,48,1,4.0,0.5,0.5,...,0.3,10.14,40.777048,-73.967596,40.777048,-73.967596,2022-06-05,00:04:07,2022-06-05,00:07:04
1,23954,2,1.0,0.66,230,48,1,5.0,0.5,0.5,...,0.3,10.56,40.756718,-73.987827,40.777048,-73.967596,2022-06-05,00:02:33,2022-06-05,00:07:24
2,26817,1,1.0,0.60,163,48,1,5.5,3.0,0.5,...,0.3,11.30,40.760102,-73.978173,40.777048,-73.967596,2022-06-05,00:01:56,2022-06-05,00:07:38
3,45107,2,5.0,0.92,100,48,1,6.0,0.5,0.5,...,0.3,12.74,40.753694,-73.990517,40.777048,-73.967596,2022-06-05,00:01:50,2022-06-05,00:08:04
4,47321,2,2.0,1.82,239,48,1,8.0,0.5,0.5,...,0.3,13.57,40.778765,-73.996498,40.777048,-73.967596,2022-06-05,00:00:37,2022-06-05,00:08:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,3294782,1,0.0,2.20,143,24,1,8.0,3.0,0.5,...,0.3,13.80,40.772319,-73.984401,40.799776,-73.967772,2022-06-05,00:02:57,2022-06-05,00:09:11
103,3353515,2,1.0,0.83,249,158,1,6.0,0.5,0.5,...,0.3,11.76,40.734186,-74.005580,40.734700,-74.004800,2022-06-05,00:01:49,2022-06-05,00:08:45
104,3376178,1,2.0,1.40,13,158,2,6.5,3.0,0.5,...,0.3,10.30,40.711017,-74.016937,40.734700,-74.004800,2022-06-05,00:02:13,2022-06-05,00:07:10
105,3398957,2,2.0,2.78,148,209,1,10.0,0.5,0.5,...,0.3,14.80,40.715936,-73.986806,40.705751,-74.002906,2022-06-05,00:00:28,2022-06-05,00:08:48


In [38]:
# First rows of the filtered trips.
filtered_df.head()

Unnamed: 0,index,VendorID,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,...,improvement_surcharge,total_amount,PULatitude,PULongitude,DOLatitude,DOLongitude,tpep_pickup_date,tpep_pickup_time,tpep_dropoff_date,tpep_dropoff_time
0,15192,2,1.0,0.55,48,48,1,4.0,0.5,0.5,...,0.3,10.14,40.777048,-73.967596,40.777048,-73.967596,2022-06-05,00:04:07,2022-06-05,00:07:04
1,23954,2,1.0,0.66,230,48,1,5.0,0.5,0.5,...,0.3,10.56,40.756718,-73.987827,40.777048,-73.967596,2022-06-05,00:02:33,2022-06-05,00:07:24
2,26817,1,1.0,0.6,163,48,1,5.5,3.0,0.5,...,0.3,11.3,40.760102,-73.978173,40.777048,-73.967596,2022-06-05,00:01:56,2022-06-05,00:07:38
3,45107,2,5.0,0.92,100,48,1,6.0,0.5,0.5,...,0.3,12.74,40.753694,-73.990517,40.777048,-73.967596,2022-06-05,00:01:50,2022-06-05,00:08:04
4,47321,2,2.0,1.82,239,48,1,8.0,0.5,0.5,...,0.3,13.57,40.778765,-73.996498,40.777048,-73.967596,2022-06-05,00:00:37,2022-06-05,00:08:35


In [39]:
import folium
from branca.element import Figure

# Coordinate columns of the filtered trips.
PULatitude, PULongitude = filtered_df['PULatitude'], filtered_df['PULongitude']
DOLatitude, DOLongitude = filtered_df['DOLatitude'], filtered_df['DOLongitude']

# Figure that hosts the map all trips and markers are added to.
fig = Figure(height=550, width=750)
m = folium.Map(
    location=[40.730610, -73.935242],
    tiles='cartodbpositron',
    zoom_start=14,
)
fig.add_child(m)

def getAttribute(atrName, attributes):
    """Return the value of column ``atrName`` from one row of values.

    ``attributes`` holds one value per column of ``filtered_df``, in column
    order; the value at the position of ``atrName`` is returned.
    """
    column_names = list(filtered_df.columns)
    position = column_names.index(atrName)
    return attributes[position]

def plotCurve(curve, attributes, tripNum):
    """Add one trip to the map ``m`` as an orange line with a details popup.

    Args:
        curve: List of points making up the line.
        attributes: Row values of the trip, in ``filtered_df`` column order.
        tripNum: Number appended to the layer name.
    """
    # (popup label, dataframe column) pairs in display order.
    popup_fields = [
        ("Vendor ID", "VendorID"),
        ("Passenger count", "passenger_count"),
        ("Pickup date", "tpep_pickup_date"),
        ("Dropoff date", "tpep_dropoff_date"),
        ("Pickup time", "tpep_pickup_time"),
        ("Dropoff time", "tpep_dropoff_time"),
        ("Trip distance", "trip_distance"),
        ("Fare amount", "fare_amount"),
        ("Tip amount", "tip_amount"),
        ("Total amount", "total_amount"),
    ]
    infoString = "<br>".join(
        label + " --> " + str(getAttribute(column, attributes))
        for label, column in popup_fields
    )

    popup = folium.Popup(folium.IFrame(infoString), min_width=210, max_width=500)

    # Each trip gets its own feature group, named after its end points and
    # trip number.
    layer_name = "Trip_" + str(curve[0]) + "_" + str(curve[-1]) + "_" + str(tripNum)
    f = folium.FeatureGroup(layer_name)
    folium.vector_layers.PolyLine(
        curve,
        popup=popup,
        tooltip='Click to see path details',
        color='orange',
        weight=1.5,
    ).add_to(f)
    f.add_to(m)

# Group the filtered trips by their (pick-up, drop-off) coordinate pair.
#   points   - text form of every distinct pick-up / drop-off location
#   tripDict - "<pick-up>:<drop-off>" -> [trip count, row values, row values, ...]
points = set()
tripDict = {}
for i in range(len(filtered_df)):
    # Cast to plain floats: under NumPy >= 2 the text form of a list of NumPy
    # scalars reads "np.float64(...)", which cannot be parsed back into
    # numbers when the markers are drawn.
    point1 = [float(PULatitude[i]), float(PULongitude[i])]
    point2 = [float(DOLatitude[i]), float(DOLongitude[i])]
    points.add(str(point1))
    points.add(str(point2))

    trip_key = str(point1) + ":" + str(point2)
    trips = tripDict.setdefault(trip_key, [0])
    trips[0] += 1
    trips.append([filtered_df[column][i] for column in filtered_df.columns])

# Draw every trip as a curved line. Trips that share the same pick-up and
# drop-off points get offsets spread evenly over [-0.1, 0.1] so their lines
# do not overlap.
column_names = list(filtered_df.columns)
pu_lat_pos = column_names.index('PULatitude')
pu_lon_pos = column_names.index('PULongitude')
do_lat_pos = column_names.index('DOLatitude')
do_lon_pos = column_names.index('DOLongitude')

for trips in tripDict.values():
    # trips[0] is the trip count; the remaining items are the row values.
    offsets = np.linspace(-0.1, 0.1, trips[0])
    for tripNum, (frac, attributes) in enumerate(zip(offsets, trips[1:]), start=1):
        if frac != 0:
            # The sign of the offset selects the side the curve bends to.
            direction = int(frac / abs(frac))
        else:
            # A zero offset would give a straight line; bend it slightly.
            frac = 0.01
            direction = 1
        frac = abs(frac)

        # Plain floats keep the layer names free of NumPy scalar reprs.
        point1 = [float(attributes[pu_lat_pos]), float(attributes[pu_lon_pos])]
        point2 = [float(attributes[do_lat_pos]), float(attributes[do_lon_pos])]
        #print("Plotting: "+str(point1)+" --> "+str(point2))
        dist = np.linalg.norm(np.array(point1) - np.array(point2))
        curve = getCurve(point1, point2, dFrac=frac, curve=[point1, point2], curveDir=direction, addPosition='right', threshold=0.00097*dist)
        plotCurve(curve, attributes, tripNum)
# Put a marker on every distinct pick-up / drop-off location; its tooltip and
# popup both show the coordinates.
for point in points:
    coordinates = strToList(point)
    label = str(coordinates[0]) + ", " + str(coordinates[1])
    popup = folium.Popup(folium.IFrame(label), min_width=180, max_width=180)
    folium.Marker(location=coordinates, popup=popup, tooltip=label).add_to(m)

# Layer switcher for the per-trip feature groups, then display the map.
folium.LayerControl().add_to(m)
m
