# Yellow Taxi Trip Data (New York City, June 2022)

Map of pick-up and drop-off zones or location IDs:

https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

In [13]:
import pandas as pd

In [14]:
# #NOTE: We might need to install "pyarrow" engine for the following code to work... Use "pip install pyarrow"
# parquet_file = "yellow_tripdata_2022-06.parquet"
# df = pd.read_parquet(parquet_file, engine='auto')

In [15]:
# Load the trip records and give the four pick-up / drop-off coordinate
# columns descriptive names in a single rename.
df = pd.read_csv("yellow_tripdata_2022-06.csv").rename(
    columns={
        'latPU': 'PULatitude',
        'lonPU': 'PULongitude',
        'latDO': 'DOLatitude',
        'lonDO': 'DOLongitude',
    }
)

In [16]:
df.loc[:20]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,PULatitude,PULongitude,DOLatitude,DOLongitude
0,1,2022-06-01 00:25:41,2022-06-01 00:48:22,1.0,11.0,70,48,1,32.0,3.0,0.5,2.0,6.55,0.3,44.35,40.761212,-73.865136,40.777048,-73.967596
1,1,2022-06-01 00:10:17,2022-06-01 00:34:46,1.0,12.4,70,48,1,36.0,3.0,0.5,9.0,6.55,0.3,55.35,40.761212,-73.865136,40.777048,-73.967596
2,2,2022-06-01 09:18:54,2022-06-01 09:59:52,1.0,10.62,70,48,1,34.5,0.0,0.5,5.65,6.55,0.3,50.0,40.761212,-73.865136,40.777048,-73.967596
3,2,2022-06-01 10:57:55,2022-06-01 12:01:37,1.0,12.06,70,48,2,47.0,0.0,0.5,0.0,6.55,0.3,56.85,40.761212,-73.865136,40.777048,-73.967596
4,2,2022-06-01 12:10:46,2022-06-01 12:52:32,1.0,10.97,70,48,1,37.0,0.0,0.5,9.37,6.55,0.3,56.22,40.761212,-73.865136,40.777048,-73.967596
5,2,2022-06-01 12:55:59,2022-06-01 13:31:29,1.0,9.46,70,48,1,33.0,0.0,0.5,8.57,6.55,0.3,51.42,40.761212,-73.865136,40.777048,-73.967596
6,2,2022-06-01 13:38:40,2022-06-01 14:21:48,1.0,9.29,70,48,1,36.0,0.0,0.5,9.17,6.55,0.3,56.27,40.761212,-73.865136,40.777048,-73.967596
7,1,2022-06-01 13:16:28,2022-06-01 13:48:52,0.0,9.1,70,48,1,30.5,2.5,0.5,10.05,6.55,0.3,50.4,40.761212,-73.865136,40.777048,-73.967596
8,2,2022-06-01 15:42:56,2022-06-01 16:32:21,1.0,8.99,70,48,2,36.5,0.0,0.5,0.0,6.55,0.3,47.6,40.761212,-73.865136,40.777048,-73.967596
9,2,2022-06-01 16:45:28,2022-06-01 17:41:15,2.0,11.49,70,48,2,40.0,1.0,0.5,0.0,6.55,0.3,52.1,40.761212,-73.865136,40.777048,-73.967596


In [17]:
len(df)

3496580

In [18]:
df.isna().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
PULatitude               0
PULongitude              0
DOLatitude               0
DOLongitude              0
dtype: int64

In [19]:
df.nunique()

VendorID                       3
tpep_pickup_datetime     1709403
tpep_dropoff_datetime    1707818
passenger_count               10
trip_distance               3818
PULocationID                 258
DOLocationID                 258
payment_type                   5
fare_amount                 7375
extra                         53
mta_tax                       11
tip_amount                  3203
tolls_amount                 689
improvement_surcharge          3
total_amount               12618
PULatitude                   210
PULongitude                  210
DOLatitude                   210
DOLongitude                  210
dtype: int64

In [20]:
df["PULocationID"].value_counts()

237    170644
132    164191
236    148638
161    139664
142    117339
        ...  
199         2
27          2
176         2
251         1
109         1
Name: PULocationID, Length: 258, dtype: int64

# Pre-processing:

In [21]:
def getTime(x):
    """Return the second space-separated field of ``str(x)`` (the clock time of a "date time" string)."""
    fields = str(x).split(" ")
    return fields[1]
def getDate(x):
    """Return the first space-separated field of ``str(x)`` (the calendar date of a "date time" string)."""
    fields = str(x).split(" ")
    return fields[0]

# Replace each combined "<prefix>_datetime" column with two string columns,
# "<prefix>_date" and "<prefix>_time" (pick-up first, then drop-off, so the
# new columns are appended in that order).
for prefix in ('tpep_pickup', 'tpep_dropoff'):
    date_time = df[prefix + '_datetime']
    df.drop(columns=[prefix + '_datetime'], inplace=True)

    df[prefix + '_date'] = date_time.apply(getDate)
    df[prefix + '_time'] = date_time.apply(getTime)

In [22]:
def lessThanOrEqualToDate(date1, date2):
    """
    :param date1: A date string whose first three "-"-separated fields are year, month, day
    :param date2: A date string in the same layout
    :returns    : True if date1 is on or before date2, False otherwise
    """
    def as_key(text):
        # Parse to integers so that "2022-06-1" and "2022-06-01" compare equal.
        fields = text.split("-")
        return (int(fields[0]), int(fields[1]), int(fields[2]))

    # Tuples compare lexicographically: year first, then month, then day.
    return as_key(date1) <= as_key(date2)
    
def lessThanOrEqualToTime(time1, time2):
    """
    :param time1: A time string whose first three ":"-separated fields are hour, minute, second
    :param time2: A time string in the same layout
    :returns    : True if time1 is at or before time2, False otherwise
    """
    def as_key(text):
        fields = text.split(":")
        return (int(fields[0]), int(fields[1]), int(fields[2]))

    # Tuples compare lexicographically: hour first, then minute, then second.
    return as_key(time1) <= as_key(time2)

In [23]:
# Keep only the trips whose drop-off date is on or before the cut-off date AND
# whose drop-off clock time is at or before the cut-off time; drop the rest.
#
# The comparison is vectorized: the previous per-row Python loop performed two
# label lookups (`df[col][i]`) for every row, which is very slow on millions of
# rows and only lines up with the positional mask when the index is 0..n-1.
#
# NOTE(review): date and clock time are tested independently, so a drop-off on
# an EARLIER date with a clock time after 15:30 is also removed. If a single
# "2022-06-01 15:30:00" datetime cut-off was intended, this condition needs to
# change -- confirm the intent before relying on the filtered data.
cutoff_date = pd.Timestamp('2022-06-01')
cutoff_time = pd.Timedelta('15:30:00')

dropoff_dates = pd.to_datetime(df['tpep_dropoff_date'], format='%Y-%m-%d')
dropoff_times = pd.to_timedelta(df['tpep_dropoff_time'])

# `indicator` is True for the rows to remove (same meaning as before).
indicator = (~((dropoff_dates <= cutoff_date) & (dropoff_times <= cutoff_time))).to_numpy()

df.drop(axis="rows", labels=df.index[indicator], inplace=True)

In [24]:
len(df)

57262

In [25]:
df.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,...,improvement_surcharge,total_amount,PULatitude,PULongitude,DOLatitude,DOLongitude,tpep_pickup_date,tpep_pickup_time,tpep_dropoff_date,tpep_dropoff_time
0,1,1.0,11.0,70,48,1,32.0,3.0,0.5,2.0,...,0.3,44.35,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,00:25:41,2022-06-01,00:48:22
1,1,1.0,12.4,70,48,1,36.0,3.0,0.5,9.0,...,0.3,55.35,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,00:10:17,2022-06-01,00:34:46
2,2,1.0,10.62,70,48,1,34.5,0.0,0.5,5.65,...,0.3,50.0,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,09:18:54,2022-06-01,09:59:52
3,2,1.0,12.06,70,48,2,47.0,0.0,0.5,0.0,...,0.3,56.85,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,10:57:55,2022-06-01,12:01:37
4,2,1.0,10.97,70,48,1,37.0,0.0,0.5,9.37,...,0.3,56.22,40.761212,-73.865136,40.777048,-73.967596,2022-06-01,12:10:46,2022-06-01,12:52:32


In [26]:
df.columns

Index(['VendorID', 'passenger_count', 'trip_distance', 'PULocationID',
       'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
       'PULatitude', 'PULongitude', 'DOLatitude', 'DOLongitude',
       'tpep_pickup_date', 'tpep_pickup_time', 'tpep_dropoff_date',
       'tpep_dropoff_time'],
      dtype='object')

In [27]:
# Write the filtered frame to "ARX.csv" without the index column.
# NOTE(review): presumably the input file for the ARX anonymization tool -- confirm.
df.to_csv("ARX.csv", index=False)

# Computing current t-closeness

In [None]:
# Role of each column in the anonymization analysis.
# Columns treated as categorical when computing spans and splitting partitions.
categorical = ['tpep_pickup_date','tpep_pickup_time','tpep_dropoff_time','tpep_dropoff_date','PULocationID','DOLocationID']
# Columns whose value distribution is checked for t-closeness.
sensitive = ['PULocationID','DOLocationID']
# Fixed: was 'imporvement_surcharge', which matched no column, so
# 'improvement_surcharge' was silently left among the quasi-identifiers.
insensitive = ['extra','mta_tax','improvement_surcharge']
identifying = ['PULatitude','PULongitude','DOLatitude','DOLongitude']

# Everything not assigned a role above is a quasi-identifier. Built in the
# frame's column order so the result is the same on every run (a set
# difference has no guaranteed order).
_assigned_columns = set(categorical) | set(sensitive) | set(insensitive) | set(identifying)
quasi_identifying = [column for column in df.columns if column not in _assigned_columns]

In [None]:
def split(df, partition, column):
    """
    Split a partition in two along one column.

    Columns listed in the module-level `categorical` are split by distinct
    values (first half of the unique values vs. the rest); all other columns
    are split at the median (strictly below vs. at-or-above).

    :param        df: The dataframe to split
    :param partition: The partition (index labels) to split
    :param    column: The column along which to split
    :        returns: A pair (left, right) of indexes covering the partition
    """
    series = df[column][partition]

    if column not in categorical:
        pivot = series.median()
        below = series.index[series < pivot]
        at_or_above = series.index[series >= pivot]
        return (below, at_or_above)

    distinct = series.unique()
    half = len(distinct) // 2
    left_values = set(distinct[:half])
    right_values = set(distinct[half:])
    return series.index[series.isin(left_values)], series.index[series.isin(right_values)]

In [None]:
def get_spans(df, partition, categorical, scale=None):
    """
    Compute the span of every column of `df` within one partition.

    :param          df: the dataframe for which to calculate the spans
    :param   partition: the partition (index labels) for which to calculate the spans
    :param categorical: collection of column names to treat as categorical; their
                        span is the number of distinct values, all other columns
                        use max - min
    :param       scale: if given, the spans of each column will be divided
                        by the value in `scale` for that column
    :          returns: A dict mapping column name to its span in the partition
    """
    spans = {}
    for column in df.columns:
        # Slice once per column instead of once per aggregate.
        values = df[column][partition]
        if column in categorical:
            span = len(values.unique())
        else:
            # The leftover debug print of every numeric column name was removed:
            # this function runs once per partition, so it flooded the output.
            span = values.max() - values.min()
        if scale is not None:
            span = span / scale[column]
        spans[column] = span
    return spans

In [None]:
# Unscaled spans of every column over the whole frame; passed later as the
# `scale` argument of partition_dataset.
full_spans = get_spans(df,df.index,categorical,scale=None)

VendorID
passenger_count
trip_distance
payment_type
fare_amount
extra
mta_tax
tip_amount
tolls_amount
improvement_surcharge
total_amount
PULatitude
PULongitude
DOLatitude
DOLongitude


In [None]:
def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """
    Repeatedly split the dataframe until no partition can be split further
    without producing an invalid half.

    :param               df: The dataframe to be partitioned.
    :param  feature_columns: A list of column names along which to partition the dataset.
    :param sensitive_column: The name of the sensitive column (to be passed on to the `is_valid` function)
    :param            scale: The column spans as generated before.
    :param         is_valid: A function that takes a dataframe, a partition and the sensitive column
                             and returns True if the partition is valid.
    :returns               : A list of valid partitions that cover the entire dataframe.
    """
    finished_partitions = []
    partitions = [df.index]
    while partitions:
        partition = partitions.pop(0)
        # Fixed: `scale` used to be passed positionally into get_spans'
        # `categorical` parameter, so the spans were never scaled and every
        # column was treated as categorical (membership test on the dict keys).
        # The module-level `categorical` list is what `split` consults as well.
        spans = get_spans(df[feature_columns], partition, categorical, scale=scale)
        # Try the columns from the widest relative span to the narrowest.
        for column, span in sorted(spans.items(), key=lambda x:-x[1]):
            lp, rp = split(df, partition, column)
            if not is_valid(df, lp, sensitive_column) or not is_valid(df, rp, sensitive_column):
                continue
            partitions.extend((lp, rp))
            break
        else:
            # No column yielded two valid halves: this partition is final.
            finished_partitions.append(partition)
    return finished_partitions

In [None]:
# Relative frequency of each value of the sensitive column over the whole
# frame, as a flat {value: frequency} dict (the shape t_closeness expects).
#
# Fixed: the table used to be filled from every column in `sensitive`, all
# writing into the same dict. Columns that share values overwrote each other's
# entries, so the table matched neither column's distribution. It is now built
# only from sensitive[0], the column the partitioning step checks.
# NOTE(review): rebuild this table if a different sensitive column is checked.
total_count = float(len(df))
global_freqs = (df[sensitive[0]].value_counts() / total_count).to_dict()

In [None]:
def is_k_anonymous(df, partition, sensitive_column, k=3):
    """
    Check that a partition holds at least `k` rows.

    :param               df: The dataframe on which to check the partition (not used by this check).
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column (not used by this check).
    :param                k: The desired k
    :returns               : True if the partition has at least k entries, False otherwise.
    """
    return len(partition) >= k

In [None]:
def t_closeness(df, partition, column, global_freqs):
    """
    :param           df: The dataframe containing the partition
    :param    partition: The partition (index labels) to measure
    :param       column: The column whose value distribution is compared
    :param global_freqs: Dict mapping each value of `column` to its reference frequency
    :returns           : The largest absolute difference between a value's relative
                         frequency inside the partition and its reference frequency,
                         taken over the values present in the partition; None if
                         the partition contains no values.
    """
    partition_size = float(len(partition))
    counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    distances = [
        abs(occurrences / partition_size - global_freqs[value])
        for value, occurrences in counts.items()
    ]
    return max(distances) if distances else None

In [None]:

def is_t_close(df, partition, sensitive_column,global_freqs, p=0.2):
    """
    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance, as computed by `t_closeness`
    :returns               : True if the partition's distance is at most `p`; False
                             otherwise, including for a partition with no values
    :raises      ValueError: If `sensitive_column` is not listed in `categorical`
    """
    if sensitive_column not in categorical:
        # The stray debug print that preceded this raise was removed.
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    if distance is None:
        # t_closeness returns None for an empty partition; comparing None with
        # `p` raised TypeError. An empty partition is never a valid group.
        return False
    return distance <= p

In [None]:
def _is_valid_partition(*args):
    """Validity check handed to partition_dataset: k-anonymous (default k) and t-close (default p) against `global_freqs`."""
    if not is_k_anonymous(*args):
        return False
    return is_t_close(*args, global_freqs)

# Partition along the categorical columns, checking t-closeness on the first sensitive column.
finished_t_close_partitions = partition_dataset(df, categorical, sensitive[0], full_spans, _is_valid_partition)

In [None]:
def t_closeness(df, partition, column, global_freqs):
    """
    NOTE: this re-defines the `t_closeness` function that already appears
    earlier in this notebook, with the same behavior.

    :param           df: The dataframe containing the partition
    :param    partition: The partition (index labels) to measure
    :param       column: The column whose value distribution is compared
    :param global_freqs: Dict mapping each value of `column` to its reference frequency
    :returns           : The largest absolute difference between a value's relative
                         frequency inside the partition and its reference frequency,
                         taken over the values present in the partition; None if
                         the partition contains no values.
    """
    partition_size = float(len(partition))
    counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    distances = [
        abs(occurrences / partition_size - global_freqs[value])
        for value, occurrences in counts.items()
    ]
    return max(distances) if distances else None


def is_t_close(df, partition, sensitive_column, global_freqs, p=0.2):
    """
    NOTE: this re-defines the `is_t_close` function that already appears
    earlier in this notebook.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance, as computed by `t_closeness`
    :returns               : True if the partition's distance is at most `p`; False
                             otherwise, including for a partition with no values
    :raises      ValueError: If `sensitive_column` is not listed in `categorical`
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    if distance is None:
        # t_closeness returns None for an empty partition; comparing None with
        # `p` raised TypeError. An empty partition is never a valid group.
        return False
    return distance <= p