In [4]:
import pandas as pd
from tqdm import tqdm
import os
import pickle
import numpy as np

from thesis_library import read_data, plant_anomalies

# Which monthly yellow-taxi file the notebook works on.
YEAR = 2012
MONTH = 1

# Placeholder for the preprocessed trips. While it is None, the edge-list cell
# loads the previously pickled edge list instead of rebuilding it from the frame.
df = None

#### VendorID:
- 1= Creative Mobile Technologies LLC
- 2= VeriFone Inc.

#### Trip distance (in miles)

#### RatecodeID?
- 1= Standard rate
- 2=JFK
- 3=Newark
- 4=Nassau or Westchester
- 5=Negotiated fare
- 6=Group ride

#### LocationID:
TLC Taxi Zone in which the taximeter was engaged/disengaged

#### Payment type (simplified by me):
- 0= unknown or no charge
- 1= card
- 2= cash

#### tip_amount - cash tips not included

#### Pickup and dropoff times are already parsed as pandas `Timestamp` objects (`pandas._libs.tslibs.timestamps.Timestamp`)

In [18]:
def read_and_preprocess_the_dataset(year: int, month: int) -> pd.DataFrame:
    '''Load one month of yellow-taxi trips and delete erroneous rows.

    Reads ``./data/Taxi/yellow_tripdata_<year>-<MM>.parquet`` and keeps only rows that
    - have no missing value in the passenger/distance/location/fare columns,
    - have a trip duration strictly between 0 and 1 day,
    - were picked up inside the requested ``year``/``month`` (``pickup_quarter`` is a
      15-minute index *within the month*, so trips stamped in another month would
      otherwise be aliased onto unrelated time buckets),
    - have strictly positive passenger count, distance, fare and total amount.

    Parameters
    ----------
    year, month : int
        Must be one of the (year, month) pairs listed in the assertion below.

    Returns
    -------
    pd.DataFrame
        Columns ``pickup_time``, ``dropoff_time``, ``PULocationID``, ``DOLocationID``,
        ``passenger_count``, ``pickup_day`` (day of month) and ``pickup_quarter``
        (0-based index of the 15-minute slot since the start of the month),
        with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If ``(year, month)` is not one of the supported pairs.
    '''

    assert (year, month) in [(2012, 1), (2020, 1), (2023, 1), (2023, 9), (2023, 11)], \
    "(2012, 1), (2020, 1), (2023, 1), (2023, 9) or (2023, 11)"

    # Format the month in place instead of rebinding the int parameter to a str.
    filename = f'./data/Taxi/yellow_tripdata_{year}-{month:02d}.parquet'

    #The dataset is quite large, so we'll preprocess it in one go to speed it up
    print("Reading data.")
    required_columns = ['passenger_count', 'trip_distance', 'PULocationID',
                        'DOLocationID', 'fare_amount', 'total_amount']
    df = (
        pd.read_parquet(filename, engine='fastparquet')
        .dropna(subset=required_columns)
        .drop(['store_and_fwd_flag'], axis=1)
        .rename(columns={'tpep_pickup_datetime': 'pickup_time',
                         'tpep_dropoff_datetime': 'dropoff_time'})
    )

    previous_size = df.shape[0]
    print("Preprocessing the data.")

    #Dealing with the trip duration (kept as a local Series; it is not part of the result):
    duration = df['dropoff_time'] - df['pickup_time']
    mask_duration = (duration > pd.Timedelta(0)) & (duration < pd.Timedelta(days=1))

    #Dropping trips whose pickup timestamp lies outside the requested month:
    pickup = df['pickup_time'].dt
    mask_in_month = (pickup.year == year) & (pickup.month == month)

    #Dealing with negative (and zero) values:
    positive_columns = ['passenger_count', 'trip_distance', 'fare_amount', 'total_amount']
    mask_all_positive = (df[positive_columns] > 0).all(axis=1)

    #"Extracting crucial info"
    # A single .loc selection followed by reset_index yields an independent frame,
    # so the column assignments below never write into a slice of another frame.
    kept_columns = ['pickup_time', 'dropoff_time', 'PULocationID', 'DOLocationID', 'passenger_count']
    df = df.loc[mask_duration & mask_in_month & mask_all_positive, kept_columns].reset_index(drop=True)

    # Vectorised datetime accessors replace the row-wise .apply(lambda ...) calls;
    # the cast keeps the int64 dtype that the element-wise version produced.
    pickup = df['pickup_time'].dt
    df['pickup_day'] = pickup.day.astype('int64')
    minutes_into_month = (pickup.day - 1) * 24 * 60 + pickup.hour * 60 + pickup.minute
    df['pickup_quarter'] = (minutes_into_month // 15).astype('int64')

    # Note: a weekday name column can be derived later with df['pickup_time'].dt.day_name().

    print("Removed", previous_size - df.shape[0], " rows.")

    return df

In [19]:
# Build the cleaned trip table for the configured month; the bare name on the
# last line makes the notebook render the (truncated) frame.
df = read_and_preprocess_the_dataset(year=YEAR, month=MONTH)
df

Reading data.
Preprocessing the data.
Removed 343909  rows.


Unnamed: 0,pickup_time,dropoff_time,PULocationID,DOLocationID,passenger_count,pickup_day,pickup_quarter
0,2012-01-01 00:07:56,2012-01-01 00:12:09,158,231,1,1,0
1,2012-01-01 00:18:49,2012-01-01 00:30:01,231,164,1,1,1
2,2012-01-01 00:31:38,2012-01-01 00:46:05,164,148,1,1,2
3,2012-01-01 00:47:35,2012-01-01 00:55:57,148,107,4,1,3
4,2012-01-01 00:57:08,2012-01-01 01:02:42,107,107,3,1,3
...,...,...,...,...,...,...,...
12714434,2012-01-31 23:43:30,2012-01-31 23:57:10,162,231,1,31,2974
12714435,2012-01-31 23:04:26,2012-01-31 23:23:21,148,50,1,31,2972
12714436,2012-01-31 23:58:54,2012-02-01 00:05:33,237,75,1,31,2975
12714437,2012-01-31 23:10:40,2012-01-31 23:19:57,13,68,1,31,2972


In [5]:
# Build (or reload) the un-planted edge list X and its all-normal label vector y.
# Each edge is [pickup zone, drop-off zone, 15-minute slot index].
taxi_dir = './data/Taxi'
month = f'{MONTH:02d}'  # zero-padded month used in the cache file names
filename_X = 'Taxi_' + str(YEAR) + '_' + month + '_unplanted_edges.txt'
filename_y = 'Taxi_' + str(YEAR) + '_' + month + '_unplanted_labels.txt'
path_X = os.path.join(taxi_dir, filename_X)

if df is not None:
    # Optional, for quick experiments on a subset of the month:
    #df = df[df['pickup_quarter'] <= 1000]
    X = [list(edge) for edge in zip(df['PULocationID'], df['DOLocationID'], df['pickup_quarter'])]

    #Pickling (saving) the list to the disk only if that's a new file:
    # (an existing cache is deliberately left untouched; delete it by hand to refresh)
    if not os.path.exists(path_X):
        os.makedirs(taxi_dir, exist_ok=True)
        with open(path_X, 'wb') as fp:
            pickle.dump(X, fp)
else:
    #Loading the saved file:
    if not os.path.exists(path_X):
        raise FileNotFoundError(
            path_X + " not found: run the preprocessing cell first so that df is not None."
        )
    # NOTE: pickle.load executes arbitrary code from the file; only load caches
    # that this notebook wrote itself.
    with open(path_X, 'rb') as fp:
        # This branch yields an ndarray while the branch above yields a list of lists;
        # downstream code must accept both (len() and np.array() do).
        X = np.array(pickle.load(fp))

# Every original edge is labelled 0 (normal) before anomalies are planted.
y = [0] * len(X)

In [23]:
#Plant a handful of cliques - guide
# Sizes are derived from len(X) instead of being hard-coded, so the printed plan
# stays consistent with the data actually loaded. Compare the resulting clique
# count with the n_imputations value passed to plant_anomalies.
target_edges = len(X) // 100                       # 1% of the edges
edges_per_clique = 17 * (17 - 1) // 2              # n(n-1)/2 for n = 17 vertices
planted_edges_per_clique = 10 * edges_per_clique   # every edge repeated 10 times
# Ceiling division: smallest number of cliques that reaches the 1% target.
n_cliques = -(-target_edges // planted_edges_per_clique)

print("We want 1% of the data to be anomalous, so a bit over", target_edges, "edges to be planted.")
print("Since a clique has 1/2(n^2-n) edges per n vertices, we can use 17 nodes for", edges_per_clique, "edges.")
print("Then, we will repeat every edge 10 times for", planted_edges_per_clique, "edges per clique.")
print("We will plant them in", n_cliques, "cliques of", planted_edges_per_clique,
      "edges to give us", n_cliques * planted_edges_per_clique, "edges.")

We want 1% of the data to be anomalous, so a bit over 38560 edges to be planted.
We will plant them in 29 cliques of 1360 edges to give us 39440 edges.
Since a clique has 1/2(n^2-n) edges per n vertices, we need 17 vertices for 136 edges
Then, we will repeat every edge 10 times for 1360 edges.


In [24]:
# Plant clique anomalies into the Taxi edge list: 29 cliques of 17 vertices,
# each clique edge repeated 10 times.
plant_anomalies(
    np.array(X),
    y,
    dataset='Taxi',
    n_imputations=29,
    n_vertices=17,
    n_repetitions=10,
    anomaly_type='clique',
)

Planting an anomaly at timestamp: 71
Planting an anomaly at timestamp: 161
Planting an anomaly at timestamp: 257
Planting an anomaly at timestamp: 287
Planting an anomaly at timestamp: 289
Planting an anomaly at timestamp: 317
Planting an anomaly at timestamp: 420
Planting an anomaly at timestamp: 461
Planting an anomaly at timestamp: 485
Planting an anomaly at timestamp: 487
Planting an anomaly at timestamp: 494
Planting an anomaly at timestamp: 496
Planting an anomaly at timestamp: 500
Planting an anomaly at timestamp: 532
Planting an anomaly at timestamp: 546
Planting an anomaly at timestamp: 547
Planting an anomaly at timestamp: 557
Planting an anomaly at timestamp: 563
Planting an anomaly at timestamp: 582
Planting an anomaly at timestamp: 620
Planting an anomaly at timestamp: 665
Planting an anomaly at timestamp: 705
Planting an anomaly at timestamp: 705
Planting an anomaly at timestamp: 706
Planting an anomaly at timestamp: 721
Planting an anomaly at timestamp: 740
Planting an a