# 4. Prepare discrete classifier data

In [1]:
import pandas as pd
import numpy as np
import datetime
import calendar
import os

In [2]:
# Directories this notebook writes its output CSVs into.
output_relative_dirs = ['../data/curated/cleaned/discrete']

# Create each directory together with any missing parents. exist_ok=True makes
# the call a no-op when the directory is already present, so the cell can be
# re-run safely and there is no gap between an existence check and the creation.
for output_relative_dir in output_relative_dirs:
    os.makedirs(output_relative_dir, exist_ok=True)

# SCRIPT

In [3]:
# Months processed by this notebook.
YEARS = [2016]
MONTHS = range(1, 7)  # 1..6 inclusive

# Input (per-colour aggregated trips) and output (discrete labels) locations.
TAXI_AGGREGATED_DIR = '../data/curated/taxi_aggregated'
DISCRETE_OUTPUT_DIR = '../data/curated/cleaned/discrete'

# One classifier instance is produced per (drop-off zone, 3-hour block, date).
GROUP_KEYS = ['DOLocationID', 'hour', 'rounded_lepep_pickup_datetime']
LABEL_COLUMNS = ['DOLocationID', 'hour', 'Max_PULocationID', 'Max_PULocationIDs',
                 'max_count', 'datetime']


def build_discrete_labels(trips, year, month):
    """Label every (DOLocationID, 3-hour block, date) with its busiest PULocationID.

    Parameters
    ----------
    trips : pandas.DataFrame
        Rows with the columns 'DOLocationID', 'PULocationID', 'hour', 'count'
        and 'rounded_lepep_pickup_datetime' (anything ``pd.to_datetime`` can
        parse). 'hour' is presumably the hour of day (0-23) -- TODO confirm
        against the notebook that writes the aggregated CSVs. The frame is not
        modified.
    year, month : int
        Calendar month to keep; rows dated outside it are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``LABEL_COLUMNS``. 'hour' holds the original hour floor-divided
        by 3, 'max_count' the largest summed 'count' in the group,
        'Max_PULocationID' a pickup zone reaching that count and
        'Max_PULocationIDs' the list of all zones reaching it. When several
        zones tie, one row is emitted per tied zone. Rows are ordered by
        DOLocationID, hour block, date and then PULocationID.

    NOTE(review): the earlier implementation appended rows onto an empty frame,
    which could upcast the ID/count columns to float in the written CSV. Here
    they keep the dtype produced by the aggregation -- confirm downstream
    readers do not depend on float-formatted IDs.
    """
    first_day = datetime.date(year, month, 1)
    # monthrange returns (weekday of the 1st, number of days); it accounts for
    # leap years in any year, not only 2016.
    last_day = datetime.date(year, month, calendar.monthrange(year, month)[1])

    # Reduce timestamps to calendar dates and hours to 3-hour block labels.
    blocks = trips.assign(
        rounded_lepep_pickup_datetime=pd.to_datetime(
            trips['rounded_lepep_pickup_datetime']).dt.date,
        hour=trips['hour'] // 3,
    )

    # Total pickups per (drop-off zone, block, date, pickup zone).
    pickups = blocks.groupby(GROUP_KEYS + ['PULocationID'],
                             as_index=False)['count'].sum()

    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for (dropoff_id, hour_block, pickup_date), group in pickups.groupby(GROUP_KEYS):
        # Remove any dates that are outside this month.
        if not first_day <= pickup_date <= last_day:
            continue

        max_count = group['count'].max()
        tied_pickups = group.loc[group['count'] == max_count, 'PULocationID'].tolist()

        # If multiple maxes, create multiple instances with the same attribute
        # values but different labels.
        for pickup_id in tied_pickups:
            records.append({
                'DOLocationID': dropoff_id,
                'hour': hour_block,
                'Max_PULocationID': pickup_id,
                'Max_PULocationIDs': tied_pickups,
                'max_count': max_count,
                'datetime': pickup_date,
            })

    # Filling unobserved (date, block, DOLocation) combinations with a
    # placeholder label was tried in an earlier draft and later discarded.
    return pd.DataFrame(records, columns=LABEL_COLUMNS)


# In a notebook kernel __name__ is "__main__", so this still runs when the cell
# is executed; the guard only keeps build_discrete_labels importable without
# needing the data files on disk.
if __name__ == "__main__":
    for year in YEARS:
        for month in MONTHS:

            print(f"BEGIN PROCESSING {year}-{month}")

            stamp = f'{year}-{str(month).zfill(2)}'

            # load both datasets and stack them (yellow and green together)
            yellow_data = pd.read_csv(f'{TAXI_AGGREGATED_DIR}/yellow_{stamp}.csv')
            green_data = pd.read_csv(f'{TAXI_AGGREGATED_DIR}/green_{stamp}.csv')
            trips = pd.concat([yellow_data, green_data], ignore_index=True)

            labels = build_discrete_labels(trips, year, month)

            # output
            labels.to_csv(f'{DISCRETE_OUTPUT_DIR}/{stamp}_discrete.csv', index=False)

BEGIN PROCESSING 2016-1
BEGIN PROCESSING 2016-2
BEGIN PROCESSING 2016-3
BEGIN PROCESSING 2016-4
BEGIN PROCESSING 2016-5
BEGIN PROCESSING 2016-6


# BUILDING UP THE SCRIPT (DRAFTING LEGACY)

Load both datasets

In [4]:
# # load both datasets
# yellow_data = pd.read_csv('../data/curated/taxi_csv/yellow_2016-01.csv')
# green_data = pd.read_csv('../data/curated/taxi_csv/green_2016-01.csv')
# data = yellow_data.append(green_data)

Convert the hours into labels that group them into 3-hour blocks

In [5]:
# # get the hours into labels that works in groups of 3
# data['rounded_lepep_pickup_datetime'] = pd.to_datetime(data['rounded_lepep_pickup_datetime']).dt.date
# data['hour'] = data['hour']//3
# data

In [6]:
# data.sort_values(['rounded_lepep_pickup_datetime', 'DOLocationID', 'hour', 'PULocationID'])

Group into 3 hour blocks (both yellow and green)

In [7]:
# # Group into 3 hour blocks (both yellow and green)
# agg_data = data.groupby(['DOLocationID', 'hour', 'PULocationID']).agg({'rounded_lepep_pickup_datetime':min, 'count':sum}).reset_index()

In [8]:
# agg_data.sort_values(['rounded_lepep_pickup_datetime', 'DOLocationID', 'hour', 'PULocationID'])

Get labels: maximum label per week

In [9]:
# # Get labels: maximum labels per week
# max_PULocationID_data = pd.DataFrame({'DOLocationID': [], 'hour': [], 'Max_PULocationID': [], 'Max_PULocationIDs':[], 'max_count':[], 'rounded_lepep_pickup_datetime': []})
#
# for id, agg_data in agg_data.groupby(['DOLocationID', 'hour', 'rounded_lepep_pickup_datetime']):
#
#     agg_data.index = range(len(agg_data))
#
#     max_count = max(agg_data['count'])
#     max_count_rows = agg_data[agg_data['count'] == max_count]
#     max_PULocationIDs = list(max_count_rows['PULocationID'])
#
#     # if multiple maxes, then give both
#     for max_PULocationID in max_PULocationIDs:
#         tmp = pd.DataFrame({'DOLocationID': [id[0]], 'hour': [id[1]],
#                         'Max_PULocationID': [max_PULocationID], 'Max_PULocationIDs': [max_PULocationIDs], 'max_count': [max(agg_data['count'])], 'rounded_lepep_pickup_datetime': [agg_data.loc[0]['rounded_lepep_pickup_datetime']]})
#
#         max_PULocationID_data = max_PULocationID_data.append(tmp)
#
# max_PULocationID_data.index = range(len(max_PULocationID_data))
# max_PULocationID_data

Remove any dates that are outside this month

In [10]:
# # Remove any dates that are outside this month
# max_PULocationID_data = max_PULocationID_data[(max_PULocationID_data['rounded_lepep_pickup_datetime'] <= datetime.date(year=2016,month=1, day=31)) & (max_PULocationID_data['rounded_lepep_pickup_datetime'] >= datetime.date(year=2016,month=1, day=1))]

In [11]:
# max_PULocationID_data

Fill each date-hour-DOLocation combination that has no records with a PULocation of 0, denoting a missing id

In [12]:
# # Fill date-hour-DOLocation with no records with PUlocation of 0, denoting missing id
# cal = calendar.Calendar()
# date_list = [date_ for date_ in cal.itermonthdates(year = 2016, month = 1) if date_.year == 2016 and date_.month == 1]
#
# for dolocation in range(1, 264):
#     for hour in range(8):
#         for date_ in date_list:
#             if len(max_PULocationID_data[(max_PULocationID_data['DOLocationID'] == dolocation) & (max_PULocationID_data['hour'] == hour) & (max_PULocationID_data['rounded_lepep_pickup_datetime'] == date_)]) == 0:
#
#                 tmp = pd.DataFrame({'DOLocationID': [dolocation], 'hour': [hour], 'Max_PULocationID': [0], 'Max_PULocationIDs': [[0]], 'max_count': [0], 'rounded_lepep_pickup_datetime': [date_]})
#                 max_PULocationID_data = max_PULocationID_data.append(tmp)
# max_PULocationID_data

In [13]:
# max_PULocationID_data.index = range(len(max_PULocationID_data))
# max_PULocationID_data = max_PULocationID_data.rename(columns = {'rounded_lepep_pickup_datetime': 'datetime'})

In [14]:
# max_PULocationID_data.sort_values(['datetime', 'hour', 'DOLocationID'])