In [27]:
import numpy as np
import pandas as pd
from backroom import *
from datetime import datetime, timedelta


# Structure the dataset
for cv_split in range(1, 6):
    # Each fold has its own train/validation CSV pair; the pickup timestamp
    # column is parsed into datetimes while loading.
    train_df = pd.read_csv(
        f"../../data/interim/train_df_fold_{cv_split}.csv",
        parse_dates=['tpep_pickup_datetime'],
    )
    val_df = pd.read_csv(
        f"../../data/interim/val_df_fold_{cv_split}.csv",
        parse_dates=['tpep_pickup_datetime'],
    )
    datasets = [train_df, val_df]
    # The test set is only loaded (and processed) together with the last fold.
    if cv_split == 5:
        test_df = pd.read_csv(
            "../../data/interim/test_df.csv",
            parse_dates=['tpep_pickup_datetime'],
        )
        datasets.append(test_df)
    def combine_day_of_week_and_hour(dayofweek, hour):
        """Return the two values joined as the string '<dayofweek>_<hour>'."""
        return f"{dayofweek}_{hour}"
    def get_closest_hour(time):
        """
        Return the hour of day (an int, 0-23) closest to the datetime `time`.

        Only the minute component decides the direction; seconds and
        microseconds are ignored. Minutes 0-30 keep the current hour and
        minutes 31-59 move to the next one (exactly 30 stays put because
        round(0.5) is 0). Rounding up from hour 23 wraps around to 0.
        """
        top_of_hour = time.replace(minute=0, second=0, microsecond=0)
        hours_to_add = round(time.minute / 60)
        return (top_of_hour + timedelta(hours=hours_to_add)).hour
    def get_closest_hour_and_a_half(time):
        """
        Return the half-hour slot closest to `time` as the string '<hour>_<minute>'.

        `time` is a datetime-like object; only its hour and minute matter
        (seconds and microseconds are ignored). The minute part of the result
        is always 0 or 30:

        * minutes 0-15  -> '<hour>_0'
        * minutes 16-44 -> '<hour>_30'
        * minutes 45-59 -> '<next hour>_0' (hour 23 wraps around to 0)

        The ties at 15 and 45 fall that way because round() rounds halves to
        the nearest even integer (round(0.5) == 0, round(1.5) == 2).
        """
        half_hours = round(time.minute / 30)
        if half_hours == 2:
            # Rounded up to the next full hour. Adding a timedelta (instead of
            # incrementing the hour field) lets 23:45+ roll over to hour 0.
            time = time + timedelta(hours=1)
            rounded_minute = 0
        else:
            rounded_minute = half_hours * 30
        return f"{time.hour}_{rounded_minute}"

    for dataset in datasets:
        pickup_time = dataset['tpep_pickup_datetime']
        day_of_week = dataset['pickup_day_of_week']

        # Time-of-day features. The hour is read through the datetime accessor
        # rather than by slicing characters out of the timestamp's string form.
        hour = pickup_time.dt.hour
        closest_hour = pickup_time.apply(get_closest_hour)
        closest_half_hour = pickup_time.apply(get_closest_hour_and_a_half)

        # Day-of-week crossed with each time feature. The columns are paired
        # with zip() instead of a row-wise DataFrame.apply, which avoids
        # building a Series per row and formats each value from its own column.
        new_features = [
            ('hour', hour),
            ('closest_hour', closest_hour),
            ('closest_hour_and_a_half', closest_half_hour),
            ('dayofweek_plus_hour',
             [combine_day_of_week_and_hour(d, h) for d, h in zip(day_of_week, hour)]),
            ('dayofweek_plus_closest_hour',
             [combine_day_of_week_and_hour(d, h) for d, h in zip(day_of_week, closest_hour)]),
            ('dayofweek_plus_closest_hour_and_a_half',
             [combine_day_of_week_and_hour(d, h) for d, h in zip(day_of_week, closest_half_hour)]),
        ]
        # Insert each feature just before the current last column, so that
        # column stays last and the new ones keep the order listed above.
        for column_name, values in new_features:
            dataset.insert(loc=len(dataset.columns) - 1, column=column_name, value=values)
        dataset.drop(['tpep_pickup_datetime', 'pickup_time_of_day'], axis='columns', inplace=True)

        # Fix datatypes, then cap durations at 4000 seconds.
        dataset['duration_seconds'] = dataset['duration_seconds'].astype(int).clip(upper=4000)
        # Passenger counts of 1 or less are all set to 1.
        dataset['passenger_count'] = dataset['passenger_count'].clip(lower=1)

        # Feature Elimination
        # After much analysis, only id needs to be dropped.
        # Most promising features are in this order: Location, time+day of week,
        # time, passenger count, day of week, and lastly vendor id.
        # Dropping id as a feature but keeping it for lookup purposes (as the index).
        dataset.set_index('id', inplace=True)
    # Keep only trips longer than 120 seconds in every split, working on
    # independent copies of the filtered frames.
    kept = [frame.loc[frame['duration_seconds'] > 120].copy() for frame in datasets]

    # Persist the processed splits; the index is written out as a column.
    train_df, val_df = kept[0], kept[1]
    train_df.to_csv(f"../../data/interim/train_df_2_fold_{cv_split}.csv", index=True)
    val_df.to_csv(f"../../data/interim/val_df_2_fold_{cv_split}.csv", index=True)
    # The test set exists only on the last fold and is written once.
    if cv_split == 5:
        test_df = kept[2]
        test_df.to_csv("../../data/interim/test_df_2.csv", index=True)
    