In [4]:
import gc
import hashlib
import numpy as np
import pandas as pd
from backroom import *
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Structure the dataset
df = pd.read_csv("../../data/interim/df.csv")

# Columns set aside as "cheating" fields and saved to their own file, keyed by 'id'.
# NOTE(review): presumably these are only known once a trip is over and would
# leak the target if left in the modelling frame -- confirm.
cheating_fields = df[['id', # UID
    'trip_distance', # High analytic value
    'RatecodeID', 'fare_amount', 'tolls_amount', 'total_amount', # Medium analytic value
    'payment_type', 'extra', 'mta_tax', 'tip_amount', 'improvement_surcharge', 'is_store_and_fwd', # Low analytic value
    ]]
# index=False keeps the positional index out of the file (it would otherwise be
# written as an extra unnamed first column); 'id' is already the key, and this
# matches every other CSV written by this cell.
cheating_fields.to_csv("../../data/interim/cheating_fields.csv", index=False)

# Keep only the columns retained for modelling, plus the target.
df = df[['id', # UID
    'PULocationID', 'DOLocationID', # Location
    'tpep_pickup_datetime', 'pickup_day_of_week', 'pickup_time_of_day', # Time
    'VendorID', # Low importance metadata
    'passenger_count', # Low importance metadata
    'duration_seconds' # Target
   ]]

# Sanity checks: the two frames may share nothing but the 'id' key, and 'id'
# must identify each row uniquely.
assert set(df.columns).intersection(set(cheating_fields.columns)) == {'id'}, "Cheating fields found remaining in the dataframe"
assert df.id.nunique() == df.shape[0], "Duplicate ids found in the dataframe"

# Derive the random seed from the data itself: SHA-256 over the frame's full
# text rendering, keeping the first 8 hex digits (32 bits) as an integer.
# The same rendered text therefore always yields the same seed.
hasher = hashlib.sha256(df.to_string().encode('utf-8'))
hash_value = hasher.hexdigest()
seed = int(hash_value[:8], base=16)

# Hold out 20% of the rows as the test set and write it to disk; the remaining
# rows stay in train_val_df.
train_val_df, test_df = train_test_split(df, random_state=seed, test_size=0.2)
test_df.to_csv("../../data/interim/test_df.csv", index=False)

# Split train_val_df into 5 shuffled folds (seeded for repeatability) and write
# each fold's training and validation rows to their own CSV files.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
# enumerate(..., start=1) numbers the output files 1..5 without a hand-kept counter.
for fold, (train_index, val_index) in enumerate(kf.split(train_val_df), start=1):
    # The split indices are consumed positionally, hence .iloc.
    train_df = train_val_df.iloc[train_index]
    val_df = train_val_df.iloc[val_index]
    train_df.to_csv(f"../../data/interim/train_df_fold_{fold}.csv", index=False)
    val_df.to_csv(f"../../data/interim/val_df_fold_{fold}.csv", index=False)
# NOTE(review): free_up_memory is not defined in this cell -- presumably it is
# provided by the `backroom` star import and releases the large frames held in
# the namespace passed to it; confirm against that module.
free_up_memory(locals())
