In [14]:
import numpy as np
import pandas as pd
from utils import read_product_data, read_train_data, read_test_data

In [15]:
# Load the training data through the project's `read_train_data` helper (imported from `utils`).
train_data = read_train_data()

In [16]:
# Preview the first three rows to inspect the available columns.
train_data.head(3)

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE


In [19]:
def split(df: pd.DataFrame, ratio: float, locale: list=None, seed: int=42) -> tuple:
    """Split a DataFrame into a larger (train) part and a smaller (test) part.

    Args:
        df: the DataFrame to be split. Any index is accepted; rows are told
            apart by index label, so labels are expected to be unique.
        ratio: fraction of ``len(df)`` that goes into the smaller part.
        locale: if given, the smaller part is sampled only from rows whose
            ``'locale'`` column is one of these values. Its size is still
            ``int(len(df) * ratio)``, i.e. relative to the whole DataFrame,
            not to the locale subset. A single string is treated as one
            locale. If None, there is no locale constraint.
        seed: random state passed to ``DataFrame.sample`` so the split is
            reproducible.

    Returns:
        tuple: ``(train_df, test_df)``, both re-indexed from 0.

    Raises:
        ValueError: if the selected locales contain fewer rows than the
            number of test rows requested.
    """
    if locale is None:
        test_df = df.sample(frac=ratio, random_state=seed)
    else:
        # A bare string would otherwise be exploded into a set of characters.
        if isinstance(locale, str):
            locale = [locale]
        num_test = int(len(df) * ratio)
        locale_df = df[df['locale'].isin(set(locale))]
        if num_test > len(locale_df):
            raise ValueError(
                "Cannot sample {} test rows from only {} rows in locales {}".format(
                    num_test, len(locale_df), sorted(set(locale))
                )
            )
        test_df = locale_df.sample(num_test, random_state=seed)
    # Select the remaining rows by index *label*. Positional indexing (iloc)
    # with labels is only correct for a default 0..n-1 RangeIndex.
    train_df = df.loc[~df.index.isin(test_df.index)]
    print("#Train: {}; #Test: {}".format(len(train_df), len(test_df)))
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)
    

In [20]:
# Hold out 10% of all rows as a test set drawn only from the DE/JP/UK locales;
# the fixed seed keeps the split reproducible.
sampled_trn_df, sampled_test_df = split(
    train_data,
    ratio=0.1,
    locale=['DE', 'JP', 'UK'],
    seed=42,
)

#Train: 3245625; #Test: 360624


In [21]:
# Persist both parts of the split without the DataFrame index column.
# `index` is documented as a bool, so pass False rather than relying on None being falsy.
sampled_trn_df.to_csv("../raw_data/sampled_train_data.csv", index=False)
sampled_test_df.to_csv("../raw_data/sampled_test_data.csv", index=False)