In [1]:
import numpy as np
import pandas as pd

In [2]:
# Directory from which this notebook reads its input parquet file.
# NOTE: the value ends with '/', and the read path is built as
# f'{INTERIM_DATA_PATH}/...', so the resulting path contains a doubled slash.
INTERIM_DATA_PATH = '/pio/scratch/1/recommender_systems/interim/Amazon/'
# Directory under which the train/test CSV exports are written
# (into its 'train_data/' and 'test_data/' sub-directories).
FINAL_DATA_PATH = '/pio/scratch/1/i313924/data/'

In [3]:
def split_ratings_dataset(df, seed=None, frac_users=0.7, frac_items=0.7):
    """Split a ratings frame into a training part and a held-out part, by user.

    The distinct ``reviewerID`` values are shuffled; the first
    ``int(frac_users * n_users)`` of them become training users and all of
    their rows go to the training part. The rows of the remaining (test)
    users are passed to ``split_testing_set``: the "known" share of each test
    user's rows is appended to the training part, and the rest is returned
    as the held-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table; must contain a ``reviewerID`` column.
    seed : int or None
        When not None, the global NumPy RNG is seeded with it before the
        users are shuffled, and the same value is forwarded to
        ``split_testing_set`` (which reseeds the global RNG again).
        When None, the current global RNG state is used, so the split is
        not reproducible unless the caller seeded NumPy beforehand.
    frac_users : float
        Fraction of users assigned to the training part.
    frac_items : float
        Fraction of each test user's rows kept as "known" rows.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, held_out)``. When no user ends up in the test partition
        (for example ``frac_users=1.0`` or an empty ``df``), ``held_out`` is
        an empty frame with the same columns as ``df``.
    """
    if seed is not None:
        np.random.seed(seed)

    users = df['reviewerID'].unique()
    np.random.shuffle(users)
    train_size = int(frac_users * users.shape[0])

    train_df = df[df['reviewerID'].isin(users[:train_size])]
    test_df = df[df['reviewerID'].isin(users[train_size:])]

    # Without test users there is nothing to split per user; delegating would
    # end in pd.concat on an empty list, which raises ValueError.
    if test_df.empty:
        return train_df, test_df

    test_known_df, test_unknown_df = split_testing_set(test_df, seed=seed, frac=frac_items)

    return pd.concat((train_df, test_known_df)), test_unknown_df


def split_testing_set(test_df, seed=None, frac=0.7):
    """Split every user's rows into a "known" and an "unknown" part.

    Rows are grouped by ``reviewerID``. For each group of ``n`` rows,
    ``int(round(frac * n))`` row positions are drawn without replacement
    from the global NumPy RNG and become the known rows (in the order they
    were drawn); the remaining rows become the unknown rows (in their
    original relative order).

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to split; must contain a ``reviewerID`` column.
    seed : int or None
        When not None, the global NumPy RNG is seeded with it first.
    frac : float
        Fraction of each user's rows placed in the known part.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(known, unknown)``. If ``test_df`` yields no groups (for example,
        it is empty), both are empty frames with the columns of ``test_df``.
    """
    if seed is not None:
        np.random.seed(seed)

    test_known = []
    test_unknown = []
    for _, user_df in test_df.groupby(by='reviewerID'):
        df_size = user_df.shape[0]

        known_size = int(round(frac * df_size))
        known_indices = np.random.choice(df_size, known_size, replace=False)
        test_known.append(user_df.iloc[known_indices])

        # setdiff1d returns the sorted positions that were not drawn above.
        unknown_indices = np.setdiff1d(np.arange(df_size), known_indices)
        test_unknown.append(user_df.iloc[unknown_indices])

    # pd.concat raises ValueError on an empty list, so handle "no groups"
    # explicitly and hand back empty frames that keep the input schema.
    if not test_known:
        return test_df.iloc[0:0], test_df.iloc[0:0]

    return pd.concat(test_known), pd.concat(test_unknown)

In [4]:
# Read the interim parquet file into `df`, the frame every later cell works on.
ratings_file = f'{INTERIM_DATA_PATH}/Clothing_Shoes_and_Jewelry_clean.parquet'
df = pd.read_parquet(ratings_file)

In [5]:
# Count rows per reviewer and per item, then keep the IDs that occur in more
# than 10 rows of the unfiltered frame.
reviewer_counts = df['reviewerID'].value_counts()
item_counts = df['asin'].value_counts()

active_users = reviewer_counts[reviewer_counts > 10].index
popular_items = item_counts[item_counts > 10].index

In [6]:
# Keep only rows whose reviewer AND item both passed the count thresholds.
# Both ID lists were computed on the unfiltered frame, so after this filter a
# kept reviewer or item may have 10 or fewer rows left.
from_active_user = df['reviewerID'].isin(active_users)
for_popular_item = df['asin'].isin(popular_items)
df = df[from_active_user & for_popular_item]

In [7]:
# Fixed seed: the split shuffles users and samples rows at random, so without
# a seed the exported files differ on every kernel restart.
SPLIT_SEED = 42

# train_df: all rows of training users plus the "known" rows of test users.
# test_df:  the held-out ("unknown") rows of test users only.
train_df, test_df = split_ratings_dataset(df, seed=SPLIT_SEED)

In [9]:
# Preview the training part, restricted to the four listed columns.
preview_columns = ['reviewerID', 'overall', 'unixReviewTime', 'asin']
train_df[preview_columns]

Unnamed: 0,reviewerID,overall,unixReviewTime,asin
0,A2IC3NZN488KWK,5.0,1399161600,0871167042
6,A22ZX01TPWQY4G,2.0,1409702400,0871167042
8,AA7PNT2OPS3RP,4.0,1401926400,0871167042
19,A7LYNE6GEDAG8,5.0,1497312000,1519588135
21,A2DE9PDS1V4UYY,2.0,1494633600,1519588135
...,...,...,...,...
5856993,AZZY4W8E5AX2K,5.0,1458259200,B00KO9GRY0
6062173,AZZY4W8E5AX2K,5.0,1492992000,B00LMS37CM
5079334,AZZY4W8E5AX2K,5.0,1428278400,B00GJB14ZE
5473735,AZZY4W8E5AX2K,5.0,1460937600,B00INGQ29G


In [25]:
# Training part: number of rows, then number of distinct reviewers and items.
print(len(train_df))
for id_column in ('reviewerID', 'asin'):
    print(len(train_df[id_column].unique()))

3382904
235435
166882


In [26]:
# Held-out part: number of rows, then number of distinct reviewers and items.
print(len(test_df))
for id_column in ('reviewerID', 'asin'):
    print(len(test_df[id_column].unique()))

337307
70631
103940


In [11]:
# Write both parts as header-less, index-less CSV files with a fixed column order.
export_columns = ['asin', 'reviewerID', 'overall', 'unixReviewTime']

train_df[export_columns].to_csv(
    f'{FINAL_DATA_PATH}/train_data/train_slim_data.csv',
    index=False,
    header=False,
)
test_df[export_columns].to_csv(
    f'{FINAL_DATA_PATH}/test_data/test_slim_data.csv',
    index=False,
    header=False,
)