In [1]:
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import tqdm

In [2]:
datasets = ['BookCrossing', 'Epinions', 'LFM360K', 'ML1M', 'ML20M', 'Yelp']

In [3]:
train_ratio = 0.7
vali_ratio = 0.1
test_ratio = 0.2

In [4]:
for dataset in datasets:
    print('now process', dataset, "-----------------------------")

    df = pd.read_csv(f'../mod_data/{dataset}/{dataset}.csv', sep=',', names=['userId', 'itemId'] , skiprows=1)
    user_list = df['userId'].unique()
    item_list = df['itemId'].unique()

    # Set size of validation set and test set
    vali_size = int(vali_ratio * len(df))
    test_size = int(test_ratio * len(df))

    vali_idx = np.random.choice(np.arange(len(df)), vali_size, replace=False).tolist()
    vali_df = df.loc[vali_idx]
    df.drop(vali_idx, axis=0, inplace=True)
    vali_df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=True, inplace=True)

    test_idx = np.random.choice(np.arange(len(df)), test_size, replace=False).tolist()
    test_df = df.loc[test_idx]
    df.drop(test_idx, axis=0, inplace=True)
    test_df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=True, inplace=True)

    train_array = df[['userId', 'itemId']].values
    vali_array = vali_df[['userId', 'itemId']].values
    test_array = test_df[['userId', 'itemId']].values

    user_train_like = []
    user_test_like = [] 
    user_vali_like = [] 

    # make sure that the test/validation set is not empty
    for u in tqdm(user_list):
        train_like = (train_array[list(np.where(train_array[:, 0] == u)[0]), 1]).astype(int)
        vali_like = (vali_array[list(np.where(vali_array[:, 0] == u)[0]), 1]).astype(int)
        test_like = (test_array[list(np.where(test_array[:, 0] == u)[0]), 1]).astype(int)

        if len(train_like) == 0:
            train_array = np.append(train_array, vali_array[list(np.where(vali_array[:, 0] == u)[0])], axis = 0)
            train_array = np.append(train_array, test_array[list(np.where(test_array[:, 0] == u)[0])], axis = 0)
            vali_array = np.delete(vali_array, np.where(vali_array[:, 0] == u)[0], axis = 0)
            test_array = np.delete(test_array, np.where(test_array[:, 0] == u)[0], axis = 0)
            train_like = (train_array[list(np.where(train_array[:, 0] == u)[0]), 1]).astype(int)
            vali_like = (vali_array[list(np.where(vali_array[:, 0] == u)[0]), 1]).astype(int)
            test_like = (test_array[list(np.where(test_array[:, 0] == u)[0]), 1]).astype(int)

        # when validation set is empty
        if len(vali_like) == 0:
            if len(train_like) > len(test_like):
                new_vali_idx = np.random.choice(np.arange(len(train_like)), size=1)
                # add the record 'new_vali' to validation set
                new_vali = train_like[new_vali_idx]
                vali_like = np.array(new_vali)
                train_like = np.delete(train_like, new_vali_idx)
                train_array = np.delete(train_array, np.where((train_array[:, 0] == u) & (train_array[:, 1] == new_vali))[0], axis=0)
                vali_array = np.append(vali_array, [[u, new_vali[0]]], axis=0)
            else:
                new_vali_idx = np.random.choice(np.arange(len(test_like)), size=1)
                new_vali = test_like[new_vali_idx]
                vali_like = np.array(new_vali)
                test_like = np.delete(test_like, new_vali_idx)
                test_array = np.delete(test_array, np.where((test_array[:, 0] == u) & (test_array[:, 1] == new_vali))[0], axis=0)
                vali_array = np.append(vali_array, [[u, new_vali[0]]], axis=0)        

        # when test set is empty
        if len(test_like) == 0:
            new_test_idx = np.random.choice(np.arange(len(train_like)), size=1)
            # add the record 'new_test' to validation set
            new_test = train_like[new_test_idx]
            test_like = np.array(new_test)
            train_like = np.delete(train_like, new_test_idx)
            train_array = np.delete(train_array, np.where((train_array[:, 0] == u) & (train_array[:, 1] == new_test))[0], axis=0)
            test_array = np.append(test_array, [[u, new_test[0]]], axis=0)

        user_train_like.append(train_like)
        user_vali_like.append(vali_like)
        user_test_like.append(test_like)

    np.save(f'../mod_data/{dataset}/sep_data/user_train_like.npy', np.array(user_train_like))
    np.save(f'../mod_data/{dataset}/sep_data/user_vali_like.npy', np.array(user_vali_like))
    np.save(f'../mod_data/{dataset}/sep_data/user_test_like.npy', np.array(user_test_like))

    # Convert to DataFrame
    train_df = pd.DataFrame({'userId': train_array[:, 0], 'itemId': train_array[:, 1]})
    vali_df = pd.DataFrame({'userId': vali_array[:, 0], 'itemId': vali_array[:, 1]})
    test_df = pd.DataFrame({'userId': test_array[:, 0], 'itemId': test_array[:, 1]})

    # Save as .csv file
    train_df.to_csv(f'../mod_data/{dataset}/sep_data/train_df.csv', index=False)
    vali_df.to_csv(f'../mod_data/{dataset}/sep_data/vali_df.csv', index=False)
    test_df.to_csv(f'../mod_data/{dataset}/sep_data/test_df.csv', index=False)

now process BookCrossing -----------------------------


100%|██████████| 5107/5107 [00:02<00:00, 1888.31it/s]
  np.save(f'../mod_data/{dataset}/sep_data/user_train_like.npy', np.array(user_train_like))
  np.save(f'../mod_data/{dataset}/sep_data/user_vali_like.npy', np.array(user_vali_like))
  np.save(f'../mod_data/{dataset}/sep_data/user_test_like.npy', np.array(user_test_like))


now process Epinions -----------------------------


100%|██████████| 8521/8521 [00:05<00:00, 1482.05it/s]


now process LFM360K -----------------------------


100%|██████████| 52966/52966 [02:51<00:00, 308.44it/s]


now process ML1M -----------------------------


100%|██████████| 6040/6040 [00:06<00:00, 980.15it/s] 


now process ML20M -----------------------------


100%|██████████| 55845/55845 [08:19<00:00, 111.77it/s]


now process Yelp -----------------------------


100%|██████████| 13991/13991 [00:15<00:00, 894.58it/s] 
