In [22]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
import datetime
import logging
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
import tqdm

In [2]:
RAW_DATA_FOLDER = Path("../data/raw/")
PROCESSED_DATA_FOLDER = Path("../data/processed/")

spain_sales_raw = RAW_DATA_FOLDER / "spanish_sales.csv"

In [33]:
MIN_RATINGS = 4
USER_COLUMN = 'userId'
ITEM_COLUMN = 'itemId'
VALID_NEGATIVE = 10

In [34]:
sales_df = pd.read_csv(spain_sales_raw, parse_dates=True,dtype={"customer_id": int, "product_id": int})

In [35]:
sales_df['rating'] = 1.0
sales_df.date = pd.to_datetime(sales_df.date)

In [36]:
sales_df.rename(columns={'customer_id':'uid',
                          'product_id':'mid',
                          'date':'timestamp'}, 
                 inplace=True)

In [37]:
sales_df = pd.read_csv(spain_sales_raw, parse_dates=True,dtype={"customer_id": int, "product_id": int})
sales_df['ratings'] = 1.0
sales_df.date = pd.to_datetime(sales_df.date)
sales_df.rename(columns={'customer_id':'uid',
                      'product_id':'mid',
                      'date':'timestamp'}, 
             inplace=True)
df = sales_df.copy()

# first let us filter out the users with less than MIN_RATINGS interations
logger.info(
    "Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby('uid')
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS).copy()

# now let us factoriyze (re-index users)
logger.info("Mapping original user and item IDs to new sequential IDs")
df['userId'] = pd.factorize(df['uid'])[0]
df['itemId'] = pd.factorize(df['mid'])[0]

# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)

logger.info('Range of userId is [{}, {}]'.format(
    df.userId.min(), df.userId.max()))
logger.info('Range of itemId is [{}, {}]'.format(
    df.userId.min(), df.itemId.max()))

num_users = len(df['userId'].unique())
num_items = len(df['itemId'].unique())
logger.info("num_users is {}, num_items is {}".format(
    num_users, num_items))

# clean up data
del df['ratings'], df['timestamp']
df = df.drop_duplicates()  # assuming it keeps order

INFO:__main__:Filtering out users with less than 4 ratings
INFO:__main__:Mapping original user and item IDs to new sequential IDs
INFO:__main__:Range of userId is [0, 204100]
INFO:__main__:Range of itemId is [0, 6112]
INFO:__main__:num_users is 204101, num_items is 6113


In [38]:
# now we have filtered and sorted by time data, we can split test data out
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
# need to pop for each group
train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

In [39]:
# Note: no way to keep reference training data ordering because use of python set and multi-process
# It should not matter since it will be later randomized again
# save train and val data that is fixed.
train_ratings = torch.from_numpy(train_data.values)
torch.save(train_ratings, PROCESSED_DATA_FOLDER/'train_ratings.pt')
test_ratings = torch.from_numpy(test_data.values)
torch.save(test_ratings, PROCESSED_DATA_FOLDER/'test_ratings.pt')

Now the difficult part:
we have to sample negatives in a memory efficient way

In [40]:
class _TestNegSampler:
    def __init__(self, train_ratings, nb_neg):
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids for quickly created hash set and fast lookup
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128*1024):
        users = torch.arange(0, self.nb_users).reshape(
            [1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        random_items = torch.LongTensor(
            batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(
                    batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(
                        batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()

            items[idx] = j
        items = torch.LongTensor(items)
        return items

In [41]:
sampler = _TestNegSampler(train_ratings.cpu().numpy(), VALID_NEGATIVE)
test_negs = sampler.generate().cuda()
test_negs = test_negs.reshape(-1, VALID_NEGATIVE)
torch.save(test_negs, PROCESSED_DATA_FOLDER/'/test_negatives.pt')

  0%|          | 133032/273283150 [00:00<03:25, 1330318.10it/s]

Generating validation negatives...


100%|██████████| 273283150/273283150 [03:04<00:00, 1484856.44it/s]


In [42]:
train_ratings.shape

torch.Size([1673629, 4])

In [43]:
test_ratings.shape

torch.Size([204101, 4])

In [44]:
test_negs.shape

torch.Size([27328315, 10])