In [12]:
import pandas as pd
import numpy as np
import torch
import os

In [30]:
class Dataset(object):
    '''
    Click log loaded from a delimited text file and prepared for session-based batching.

    Attributes set by the constructor:
        df: the click rows with an added "item_idx" column, sorted by session key and then time
        itemmap: DataFrame holding the item-id column and its "item_idx" column
        click_offsets: positional start of every session inside the sorted df
        session_idx_arr: order in which the sessions are meant to be visited
    '''

    def __init__(self, path, sep=',', session_key='SessionID', item_key='ItemID', time_key = 'Time', n_sample=10, itemmap = None, itemstamp = None, time_sort = False):
        '''
        Read the click file and build the per-session lookup arrays.

        Args:
            path: file path or file-like object handed to pd.read_csv
            sep (str): column separator of the file
            session_key (str): name of the session-id column (parsed as int)
            item_key (str): name of the item-id column (parsed as int)
            time_key (str): name of the timestamp column (parsed as float)
            n_sample (int): when > 0, only the first n_sample rows of the file are kept;
                the default of 10 therefore keeps just ten rows
            itemmap (pd.DataFrame): existing item-id -> item_idx mapping to reuse; built from the data when None
            itemstamp: accepted but not used by this class
            time_sort (bool): when True, sessions are ordered by the time of their earliest click
        '''
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort

        column_types = {session_key: int, item_key: int, time_key: float}
        clicks = pd.read_csv(path, sep=sep, dtype=column_types)
        # The truncation happens on the raw file order, i.e. before any sorting.
        self.df = clicks[:n_sample] if n_sample > 0 else clicks

        # Attach the "item_idx" column (and remember the mapping in self.itemmap).
        self.add_item_indices(itemmap=itemmap)

        # Order the rows by session id and, inside each session, by time, so that the
        # clicks of one session sit next to each other in chronological order.
        self.df = self.df.sort_values([session_key, time_key])

        self.click_offsets = self.get_click_offset()
        self.session_idx_arr = self.order_session_idx()

    def add_item_indices(self, itemmap=None):
        '''
        Add an item index column named "item_idx" to the df.

        Args:
            itemmap (pd.DataFrame): mapping between the item ids and indices. When None,
                a new mapping is created that numbers the distinct item ids 0..n-1 in
                order of first appearance in the df.

        Rows whose item id does not occur in the mapping are dropped by the inner merge.
        '''
        if itemmap is None:
            distinct_items = self.df[self.item_key].unique()
            itemmap = pd.DataFrame({
                self.item_key: distinct_items,
                'item_idx': np.arange(len(distinct_items)),
            })

        self.itemmap = itemmap
        self.df = self.df.merge(self.itemmap, on=self.item_key, how='inner')

    def get_click_offset(self):
        '''
        Compute where each session starts inside the sorted df.

        Returns:
            np.ndarray (int32) with one entry per session plus one: entry 0 is 0 and
            every following entry is the running total of the per-session click counts
            (sessions in the order produced by groupby on the session key), so the last
            entry equals the number of rows.
        '''
        session_count = self.df[self.session_key].nunique()
        clicks_per_session = self.df.groupby(self.session_key).size()

        offsets = np.zeros(session_count + 1, dtype=np.int32)
        offsets[1:] = clicks_per_session.cumsum()
        return offsets

    def order_session_idx(self):
        '''
        Decide the order in which sessions are visited.

        Returns:
            np.ndarray of positions into the groupby-ordered session list: the identity
            order 0..n-1 by default, or the positions sorted by each session's earliest
            timestamp when time_sort is set.
        '''
        if not self.time_sort:
            return np.arange(self.df[self.session_key].nunique())

        first_click_time = self.df.groupby(self.session_key)[self.time_key].min()
        return np.argsort(first_click_time.values)

    @property
    def items(self):
        '''Distinct item ids contained in the item map.'''
        return self.itemmap[self.item_key].unique()

In [7]:
class DataLoader():
    def __init__(self, dataset, batch_size = 50):
        '''
        A class for creating session-parallel mini-batches.
        
        Args:
            dataset: object exposing `df` (with an `item_idx` column), `click_offsets`
                and `session_idx_arr`, such as the Dataset class in this notebook
            batch_size (int): number of sessions processed in parallel
        '''
        self.dataset = dataset
        self.batch_size = batch_size
        
    def __iter__(self):
        '''
        Returns the iterator for producing session-parallel training mini-batches.

        Each of the B batch slots walks through one session at a time. When a session
        runs out of clicks its slot is refilled with the next unused session. Iteration
        stops as soon as a slot needs a new session and none is left, so the remaining
        clicks of the other slots are not yielded.

        If the dataset holds fewer sessions than `batch_size`, B is reduced to the
        number of sessions. An empty dataset yields nothing.

        Yields:
            input (B,): torch.LongTensor of item indices of the current click in each slot
            target (B,): torch.LongTensor of item indices of the following click in each slot
            mask: slot positions whose session ended in the previous round; an empty
                list until the first round has finished, a NumPy integer array afterwards

        Raises:
            ValueError: if `batch_size` is not positive
        '''
        if self.batch_size <= 0:
            raise ValueError('batch_size must be a positive integer, got {!r}'.format(self.batch_size))

        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr
        # Loop-invariant: fetch the item index column once instead of on every step.
        item_indices = self.dataset.df.item_idx.values

        # Never open more slots than there are sessions; indexing session_idx_arr with
        # np.arange(batch_size) would otherwise raise IndexError.
        n_sessions = len(click_offsets) - 1
        n_slots = min(self.batch_size, n_sessions)
        if n_slots <= 0:
            return

        iters = np.arange(n_slots)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []
        finished = False

        while not finished:
            # Every slot can advance this many clicks before the shortest one ends.
            minlen = (end - start).min()
            idx_target = item_indices[start]

            for i in range(minlen - 1):
                idx_input = idx_target
                idx_target = item_indices[start + i + 1]
                batch_input = torch.LongTensor(idx_input)
                batch_target = torch.LongTensor(idx_target)
                # NOTE(review): the same mask object is yielded on every step of this
                # inner loop, not only on the first step after a slot was refilled;
                # confirm the consumer does not reset state more often than intended.
                yield batch_input, batch_target, mask

            # Move every slot to its last consumed click.
            start = start + (minlen - 1)
            # Slots with at most one click left have no further (input, target) pair.
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break

                # Refill the slot with the next unused session.
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [8]:
# File names of the training and validation splits; they are joined with the
# data directory when the datasets are loaded.
train_data_path, valid_data_path = (
    'recSys15TrainOnly.txt',
    'recSys15Valid.txt',
)

In [9]:
# Print the kernel's working directory; the relative '../YooChoose/' paths used when
# loading the data resolve against it.
!pwd

/Recommendation/GRU4REC


In [21]:
# Directory holding the data files, relative to the notebook's working directory.
DATA_DIR = '../YooChoose/'

# NOTE(review): Dataset's n_sample defaults to 10, so with the class as defined in this
# notebook only the first 10 rows of each file are read. The outputs recorded further
# down (37298 items, 7966041 sessions) cannot come from 10 rows; pass a non-positive
# n_sample (e.g. n_sample=-1) to load the full files - confirm which is intended.
train_data = Dataset(os.path.join(DATA_DIR, train_data_path))

# Reuse the training item map so both splits share one ItemID -> item_idx numbering.
# Without it the validation set builds its own, unrelated numbering. Validation rows
# whose item is absent from the training map are dropped by the inner merge.
valid_data = Dataset(os.path.join(DATA_DIR, valid_data_path), itemmap=train_data.itemmap)

In [27]:
# Number of distinct item ids in the training item map.
len(train_data.items)

37298

In [31]:
# Number of sessions: session_idx_arr holds one entry per distinct session id.
len(train_data.session_idx_arr)

7966041

In [32]:
# NOTE(review): session_key is the column-name string (default 'SessionID'), so this is
# the length of that name, not a count of sessions. The number of sessions is
# len(train_data.session_idx_arr) - confirm what was meant to be inspected here.
len(train_data.session_key)

9

In [34]:
# Dataset defines no __repr__/__str__, so this prints only the default object repr;
# look at train_data.df (e.g. train_data.df.head()) to see the loaded rows.
print(train_data)

<__main__.Dataset object at 0x7f8f1ae338b0>
