In [1]:
import torch
from copy import deepcopy
import random
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset

In [2]:
torch.cuda.is_available()

True

In [3]:
!ls /data/ml-1m/

README	movies.dat  ratings.dat  users.dat


Load dataset

In [4]:
MIN_RATINGS = 20

In [6]:
# Load Data
ml1m_dir = '/data/ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(ml1m_dir, sep='::', 
                          header=None, 
                          names=['uid', 'mid', 'rating', 'timestamp'],  
                          engine='python')

**This way is better**

In [7]:
df = pd.read_csv(ml1m_dir, sep='::', 
                          header=None, 
                          names=['uid', 'mid', 'rating', 'timestamp'],  
                          engine='python')

In [8]:
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby('uid')
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

Filtering out users with less than 20 ratings


In [9]:
print("Mapping original user and item IDs to new sequential IDs")
df['userId'] = pd.factorize(df['uid'])[0]
df['itemId'] = pd.factorize(df['mid'])[0]

Mapping original user and item IDs to new sequential IDs


In [10]:
df.head(10)

Unnamed: 0,uid,mid,rating,timestamp,userId,itemId
0,1,1193,5,978300760,0,0
1,1,661,3,978302109,0,1
2,1,914,3,978301968,0,2
3,1,3408,4,978300275,0,3
4,1,2355,5,978824291,0,4
5,1,1197,3,978302268,0,5
6,1,1287,5,978302039,0,6
7,1,2804,5,978300719,0,7
8,1,594,4,978302268,0,8
9,1,919,4,978301368,0,9


In [11]:
print('Range of userId is [{}, {}]'.format(df.userId.min(), df.userId.max()))
print('Range of itemId is [{}, {}]'.format(df.userId.min(), df.itemId.max()))

Range of userId is [0, 6039]
Range of itemId is [0, 3705]


Yep, definitely the _factorize_ method is better

In [12]:
class UserItemRatingDataset(Dataset):
    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:
            target_tensor: torch.Tensor that corresponds to (user, item) pair
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor
        
    def __getitem__(self, index):
        """
        Map-style datasets https://pytorch.org/docs/stable/data.html#map-style-datasets
        """
        return self.user_tensor[index], self.item_tensor[index], self.target_tensor[index]
    
    def __len__(self):
        return self.user_tensor.size(0)

Befor summarizing processing steps into the generator class let us do some exploratory things

Sampling negative

In [12]:
item_pool = set(df['itemId'].unique())
"""return all negative items & 100 sampled negative items"""
interact_status = df.groupby('userId')['itemId'].apply(set).reset_index().rename(
    columns={'itemId': 'interacted_items'})
interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: item_pool - x)
interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, 99))
#interact_status[['userId', 'negative_items', 'negative_samples']]

In [13]:
interact_status.head()

Unnamed: 0,userId,interacted_items,negative_items,negative_samples
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[518, 1566, 1743, 1278, 2949, 3301, 796, 1878,..."
1,1,"{0, 18, 20, 42, 47, 48, 52, 53, 54, 55, 56, 57...","{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[3442, 1758, 1747, 2214, 3648, 1925, 1068, 189..."
2,2,"{128, 4, 5, 22, 166, 168, 41, 44, 175, 176, 17...","{0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...","[3464, 2317, 3578, 2393, 1348, 3628, 479, 1926..."
3,3,"{139, 26, 156, 43, 44, 48, 63, 64, 208, 209, 2...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1193, 1057, 2438, 1439, 2172, 106, 1341, 1668..."
4,4,"{3, 4, 9, 18, 27, 38, 39, 43, 48, 51, 59, 62, ...","{0, 1, 2, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ...","[3127, 2867, 2090, 3154, 3625, 572, 3565, 3353..."


Leave one (last in time) out for testing purposes

In [14]:
#df.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

In [15]:
# """leave one out train/test split """
# df['rank_latest'] = df.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
# test = df[ratings['rank_latest'] == 1]
# train = df[ratings['rank_latest'] > 1]
# assert train['userId'].nunique() == test['userId'].nunique()
# return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]

### Preparation of epoch data

In [16]:
df.head()

Unnamed: 0,uid,mid,rating,timestamp,userId,itemId
0,1,1193,5,978300760,0,0
1,1,661,3,978302109,0,1
2,1,914,3,978301968,0,2
3,1,3408,4,978300275,0,3
4,1,2355,5,978824291,0,4


In [17]:
def binarize(ratings):
    """binarize into 0 or 1, imlicit feedback"""
    ratings = deepcopy(ratings)
    ratings['rating'][ratings['rating'] > 0] = 1.0
    return ratings      

def sample_negative(ratings, item_pool):
    """return all negative items & 100 sampled negative items"""
    interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
        columns={'itemId': 'interacted_items'})
    interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: item_pool - x)
    interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, 99))
    return interact_status[['userId', 'negative_items', 'negative_samples']]


def leave_last_out(ratings):
    """leave one out train/test split """
    ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
    test = ratings[ratings['rank_latest'] == 1]
    train = ratings[ratings['rank_latest'] > 1]
    assert train['userId'].nunique() == test['userId'].nunique()
    return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]

In [18]:
preprocess_ratings = binarize(df)
user_pool = set(df['userId'].unique())
item_pool = set(df['itemId'].unique())

negatives = sample_negative(df, item_pool)

train_ratings, test_ratings = leave_last_out(preprocess_ratings)

In [19]:
negatives.head()

Unnamed: 0,userId,negative_items,negative_samples
0,0,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[269, 1121, 452, 1086, 751, 3705, 1781, 3483, ..."
1,1,"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1749, 1959, 2916, 559, 3094, 1149, 2663, 414,..."
2,2,"{0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...","[991, 304, 2428, 1051, 2243, 75, 2436, 3262, 1..."
3,3,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[3242, 98, 1727, 986, 3648, 1326, 676, 2817, 2..."
4,4,"{0, 1, 2, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, ...","[1456, 2161, 3585, 2943, 3584, 2553, 2842, 127..."


In [20]:
train_with_negatives = pd.merge(train_ratings, negatives[['userId', 'negative_items']], on='userId')

In [27]:
train_with_negatives.head()

Unnamed: 0,userId,itemId,rating,negative_items,epoch_sampled_negatives
0,0,0,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1970, 3524, 3230, 1539]"
1,0,1,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[69, 918, 3291, 1995]"
2,0,2,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1248, 234, 602, 637]"
3,0,3,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[2862, 3052, 3617, 3249]"
4,0,4,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[340, 2144, 1216, 1914]"


In [22]:
!pip install pandarallel

Collecting pandarallel
  Downloading pandarallel-1.4.8.tar.gz (14 kB)
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25ldone
[?25h  Created wheel for pandarallel: filename=pandarallel-1.4.8-py3-none-any.whl size=16111 sha256=4c73543447ecf8b73ab41c7a2b8c6bc69923ca03e3c6fa59a9606c9366bd7133
  Stored in directory: /root/.cache/pip/wheels/e9/46/8f/698a358b97ccf5efe84a301297dbf98b19207ff527e0f2512e
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.4.8


In [23]:
from pandarallel import pandarallel

In [24]:
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [25]:
num_negatives = 4
train_with_negatives['epoch_sampled_negatives'] = train_with_negatives['negative_items'].parallel_apply(lambda x: random.sample(x, num_negatives))

In [28]:
train_with_negatives.head()

Unnamed: 0,userId,itemId,rating,negative_items,epoch_sampled_negatives
0,0,0,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1970, 3524, 3230, 1539]"
1,0,1,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[69, 918, 3291, 1995]"
2,0,2,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1248, 234, 602, 637]"
3,0,3,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[2862, 3052, 3617, 3249]"
4,0,4,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[340, 2144, 1216, 1914]"


## What do we really do with so-called "negative-users"

In [29]:
eval_test_ratings = pd.merge(test_ratings, train_with_negatives, on='userId')

In [30]:
eval_test_ratings.head()

Unnamed: 0,userId,itemId_x,rating_x,itemId_y,rating_y,negative_items,epoch_sampled_negatives
0,0,25,1,0,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1970, 3524, 3230, 1539]"
1,0,25,1,1,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[69, 918, 3291, 1995]"
2,0,25,1,2,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1248, 234, 602, 637]"
3,0,25,1,3,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[2862, 3052, 3617, 3249]"
4,0,25,1,4,1,"{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[340, 2144, 1216, 1914]"


In [31]:
test_users, test_items, negative_users, negative_items = [], [], [], []
for row in eval_test_ratings.itertuples():
    test_users.append(int(row.userId))
    test_items.append(int(row.itemId))
    for i in range(len(row.negative_items)):
        negative_users.append(int(row.userId))
        negative_items.append(int(row.negative_items[i]))

AttributeError: 'Pandas' object has no attribute 'negative_samples'

## ?!

I am not sure that assigning zero as a rating to negative sample acutally makes sense?!. TODO: should check

In [34]:
users, items, ratings = [], [], []
for idx, row in enumerate(train_with_negatives.itertuples()):
        users.append(int(row.userId))
        items.append(int(row.itemId))
        ratings.append(float(row.rating))
        for i in range(num_negatives):
            users.append(int(row.userId))
            items.append(int(row.epoch_sampled_negatives[i]))
            ratings.append(float(0))  # negative samples get 0 rating
        if idx == 10:
            break

In [None]:
users, items, ratings = [], [], []
for row in train_ratings.itertuples():
        users.append(int(row.userId))
        items.append(int(row.itemId))
        ratings.append(float(row.rating))
        for i in range(num_negatives):
            users.append(int(row.userId))
            items.append(int(row.negatives[i]))
            ratings.append(float(0))  # negative samples get 0 rating
    dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
                                    item_tensor=torch.LongTensor(items),
                                    target_tensor=torch.FloatTensor(ratings))

In [45]:
class SampleGenerator():
    def __init__(self, ratings):
        """
        args:
            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
        """
        assert 'userId' in ratings.columns
        assert 'itemId' in ratings.columns
        assert 'rating' in ratings.columns

        self.ratings = ratings
        
        self.preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings['userId'].unique())
        self.item_pool = set(self.ratings['itemId'].unique())
        # create negative item samples for NCF learning
        self.negatives = self._sample_negative(ratings)
        self.train_ratings, self.test_ratings = self._leave_last_out(self.preprocess_ratings)
        
        self.train_with_negatives = pd.merge(self.train_ratings, 
                                 self.negatives[['userId', 'negative_items']], on='userId')
        
    def _binarize(self, ratings):
        """binarize into 0 or 1, imlicit feedback"""
        ratings = deepcopy(ratings)
        ratings['rating'][ratings['rating'] > 0] = 1.0
        return ratings    
    
    def _leave_last_out(self, ratings):
        """leave one out train/test split """
        ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
        test = ratings[ratings['rank_latest'] == 1]
        train = ratings[ratings['rank_latest'] > 1]
        assert train['userId'].nunique() == test['userId'].nunique()
        return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]
    
    def _sampltrain_with_negativese_negative(self, ratings):
        """return all negative items & 100 sampled negative items"""
        interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
            columns={'itemId': 'interacted_items'})
        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, 99))
        return interact_status[['userId', 'negative_items', 'negative_samples']]
    
    def __prepare_epoch(self, num_negatives, batch_size):
        users, items, ratings = [], [], []
        ## OK, now i understand why they do it in every epoch... 
        train_with_negatives['epoch_sampled_negatives'] = \
            train_with_negatives['negative_items'].parallel_apply(lambda x: random.sample(x, num_negatives))
        for row in train_ratings.itertuples():
                users.append(int(row.userId))
                items.append(int(row.itemId))
                ratings.append(float(row.rating))
                for i in range(num_negatives):
                    users.append(int(row.userId))
                    items.append(int(row.negatives[i]))
                    ratings.append(float(0))  # negative samples get 0 rating
        dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
                                        item_tensor=torch.LongTensor(items),
                                        target_tensor=torch.FloatTensor(ratings)) 
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)        

    @property
    def evaluation_data(self):
        test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
        test_users, test_items, negative_users, negative_items = [], [], [], []
        for row in test_ratings.itertuples():
            test_users.append(int(row.userId))
            test_items.append(int(row.itemId))
            for i in range(len(row.negative_samples)):
                negative_users.append(int(row.userId))
                negative_items.append(int(row.negative_samples[i]))
        return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
                torch.LongTensor(negative_items)]