In [1]:
import wget
import numpy as np
import pandas as pd
from tqdm import tqdm

from abc import *
from pathlib import Path
import os
import tempfile
import shutil
import pickle

import pandas as pd
import time 
from datetime import datetime

from abc import *
import random
import torch
import torch.utils.data as data_utils

In [2]:
# Static Class that will hold our parameter values as attributes, thus making it easy to call

class Parameters:
    def __init__(self):
        self.random_seed = 10
        
        #Data Split
        self.train_size = 0.8
        self.test_size = 1- self.train_size
        self.save_folder = '../data/bert_4_rec_data'
        
        #Since we are doing collaborative filtering, we are only interested in users and movies with significant overlaps
        self.min_movie_overlap = 5
        self.min_user_overlap = 5

        #Sampling
        self.train_negative_sampler_code = 'random'
        self.train_negative_sample_size = 0
        self.train_negative_sampling_seed = 0
        self.test_negative_sampler_code = 'random'
        self.test_negative_sample_size = 100
        self.test_negative_sampling_seed = 98765

        #BERT Model
        self.bert_max_len = 100
        self.bert_mask_prob = 0.15
        self.batch_size = 128



args = Parameters()

In [3]:
tqdm.pandas()

  from pandas import Panel


In [4]:
try:
    dataset = pd.read_pickle('../data/bert_4_rec_data/dataset.pkl')

except:
    data = pd.read_excel('../data/processed_data/full_data.xlsx')
    data = data.drop(columns = "Unnamed: 0")

    ## Retriving the relevant features
    ratings = pd.DataFrame()

    ratings['userId'] = data['userId']
    ratings['movieId'] = data['movieId']
    ratings['rating'] = data['rating']
    ratings['timestamp'] = data['timestamp_ratings']

    # Since we are considering good movies we will filter out those rating below 3.5 (Determined by the histogram of ratings)
    ratings = ratings[ratings['rating'] >= 3.5]

    # Filter only user and movies with significant overlaps

    item_sizes = ratings.groupby('movieId').size()
    good_items = item_sizes.index[item_sizes >= args.min_movie_overlap]
    ratings = ratings[ratings['movieId'].isin(good_items)]


    user_sizes = ratings.groupby('userId').size()
    good_users = user_sizes.index[user_sizes >= args.min_user_overlap]
    ratings = ratings[ratings['userId'].isin(good_users)]

    # Probability Density  Distribution of user and movie

    print('Densifying index')
    umap = {u: i for i, u in enumerate(set(ratings['userId']))}
    smap = {s: i for i, s in enumerate(set(ratings['movieId']))}
    ratings['userId'] = ratings['userId'].map(umap)
    ratings['movieId'] = ratings['movieId'].map(smap)

    # Since we are considering the changing user preference overtime, we have to model the data as a timeseries and split train/test by specifying a specific timepoint to split the data (Hold-Out Sampling)

    np.random.seed(args.random_seed)
    user_count = len(ratings.userId.unique())

    # Generate user indices
    permuted_index = np.random.permutation(user_count)
    eval_set_size = int(len(permuted_index)*args.test_size)

    train_user_index = permuted_index[                :-2*eval_set_size]
    val_user_index   = permuted_index[-2*eval_set_size:  -eval_set_size]
    test_user_index  = permuted_index[  -eval_set_size:                ]

    # Split DataFrames
    train_df = ratings.loc[ratings['userId'].isin(train_user_index)]
    val_df   = ratings.loc[ratings['userId'].isin(val_user_index)]
    test_df  = ratings.loc[ratings['userId'].isin(test_user_index)]

    # DataFrame to dict => {uid : list of sid's}
    train = dict(train_df.groupby('userId').apply(lambda d: list(d['movieId'])))
    val   = dict(val_df.groupby('userId').apply(lambda d: list(d['movieId'])))
    test  = dict(test_df.groupby('userId').apply(lambda d: list(d['movieId'])))

    dataset = {'train': train,
                'val': val,
                'test': test,
                'umap': umap,
                'smap': smap}

    with open('../data/bert_4_rec_data/dataset.pkl', "wb") as e:
        pickle.dump(dataset, e)

In [28]:
data = pd.read_excel('../data/processed_data/full_data.xlsx')
data = data.drop(columns = "Unnamed: 0")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp_ratings,tag,timestamp_tags,title,genres_x,MovieYear,tconst,...,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_y,averageRating,numVotes,directors,writers
0,4,7569,3.5,1573943431,so bad it's good,1573943455,You Only Live Twice,Action|Adventure|Sci-Fi|Thriller,1977,tt0062512,...,You Only Live Twice,0,1967,\N,117,"Action,Adventure,Thriller",6.9,99691,nm0318150,"nm0089169,nm0001094,nm0001220,nm0420845"
1,18853,7569,3.0,1248068143,007,1248068150,You Only Live Twice,Action|Adventure|Sci-Fi|Thriller,1977,tt0062512,...,You Only Live Twice,0,1967,\N,117,"Action,Adventure,Thriller",6.9,99691,nm0318150,"nm0089169,nm0001094,nm0001220,nm0420845"
2,18853,7569,3.0,1248068143,james bond,1248068148,You Only Live Twice,Action|Adventure|Sci-Fi|Thriller,1977,tt0062512,...,You Only Live Twice,0,1967,\N,117,"Action,Adventure,Thriller",6.9,99691,nm0318150,"nm0089169,nm0001094,nm0001220,nm0420845"
3,21096,7569,4.0,1269243364,franchise,1246471298,You Only Live Twice,Action|Adventure|Sci-Fi|Thriller,1977,tt0062512,...,You Only Live Twice,0,1967,\N,117,"Action,Adventure,Thriller",6.9,99691,nm0318150,"nm0089169,nm0001094,nm0001220,nm0420845"
4,21096,7569,4.0,1269243364,James Bond,1246471305,You Only Live Twice,Action|Adventure|Sci-Fi|Thriller,1977,tt0062512,...,You Only Live Twice,0,1967,\N,117,"Action,Adventure,Thriller",6.9,99691,nm0318150,"nm0089169,nm0001094,nm0001220,nm0420845"


In [29]:
## Retriving the relevant features
ratings = pd.DataFrame()

ratings['userId'] = data['userId']
ratings['movieId'] = data['movieId']
ratings['rating'] = data['rating']
ratings['timestamp'] = data['timestamp_ratings']

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,4,7569,3.5,1573943431
1,18853,7569,3.0,1248068143
2,18853,7569,3.0,1248068143
3,21096,7569,4.0,1269243364
4,21096,7569,4.0,1269243364


In [30]:
ratings.rating.value_counts()

4.0    202717
5.0    167506
3.5    141415
4.5    140241
3.0     98314
2.5     49452
2.0     37708
1.5     18866
1.0     16465
0.5     13742
Name: rating, dtype: int64

In [31]:
# Filter only user and movies with significant overlaps

item_sizes = ratings.groupby('movieId').size()
good_items = item_sizes.index[item_sizes >= min_movie_overlap]
ratings = ratings[ratings['movieId'].isin(good_items)]


user_sizes = ratings.groupby('userId').size()
good_users = user_sizes.index[user_sizes >= min_user_overlap]
ratings = ratings[ratings['userId'].isin(good_users)]

In [32]:
# Probability Density  Distribution of user and movie

print('Densifying index')
umap = {u: i for i, u in enumerate(set(ratings['userId']))}
smap = {s: i for i, s in enumerate(set(ratings['movieId']))}
ratings['userId'] = ratings['userId'].map(umap)
ratings['movieId'] = ratings['movieId'].map(smap)

Densifying index


In [33]:
# Since we are considering the changing user preference overtime, we have to model the data as a timeseries and split train/test by specifying a specific timepoint to split the data (Hold-Out Sampling)

np.random.seed(10)

user_count = len(ratings.userId.unique())

# Generate user indices
permuted_index = np.random.permutation(user_count)
eval_set_size = int(len(permuted_index)*args.test_size)

train_user_index = permuted_index[                :-2*eval_set_size]
val_user_index   = permuted_index[-2*eval_set_size:  -eval_set_size]
test_user_index  = permuted_index[  -eval_set_size:                ]

# Split DataFrames
train_df = ratings.loc[ratings['userId'].isin(train_user_index)]
val_df   = ratings.loc[ratings['userId'].isin(val_user_index)]
test_df  = ratings.loc[ratings['userId'].isin(test_user_index)]

# DataFrame to dict => {uid : list of sid's}
train = dict(train_df.groupby('userId').apply(lambda d: list(d['movieId'])))
val   = dict(val_df.groupby('userId').apply(lambda d: list(d['movieId'])))
test  = dict(test_df.groupby('userId').apply(lambda d: list(d['movieId'])))

dataset = {'train': train,
            'val': val,
            'test': test,
            'umap': umap,
            'smap': smap}

with open('../data/bert_4_rec_data/dataset.pkl', "wb") as e:
    pickle.dump(dataset, e)


AttributeError: 'DataFrameGroupBy' object has no attribute 'progress_apply'

In [31]:
### Implement a Negative Sampling Class
from tqdm import trange

from collections import Counter

class PopularNegativeSampler(metaclass=ABCMeta):

    def __init__(self, train, val, test, user_count, item_count, sample_size, seed, save_folder):
        self.train = train
        self.val = val
        self.test = test
        self.user_count = user_count
        self.item_count = item_count
        self.sample_size = sample_size
        self.seed = seed
        self.save_folder = save_folder
    
    def get_negative_samples(self):
        savefile_path = self._get_save_path()
        if savefile_path.is_file():
            print('Negatives samples exist. Loading.')
            negative_samples = pickle.load(savefile_path.open('rb'))
            return negative_samples
        print("Negative samples don't exist. Generating.")
        negative_samples = self.generate_negative_samples()
        with savefile_path.open('wb') as f:
            pickle.dump(negative_samples, f)
        return negative_samples

    def _get_save_path(self):
        folder = Path(self.save_folder)
        filename = 'Popular-sample_size{}-seed{}.pkl'.format(self.sample_size, self.seed)
        return folder.joinpath(filename)

    def generate_negative_samples(self):
        popular_items = self.items_by_popularity()

        negative_samples = {}
        print('Sampling negative items')
        for user in trange(self.user_count):
            try:
                seen = set(self.train[user])
                seen.update(self.val[user])
                seen.update(self.test[user])

                samples = []
                for item in popular_items:
                    if len(samples) == self.sample_size:
                        break
                    if item in seen:
                        continue
                    samples.append(item)

                negative_samples[user] = samples
            except:
                continue

        return negative_samples

    def items_by_popularity(self):
        popularity = Counter()

        for user in self.train.keys():
            popularity.update(self.train[user])

        for user in self.val.keys():
            popularity.update(self.val[user])

        for user in self.test.keys():
            popularity.update(self.test[user])
            
        popular_items = sorted(popularity, key=popularity.get, reverse=True)
        return popular_items

In [39]:
### Implement a Dataloader class that will load data into model
from abc import *
import random

class model_Dataloader(metaclass=ABCMeta):
    def __init__(self, args, dataset):
        self.args = args
        seed = args.random_seed
        self.rng = random.Random(seed)
        self.train = dataset['train']
        self.val = dataset['val']
        self.test = dataset['test']
        self.umap = dataset['umap']
        self.smap = dataset['smap']
        self.user_count = len(self.umap)
        self.item_count = len(self.smap)
        self.max_len = args.bert_max_len
        self.mask_prob = args.bert_mask_prob
        self.CLOZE_MASK_TOKEN = self.item_count + 1
        self.save_folder = args.save_folder

        code = args.train_negative_sampler_code
        train_negative_sampler = PopularNegativeSampler(self.train, self.val, self.test,
                                                          self.user_count, self.item_count,
                                                          args.train_negative_sample_size,
                                                          args.train_negative_sampling_seed,
                                                          self.save_folder)
        code = args.test_negative_sampler_code
        test_negative_sampler = PopularNegativeSampler(self.train, self.val, self.test,
                                                         self.user_count, self.item_count,
                                                         args.test_negative_sample_size,
                                                         args.test_negative_sampling_seed,
                                                         self.save_folder)

        self.train_negative_samples = train_negative_sampler.get_negative_samples()
        self.test_negative_samples = test_negative_sampler.get_negative_samples()

    def code(cls):
        return 'bert'

    def get_pytorch_dataloaders(self):
        train_loader = self._get_train_loader()
        val_loader = self._get_val_loader()
        test_loader = self._get_test_loader()
        return train_loader, val_loader, test_loader

    def _get_train_loader(self):
        dataset = self._get_train_dataset()
        dataloader = data_utils.DataLoader(dataset, batch_size=self.args.batch_size,
                                           shuffle=True, pin_memory=True)
        return dataloader

    def _get_train_dataset(self):
        dataset = BertTrainDataset(self.train, self.max_len, self.mask_prob, self.CLOZE_MASK_TOKEN, self.item_count, self.rng)
        return dataset

    def _get_val_loader(self):
        return self._get_eval_loader(mode='val')

    def _get_test_loader(self):
        return self._get_eval_loader(mode='test')

    def _get_eval_loader(self, mode):
        batch_size = self.args.batch_size
        dataset = self._get_eval_dataset(mode)
        dataloader = data_utils.DataLoader(dataset, batch_size=batch_size,
                                           shuffle=False, pin_memory=True)
        return dataloader

    def _get_eval_dataset(self, mode):
        answers = self.val if mode == 'val' else self.test
        dataset = BertEvalDataset(self.train, answers, self.max_len, self.CLOZE_MASK_TOKEN, self.test_negative_samples)
        return dataset


class BertTrainDataset(data_utils.Dataset):
    def __init__(self, u2seq, max_len, mask_prob, mask_token, num_items, rng):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.mask_token = mask_token
        self.num_items = num_items
        self.rng = rng

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self._getseq(user)

        tokens = []
        labels = []
        for s in seq:
            prob = self.rng.random()
            if prob < self.mask_prob:
                prob /= self.mask_prob

                if prob < 0.8:
                    tokens.append(self.mask_token)
                elif prob < 0.9:
                    tokens.append(self.rng.randint(1, self.num_items))
                else:
                    tokens.append(s)

                labels.append(s)
            else:
                tokens.append(s)
                labels.append(0)

        tokens = tokens[-self.max_len:]
        labels = labels[-self.max_len:]

        mask_len = self.max_len - len(tokens)

        tokens = [0] * mask_len + tokens
        labels = [0] * mask_len + labels

        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def _getseq(self, user):
        return self.u2seq[user]


class BertEvalDataset(data_utils.Dataset):
    def __init__(self, u2seq, u2answer, max_len, mask_token, negative_samples):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.negative_samples = negative_samples

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self.u2seq[user]
        answer = self.u2answer[user]
        negs = self.negative_samples[user]

        candidates = answer + negs
        labels = [1] * len(answer) + [0] * len(negs)

        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [0] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(candidates), torch.LongTensor(labels)


In [40]:
dataloader = model_Dataloader(args, dataset)

Negatives samples exist. Loading.
Negatives samples exist. Loading.


In [41]:
train, val, test = dataloader.get_pytorch_dataloaders()


In [28]:
# Since we are considering the changing user preference overtime, we have to model the data as a timeseries and split train/test by specifying a specific timepoint to split the data (Hold-Out Sampling)

np.random.seed(10)
eval_set_size = int(len(result)*test_size)
user_count = len(result.userId.unique())

# Generate user indices
permuted_index = np.random.permutation(user_count)
train_user_index = permuted_index[                :-2*eval_set_size]
val_user_index   = permuted_index[-2*eval_set_size:  -eval_set_size]
test_user_index  = permuted_index[  -eval_set_size:                ]

# Split DataFrames
train_df = result.loc[result['userId'].isin(train_user_index)]
val_df   = result.loc[result['userId'].isin(val_user_index)]
test_df  = result.loc[result['userId'].isin(test_user_index)]

# DataFrame to dict => {uid : list of sid's}
train = dict(train_df.groupby('userId').progress_apply(lambda d: list(d['movieId'])))
val   = dict(val_df.groupby('userId').progress_apply(lambda d: list(d['movieId'])))
test  = dict(test_df.groupby('userId').progress_apply(lambda d: list(d['movieId'])))

with open('../data/bert_4_rec_data/train.pkl', "wb") as e:
    pickle.dump(train, e)

with open('../data/bert_4_rec_data/validation.pkl', "wb") as e:
    pickle.dump(val, e)

with open('../data/bert_4_rec_data/test.pkl', "wb") as e:
    pickle.dump(test, e)



0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 716/716 [00:00<00:00, 13545.82it/s]


In [None]:
### Preprocessing Data
#Probability Mass Distibution of user and movie
def densify_index(df):
    print('Densifying index')
    umap = {u: i for i, u in enumerate(set(df['userId']))}
    smap = {s: i for i, s in enumerate(set(df['movieId']))}
    df['uid'] = df['uid'].map(umap)
    df['sid'] = df['sid'].map(smap)
    return df, umap, smap

In [23]:
umap = {u: i for i, u in enumerate(set(result['userId']))}
len(umap)

10644

In [24]:
len(result)

886426

In [20]:
### Implement a Dataloader class

class model_Dataloader(metaclass=ABCMeta):
    def __init__(self, args, dataset):
        self.args = args
        seed = args.random_seed
        self.rng = random.Random(seed)
        self.train = dataset['train']
        self.val = dataset['val']
        self.test = dataset['test']
        self.umap = dataset['umap']
        self.smap = dataset['smap']
        self.user_count = len(self.umap)
        self.item_count = len(self.smap)
        self.max_len = args.bert_max_len
        self.mask_prob = args.bert_mask_prob
        self.CLOZE_MASK_TOKEN = self.item_count + 1

        code = args.train_negative_sampler_code
        train_negative_sampler = negative_sampler_factory(code, self.train, self.val, self.test,
                                                          self.user_count, self.item_count,
                                                          args.train_negative_sample_size,
                                                          args.train_negative_sampling_seed,
                                                          self.save_folder)
        code = args.test_negative_sampler_code
        test_negative_sampler = negative_sampler_factory(code, self.train, self.val, self.test,
                                                         self.user_count, self.item_count,
                                                         args.test_negative_sample_size,
                                                         args.test_negative_sampling_seed,
                                                         self.save_folder)

        self.train_negative_samples = train_negative_sampler.get_negative_samples()
        self.test_negative_samples = test_negative_sampler.get_negative_samples()


In [None]:
from .base import AbstractDataloader
from .negative_samplers import negative_sampler_factory

import torch
import torch.utils.data as data_utils


class BertDataloader(AbstractDataloader):
    def __init__(self, args, dataset):
        super().__init__(args, dataset)
        args.num_items = len(self.smap)
        self.max_len = args.bert_max_len
        self.mask_prob = args.bert_mask_prob
        self.CLOZE_MASK_TOKEN = self.item_count + 1

        code = args.train_negative_sampler_code
        train_negative_sampler = negative_sampler_factory(code, self.train, self.val, self.test,
                                                          self.user_count, self.item_count,
                                                          args.train_negative_sample_size,
                                                          args.train_negative_sampling_seed,
                                                          self.save_folder)
        code = args.test_negative_sampler_code
        test_negative_sampler = negative_sampler_factory(code, self.train, self.val, self.test,
                                                         self.user_count, self.item_count,
                                                         args.test_negative_sample_size,
                                                         args.test_negative_sampling_seed,
                                                         self.save_folder)

        self.train_negative_samples = train_negative_sampler.get_negative_samples()
        self.test_negative_samples = test_negative_sampler.get_negative_samples()

    @classmethod
    def code(cls):
        return 'bert'

    def get_pytorch_dataloaders(self):
        train_loader = self._get_train_loader()
        val_loader = self._get_val_loader()
        test_loader = self._get_test_loader()
        return train_loader, val_loader, test_loader

    def _get_train_loader(self):
        dataset = self._get_train_dataset()
        dataloader = data_utils.DataLoader(dataset, batch_size=self.args.batch_size,
                                           shuffle=True, pin_memory=True)
        return dataloader

    def _get_train_dataset(self):
        dataset = BertTrainDataset(self.train, self.max_len, self.mask_prob, self.CLOZE_MASK_TOKEN, self.item_count, self.rng)
        return dataset

    def _get_val_loader(self):
        return self._get_eval_loader(mode='val')

    def _get_test_loader(self):
        return self._get_eval_loader(mode='test')

    def _get_eval_loader(self, mode):
        batch_size = self.args.val_batch_size if mode == 'val' else self.args.test_batch_size
        dataset = self._get_eval_dataset(mode)
        dataloader = data_utils.DataLoader(dataset, batch_size=batch_size,
                                           shuffle=False, pin_memory=True)
        return dataloader

    def _get_eval_dataset(self, mode):
        answers = self.val if mode == 'val' else self.test
        dataset = BertEvalDataset(self.train, answers, self.max_len, self.CLOZE_MASK_TOKEN, self.test_negative_samples)
        return dataset


class BertTrainDataset(data_utils.Dataset):
    def __init__(self, u2seq, max_len, mask_prob, mask_token, num_items, rng):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.mask_token = mask_token
        self.num_items = num_items
        self.rng = rng

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self._getseq(user)

        tokens = []
        labels = []
        for s in seq:
            prob = self.rng.random()
            if prob < self.mask_prob:
                prob /= self.mask_prob

                if prob < 0.8:
                    tokens.append(self.mask_token)
                elif prob < 0.9:
                    tokens.append(self.rng.randint(1, self.num_items))
                else:
                    tokens.append(s)

                labels.append(s)
            else:
                tokens.append(s)
                labels.append(0)

        tokens = tokens[-self.max_len:]
        labels = labels[-self.max_len:]

        mask_len = self.max_len - len(tokens)

        tokens = [0] * mask_len + tokens
        labels = [0] * mask_len + labels

        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def _getseq(self, user):
        return self.u2seq[user]



class BertEvalDataset(data_utils.Dataset):
    def __init__(self, u2seq, u2answer, max_len, mask_token, negative_samples):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.negative_samples = negative_samples

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self.u2seq[user]
        answer = self.u2answer[user]
        negs = self.negative_samples[user]

        candidates = answer + negs
        labels = [1] * len(answer) + [0] * len(negs)

        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [0] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(candidates), torch.LongTensor(labels)

In [None]:
### Processing the data

def convert_time(x):
    timestamp = int(time.mktime(x.timetuple()))
    return timestamp

def clean_user(x):
    x = x.strip("::")
    return x 

def strain(df, column, cutoffs):
    criteria = (lambda x: (len(x) > cutoffs[0]) & (len(x) < cutoffs[1]))
    subset = df.groupby(column).filter(criteria)
    return subset

def convert(path, data):
    with open(path, 'w') as filehandle:
        for x in data:
            user = str(x[0])
            item = str(x[1])
            rating = str(x[2])
            timestamp = str(x[3])
            line = user + "::" + item + "::" + rating + "::" + timestamp
            filehandle.write('%s\n' % line)

df = pd.read_json('movies_total.json')
df['Date'] = df['Date'].apply(convert_time)
df['User'] = df['User'].apply(clean_user)
df.rename(columns = {'Date': 'Timestamp'}, inplace=True)
data = sub[['User', 'Movie ID', 'Rating', 'Timestamp']].values.tolist()
sub = strain(df, "Movie ID", [10, 10000])
sub = strain(sub, "User", [5, 10000])

print("The total number of users:", len(sub['User'].value_counts()))
print("The total number of movies:", len(sub['Movie ID'].value_counts()))

path = 'full_ratings.dat'
convert(path, data)

In [None]:
## For this model we only need the history of ratings
def getRatings(df):
    result = pd.DataFrame()
    result[userId] = df[]


## Get Size of the Data
def get_count(tp, id):
    groups = tp[[id]].groupby(id, as_index=False)
    count = groups.size()
    return count

#Get movies and users which share at least 3 intersection
def filter_triplets(tp, min_uc=5, min_sc=0):
    # Only keep the triplets for items which were clicked on by at least min_sc users.
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]

    # Only keep the triplets for users who clicked on at least min_uc items
    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]

    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
    return tp, usercount, itemcount

#Possible use to reduce size of data (we are only interested in the movies that were rated 4-5 )
def make_implicit(df, min_rating):
    print('Turning into implicit ratings')
    df = df[df['rating'] >= min_rating]
    # return df[['uid', 'sid', 'timestamp']]
    return df


#Probability Mass Distibution of user and movie
def densify_index(df):
    print('Densifying index')
    umap = {u: i for i, u in enumerate(set(df['uid']))}
    smap = {s: i for i, s in enumerate(set(df['sid']))}
    df['uid'] = df['uid'].map(umap)
    df['sid'] = df['sid'].map(smap)
    return df, umap, smap



In [None]:
import pandas as pd

from datetime import date

#Create a Dataset Class that cr
class ML20MDataset(metaclass=ABCMeta):
    def __init__(self, args):
        self.args = args
        self.min_rating = args.min_rating
        self.min_uc = args.min_uc
        self.min_sc = args.min_sc
        self.split = args.split

        assert self.min_uc >= 2, 'Need at least 2 ratings per user for validation and test'
    
    def make_implicit(self, df):
        print('Turning into implicit ratings')
        df = df[df['rating'] >= self.min_rating]
        # return df[['uid', 'sid', 'timestamp']]
        return df



    def densify_index(self, df):
        print('Densifying index')
        umap = {u: i for i, u in enumerate(set(df['uid']))}
        smap = {s: i for i, s in enumerate(set(df['sid']))}
        df['uid'] = df['uid'].map(umap)
        df['sid'] = df['sid'].map(smap)
        return df, umap, smap

    def split_df(self, df, user_count):
        if self.args.split == 'leave_one_out':
            print('Splitting')
            user_group = df.groupby('uid')
            user2items = user_group.progress_apply(lambda d: list(d.sort_values(by='timestamp')['sid']))
            train, val, test = {}, {}, {}
            for user in range(user_count):
                items = user2items[user]
                train[user], val[user], test[user] = items[:-2], items[-2:-1], items[-1:]
            return train, val, test
        elif self.args.split == 'holdout':
            print('Splitting')
            np.random.seed(self.args.dataset_split_seed)
            eval_set_size = self.args.eval_set_size

            # Generate user indices
            permuted_index = np.random.permutation(user_count)
            train_user_index = permuted_index[                :-2*eval_set_size]
            val_user_index   = permuted_index[-2*eval_set_size:  -eval_set_size]
            test_user_index  = permuted_index[  -eval_set_size:                ]

            # Split DataFrames
            train_df = df.loc[df['uid'].isin(train_user_index)]
            val_df   = df.loc[df['uid'].isin(val_user_index)]
            test_df  = df.loc[df['uid'].isin(test_user_index)]

            # DataFrame to dict => {uid : list of sid's}
            train = dict(train_df.groupby('uid').progress_apply(lambda d: list(d['sid'])))
            val   = dict(val_df.groupby('uid').progress_apply(lambda d: list(d['sid'])))
            test  = dict(test_df.groupby('uid').progress_apply(lambda d: list(d['sid'])))
            return train, val, test
        else:
            raise NotImplementedError

    def all_raw_file_names(cls):
        return ['genome-scores.csv',
                'genome-tags.csv',
                'links.csv',
                'movies.csv',
                'ratings.csv',
                'README.txt',
                'tags.csv']

    
    def load_ratings_df(self):
        folder_path = self._get_rawdata_folder_path()
        file_path = folder_path.joinpath('ratings.csv')
        df = pd.read_csv(file_path)
        df.columns = ['uid', 'sid', 'rating', 'timestamp']
        return df

       def load_dataset(self):
        self.preprocess()
        dataset_path = self._get_preprocessed_dataset_path()
        dataset = pickle.load(dataset_path.open('rb'))
        return dataset

    def preprocess(self):
        dataset_path = self._get_preprocessed_dataset_path()
        if dataset_path.is_file():
            print('Already preprocessed. Skip preprocessing')
            return
        if not dataset_path.parent.is_dir():
            dataset_path.parent.mkdir(parents=True)
        self.maybe_download_raw_dataset()
        df = self.load_ratings_df()
        df = self.make_implicit(df)
        df = self.filter_triplets(df)
        df, umap, smap = self.densify_index(df)
        train, val, test = self.split_df(df, len(umap))
        dataset = {'train': train,
                   'val': val,
                   'test': test,
                   'umap': umap,
                   'smap': smap}
        with dataset_path.open('wb') as f:
            pickle.dump(dataset, f)


    def _get_rawdata_root_path(self):
        return Path(RAW_DATASET_ROOT_FOLDER)

    def _get_rawdata_folder_path(self):
        root = self._get_rawdata_root_path()
        return root.joinpath(self.raw_code())

    def _get_preprocessed_root_path(self):
        root = self._get_rawdata_root_path()
        return root.joinpath('preprocessed')

    def _get_preprocessed_folder_path(self):
        preprocessed_root = self._get_preprocessed_root_path()
        folder_name = '{}_min_rating{}-min_uc{}-min_sc{}-split{}' \
            .format(self.code(), self.min_rating, self.min_uc, self.min_sc, self.split)
        return preprocessed_root.joinpath(folder_name)

    def _get_preprocessed_dataset_path(self):
        folder = self._get_preprocessed_folder_path()
        return folder.joinpath('dataset.pkl')