In [2]:
import numpy as np 
import pandas as pd 
from nfc.src.mlp import MLP as mlp
from scipy.sparse import csr_matrix
import os
import sys

In [3]:
# Root folder holding ratings.csv. Override it with the ML20M_DATA_DIR
# environment variable instead of editing this machine-specific default.
DATA_DIR = os.environ.get('ML20M_DATA_DIR', '/home/timos/Downloads/')
# Ratings strictly above this value are kept as positive (implicit) feedback.
MIN_RATING = 3.5

raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), header=0)
raw_data = raw_data[raw_data['rating'] > MIN_RATING]
# Preview only: the filtered frame still holds millions of rows.
raw_data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703
11,1,296,4.0,1112484767
12,1,318,4.0,1112484798
15,1,541,4.0,1112484603
22,1,1036,4.0,1112485480
23,1,1079,4.0,1094785665


In [4]:
def get_count(tp, id):
    """Count how many rows of ``tp`` share each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Frame that contains the column named by ``id``.
    id : str
        Name of the column to group by.

    Returns
    -------
    pandas.Series
        Row counts, indexed by the distinct values of ``tp[id]``.
    """
    # Group with the key as the index. With ``as_index=False`` newer pandas
    # releases return a DataFrame from ``size()``, so callers could no longer
    # use ``count.index`` as the ids or build a boolean mask over the counts.
    return tp[[id]].groupby(id).size()

def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rarely rated movies and low-activity users from ``tp``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interactions with at least the columns ``userId`` and ``movieId``.
    min_uc : int
        Minimum number of rows a user needs to be kept (0 disables the filter).
    min_sc : int
        Minimum number of rows a movie needs to be kept (0 disables the filter).

    Returns
    -------
    tuple
        ``(filtered_tp, usercount, itemcount)`` where the two counts are
        recomputed on the filtered frame with ``get_count``.
    """
    # Movies are filtered first, so the user counts below are taken on the
    # already reduced frame.
    if min_sc > 0:
        movie_counts = get_count(tp, 'movieId')
        kept_movies = movie_counts.index[movie_counts >= min_sc]
        tp = tp[tp['movieId'].isin(kept_movies)]

    # Removing users afterwards can push some movies back below min_sc;
    # this is accepted and not corrected by a second pass.
    if min_uc > 0:
        user_counts = get_count(tp, 'userId')
        kept_users = user_counts.index[user_counts >= min_uc]
        tp = tp[tp['userId'].isin(kept_users)]

    # Recount on the final frame so the returned statistics match it.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [11]:
# Keep only users with at least 5 retained ratings. filter_triplets is called
# with its defaults (min_uc=5, min_sc=0), so movies are NOT filtered here.
raw_data, user_activity, item_popularity = filter_triplets(raw_data)
# Observed fraction of the user x movie matrix. The name and the printed label
# say "sparsity", but the value computed here is the density.
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

unique_uid = user_activity.index

# Shuffle the user ids with a fixed seed so the split is reproducible.
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

# create train/validation/test users
n_users = unique_uid.size
n_heldout_users = 10000

# With too few users the slice bounds below turn negative (or leave training
# empty) and the three groups silently overlap, so fail loudly instead.
if n_users <= 2 * n_heldout_users:
    raise ValueError("need more than %d users to hold out %d each for validation and test, got %d"
                     % (2 * n_heldout_users, n_heldout_users, n_users))

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

After filtering, there are 9990682 watching events from 136677 users and 20720 movies (sparsity: 0.353%)


In [12]:
user_activity.index


Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            138483, 138484, 138485, 138486, 138487, 138489, 138490, 138491,
            138492, 138493],
           dtype='int64', name='userId', length=136677)

In [13]:
def split_train_test_proportion(data, test_prop=0.2, min_items=5):
    """Hold out a random fraction of every user's rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Interactions with a ``userId`` column.
    test_prop : float
        Fraction of each eligible user's rows moved to the held-out part;
        the number of held-out rows is ``int(test_prop * n_rows)``.
    min_items : int
        Users with fewer rows than this keep all their rows in the training
        part. Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_tr, data_te)``. Either frame is empty, with the columns of
        ``data``, when no rows were assigned to it.
    """
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    # Re-seed the global NumPy RNG on every call so that repeated calls on
    # the same data give the same split.
    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= min_items:
            idx = np.zeros(n_items_u, dtype='bool')
            held_out = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            # astype keeps an empty draw usable as an integer index.
            idx[held_out.astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises on an empty list, which happens when the input is empty
    # or when no user reaches min_items; return empty frames instead.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [14]:
# Training interactions: every retained rating made by a user in the training group.
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]

In [15]:
# Movie vocabulary: the distinct movie ids rated by training users only.
# Validation/test rows for other movies are dropped later with isin(unique_sid).
unique_sid = pd.unique(train_plays['movieId'])

In [16]:
# Lookup tables from raw ids to contiguous 0-based indices: movies follow the
# order of the training vocabulary, users follow the shuffled order of unique_uid.
show2id = {sid: position for position, sid in enumerate(unique_sid)}
profile2id = {pid: position for position, pid in enumerate(unique_uid)}

In [17]:
# Folder that receives the processed splits.
pro_dir = os.path.join(DATA_DIR, 'pro_sg')
# exist_ok=True removes the check-then-create race of os.path.exists + makedirs.
os.makedirs(pro_dir, exist_ok=True)

# Persist the movie vocabulary, one raw movieId per line; line i holds the
# movie whose index in show2id is i (both enumerate unique_sid in order).
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

In [33]:
# Validation interactions: ratings made by validation users ...
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]

# ... restricted to movies that appear in the training vocabulary.
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

In [19]:
# Split each validation user's rows with the default test_prop=0.2:
# *_tr keeps the larger part, *_te holds the randomly held-out rows.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [20]:
# Test interactions: ratings made by test users, restricted to movies that
# appear in the training vocabulary.
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

In [21]:
# Split each test user's rows with the default test_prop=0.2:
# *_tr keeps the larger part, *_te holds the randomly held-out rows.
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [22]:
def numerize(tp):
    """Translate the raw ids in ``tp`` into the indices of the lookup tables.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interactions with ``userId`` and ``movieId`` columns. Every id must be
        a key of the module-level ``profile2id`` / ``show2id`` dictionaries,
        otherwise a ``KeyError`` is raised.

    Returns
    -------
    pandas.DataFrame
        Columns ``uid`` and ``sid`` (in that order), one row per row of ``tp``,
        with a fresh default index.
    """
    raw_users, raw_movies = tp['userId'], tp['movieId']
    user_indices = [profile2id[raw_uid] for raw_uid in raw_users]
    movie_indices = [show2id[raw_sid] for raw_sid in raw_movies]
    return pd.DataFrame({'uid': user_indices, 'sid': movie_indices}, columns=['uid', 'sid'])

In [23]:
# Index-encode the training interactions and write them as (uid, sid) pairs.
train_data = numerize(train_plays)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

In [24]:
# Index-encode the kept part of the validation users' rows and write it out.
vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

In [25]:
# Index-encode the held-out part of the validation users' rows and write it out.
vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

In [26]:
# Index-encode the kept part of the test users' rows and write it out.
test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

In [None]:
# Index-encode the held-out part of the test users' rows and write it out.
test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

In [32]:
# NOTE: a notebook cell only displays its last expression, so the max() on the
# next line is computed and discarded; the cell output is the raw uid array.
test_data_te.uid.values.max()
test_data_te.uid.values

array([134677, 134677, 134677, ..., 135470, 135470, 135470])

In [5]:
# Headers and imports
import numpy as np 
import pandas as pd 
import mf
from nfc.src.mlp import MLP as mlp
from scipy.sparse import csr_matrix
from mf import MF 
import os
import sys

from pytorch.torchmf import BasePipeline,bpr_loss,BPRModule,PairwiseInteractions
from scipy.sparse import coo_matrix
import torch

# Reload the index-encoded (uid, sid) splits.
# NOTE(review): these paths are relative ('ml-20m/pro_sg/'), while the
# preprocessing cells write under DATA_DIR/pro_sg; confirm both point to the
# same folder.
train_data = pd.read_csv('ml-20m/pro_sg/train.csv')
test_data_tr = pd.read_csv('ml-20m/pro_sg/test_tr.csv')
# Matrix dimensions, hard-coded to the user and movie counts printed after
# filtering (136677 users, 20720 movies).
users = 136677 
movies = 20720


In [6]:
# Training interactions as a sparse matrix: one stored 1.0 per (uid, sid) row.
users_train, movies_train = train_data.uid.values, train_data.sid.values
data_train = np.ones(len(users_train))
coo_train = coo_matrix((data_train, (users_train, movies_train)), shape=(users, movies))

# Distinct user / movie indices present in test_data_tr.
# NOTE: despite the "n_" prefix these are arrays of ids, not counts.
n_test_users, n_test_movies = test_data_tr.uid.unique(), test_data_tr.sid.unique()

# Same construction for the test rows; both matrices share the (users, movies) shape.
users_test, movies_test = test_data_tr.uid.values, test_data_tr.sid.values
data_test = np.ones(len(users_test))
coo_test = coo_matrix((data_test, (users_test, movies_test)), shape=(users, movies))

In [7]:
# Configure the torchmf pipeline with the BPR model and BPR loss, trained with
# SGD on pairwise interactions and evaluated with the 'auc' and 'patk' metrics.
# NOTE(review): test=coo_test is built from test_tr.csv, whose users are
# disjoint from the training users and whose held-out rows live in
# test_te.csv; confirm this is the intended evaluation set.
pipeline = BasePipeline(train= coo_train, test=coo_test, verbose=True,
                        batch_size=1024, num_workers=4,
                        n_factors=20, weight_decay=0,
                        dropout_p=0., lr=.2, sparse=True,
                        optimizer=torch.optim.SGD, n_epochs=40,
                        random_seed=2017, loss_function=bpr_loss,
                        model=BPRModule, hogwild=True,
                        interaction_class=PairwiseInteractions,
                        eval_metrics=('auc', 'patk'))


In [88]:
import torch
import numpy as np
from spotlight.datasets.movielens import get_movielens_dataset,_get_movielens
from utils.helper_functions import make_implicit
from utils.arg_extractor import get_args
from spotlight.datasets import _transport
from spotlight.interactions import Interactions
import pandas as pd

In [None]:
# Fetch the four parallel columns of the 'movielens_20M' variant.
# NOTE: this rebinds `users`, which an earlier cell used for a matrix dimension.
users, items, ratings, timestamps = _get_movielens('movielens_20M')


In [121]:
# Wrap the four parallel arrays in a spotlight Interactions container.
I = Interactions(users,items,ratings,timestamps)

In [122]:
I

<Interactions dataset (138494 users x 26745 items x 9990682 interactions)>

In [7]:
import torch
import numpy as np
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score,precision_recall_score
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.factorization.representations import BilinearNet
from utils.helper_functions import make_implicit
import logging
import spotlight.optimizers as optimizers

# Which MovieLens variant to load and where its files are stored.
dataset_name = '100K'
path = 'datasets/movielens/'

logging.info("DataSet MovieLens_%s will be used", dataset_name)

# Load the ratings first ...
dataset = get_movielens_dataset(variant=dataset_name, path=path)

# ... then transform the dataset to implicit feedback.
dataset = make_implicit(dataset)

DataSet MovieLens_100K will be used
Data will be read from file: datasets/movielens/movielens_100K.hdf5


In [8]:
dataset


<Interactions dataset (938 users x 1447 items x 55361 interactions)>

In [11]:
import os
import requests
import zipfile

import numpy as np
import pandas as pd
import scipy.sparse as sp

"""
Shamelessly stolen from
https://github.com/maciejkula/triplet_recommendations_keras
"""


def train_test_split(interactions, n=10, random_state=None):
    """
    Split an interactions matrix into training and test sets.

    For every row (user) with more than ``n`` non-zero entries, ``n`` of those
    entries are picked at random, zeroed in the training copy and copied into
    the test matrix. Rows with ``n`` or fewer non-zero entries stay entirely
    in the training set.

    Parameters
    ----------
    interactions : np.ndarray
        2-D users x items matrix; zero means "no interaction".
    n : int (default=10)
        Number of items to select / row to place into test.
    random_state : int or None (default=None)
        Seed for a private ``np.random.RandomState``. When None, the global
        NumPy random state is used, exactly as before this parameter existed.

    Returns
    -------
    train : np.ndarray
    test : np.ndarray
    """
    # The np.random module and a RandomState instance both expose choice().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    test = np.zeros(interactions.shape)
    train = interactions.copy()
    for user in range(interactions.shape[0]):
        # Column indices of this user's interactions, computed once per row.
        rated_items = interactions[user, :].nonzero()[0]
        if rated_items.shape[0] > n:
            test_interactions = rng.choice(rated_items, size=n, replace=False)
            train[user, test_interactions] = 0.
            test[user, test_interactions] = interactions[user, test_interactions]

    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return train, test


def _get_data_path():
    """
    Get path to the movielens dataset file.
    """
    data_path = os.path.join(os.path.dirname(os.path.abspath('__file__')),
                        'data')
    if not os.path.exists(data_path):
        print('Making data path')
        os.mkdir(data_path)
    return data_path


def _download_movielens(dest_path):
    """
    Download the MovieLens 100K archive into ``dest_path`` and extract it there.

    Parameters
    ----------
    dest_path : str
        Existing directory that receives ``ml-100k.zip`` and its contents.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """

    url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    zip_path = os.path.join(dest_path, 'ml-100k.zip')
    # Download under a temporary name: an interrupted or failed transfer must
    # not leave an 'ml-100k.zip' behind, because read_movielens_df skips the
    # download as soon as that file exists.
    partial_path = zip_path + '.part'

    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True) as req:
        print('Downloading MovieLens data')
        # Fail here instead of saving an HTML error page as the archive.
        req.raise_for_status()

        with open(partial_path, 'wb') as fd:
            for chunk in req.iter_content(chunk_size=None):
                fd.write(chunk)

    os.replace(partial_path, zip_path)

    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(dest_path)


def read_movielens_df():
    """
    Load the MovieLens 100K ratings, downloading the archive if it is missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``, read
        from the tab-separated ``ml-100k/u.data`` file.
    """
    data_dir = _get_data_path()
    archive_path = os.path.join(data_dir, 'ml-100k.zip')
    # Only the presence of the archive is checked, not of the extracted files.
    if not os.path.isfile(archive_path):
        _download_movielens(data_dir)

    ratings_file = os.path.join(data_dir, 'ml-100k', 'u.data')
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    return pd.read_csv(ratings_file, sep='\t', names=column_names)


def get_movielens_interactions():
    """
    Build the dense users x items rating matrix from the MovieLens frame.

    Returns
    -------
    np.ndarray
        Float matrix of shape ``(n_users, n_items)``; entry
        ``[user_id - 1, item_id - 1]`` holds the rating and is 0 where the
        frame has no row for that pair.
    """
    df = read_movielens_df()

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    interactions = np.zeros((n_users, n_items))
    # One vectorised assignment instead of a Python loop over itertuples().
    # Ids are shifted to 0-based positions, which assumes they run
    # contiguously from 1 to the number of distinct ids.
    interactions[df.user_id.values - 1, df.item_id.values - 1] = df.rating.values
    return interactions


def get_movielens_train_test_split(implicit=False):
    """
    Load MovieLens 100K and return sparse train / test interaction matrices.

    Parameters
    ----------
    implicit : bool (default=False)
        When True, ratings of 4 or more become 1.0 and everything else 0.0
        before splitting; when False the raw rating values are kept.

    Returns
    -------
    train : scipy.sparse.coo_matrix
    test : scipy.sparse.coo_matrix
    """
    # The debug print of the full dense matrix that used to sit here is gone.
    interactions = get_movielens_interactions()
    if implicit:
        interactions = (interactions >= 4).astype(np.float32)
    train, test = train_test_split(interactions)
    return sp.coo_matrix(train), sp.coo_matrix(test)

In [12]:
# implicit defaults to False, so both matrices keep the raw rating values.
train,test = get_movielens_train_test_split()

Making data path
Downloading MovieLens data


In [13]:
train

<943x1682 sparse matrix of type '<class 'numpy.float64'>'
	with 90570 stored elements in COOrdinate format>

In [14]:
test

<943x1682 sparse matrix of type '<class 'numpy.float64'>'
	with 9430 stored elements in COOrdinate format>