### 1. Linear User Integration

In [1]:
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

class LinearGRU(nn.Module):
    """GRU over concatenated user and item embeddings.

    Every step of a sequence is described by a user id and an item id. Both
    ids are embedded, the two embeddings are concatenated, and the resulting
    sequence is run through a GRU. The GRU outputs are normalised with a
    log-softmax over the hidden dimension.

    Args:
        n_users: number of rows of the user embedding table.
        n_items: number of rows of the item embedding table.
        emb_size: width of each embedding; defaults to ``hidden_units``.
        hidden_units: GRU hidden size, which is also the output width.
        dropout: dropout value forwarded to ``nn.GRU``.
    """

    def __init__(self, n_users, n_items, emb_size=None, hidden_units=1000, dropout=0):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as LinearGRU is subclassed.
        super(LinearGRU, self).__init__()
        self.n_users = n_users
        self.n_items = n_items
        if emb_size is None:
            emb_size = hidden_units
        self.emb_size = emb_size
        # Embeddings turn integer ids into dense vectors the GRU can consume.
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)
        # The GRU sees one user embedding and one item embedding per step, so
        # its input width is 2 * emb_size (not n_users + n_items, which only
        # coincides when emb_size == n_users == n_items).
        self.gru = nn.GRU(input_size=2 * emb_size,
                          hidden_size=hidden_units,
                          dropout=dropout,
                          batch_first=True)

    def forward(self, user_vectors, item_vectors):
        """Run the model on a batch of id sequences.

        Args:
            user_vectors: integer tensor of user ids, shape (batch, seq).
            item_vectors: integer tensor of item ids, shape (batch, seq).

        Returns:
            Tensor of shape (batch, seq, hidden_units) holding log-softmax
            values over the last dimension.
        """
        # nn.Embedding maps (batch, seq) ids straight to (batch, seq, emb_size),
        # so no unsqueeze/squeeze/view juggling is required.
        users = self.user_emb(user_vectors)
        items = self.item_emb(item_vectors)

        gru_output, _ = self.gru(torch.cat([users, items], dim=-1))

        return F.log_softmax(gru_output, dim=-1)
    


In [2]:
# Toy batch: a single sequence of four steps in which user 1 interacts with
# items 0, 1, 2 and 1 again.
users = np.array([[1, 1, 1, 1]])
items = np.array([[0, 1, 2, 1]])
# Smallest useful model: 3 users, 3 items, 3-wide embeddings, 3 hidden units.
network = LinearGRU(n_users=3, n_items=3, emb_size=3, hidden_units=3)
print(users.shape)

(1, 4)


In [3]:
# Wrap the numpy batches for autograd and run one forward pass; the call is
# the cell's last expression so its result is displayed.
network(
    Variable(torch.from_numpy(users)),
    Variable(torch.from_numpy(items))
)

Variable containing:
(0 ,.,.) = 
 -1.1557 -1.4285 -0.8086
 -1.1543 -1.5384 -0.7551
 -1.0460 -1.8017 -0.7265
 -1.0773 -1.6558 -0.7581
[torch.FloatTensor of size 1x4x3]

In [126]:
# Libraries and provided functions
import pandas as pd
import zipfile
import wget
from io import StringIO 
import numpy as np
import scipy as sp
from scipy import sparse
import scipy.sparse.linalg
from tqdm import tqdm # Very useful library to see progress bar during range iterations: just type `for i in tqdm(range(10)):`
from matplotlib import pyplot as plt
%matplotlib inline

from collections import namedtuple
import sys

def normalize_timestamp(x):
    """Replace the 'timestamp' column with each row's chronological rank.

    The earliest row gets 0, the next one 1, and so on up to ``len(x) - 1``.
    Equal timestamps are ranked in their order of appearance.

    Args:
        x: DataFrame with a 'timestamp' column (one group of a groupby).

    Returns:
        A copy of ``x`` with 'timestamp' replaced by integer ranks. The input
        frame is left untouched, because mutating the object handed over by
        ``groupby.apply`` is not supported by pandas.
    """
    stamps = np.asarray(x["timestamp"])
    # argsort alone answers "which row comes k-th?", which is the inverse of
    # what is needed. Scattering 0..n-1 through that permutation yields each
    # row's own position in time order. mergesort keeps ties stable.
    order = np.argsort(stamps, kind="mergesort")
    ranks = np.empty(len(order), dtype=np.int64)
    ranks[order] = np.arange(len(order))

    result = x.copy()
    result["timestamp"] = ranks
    return result

def length_col(x):
    """Return a copy of ``x`` whose 'timestamp' column holds the row count of ``x``.

    Used with ``groupby(...).apply`` to attach the group size to every row of
    the group. Note that the size is written into the 'timestamp' column, so
    callers read the count back from ``result['timestamp']``.

    Args:
        x: DataFrame (one group of a groupby).

    Returns:
        A copy of ``x`` with every 'timestamp' value set to ``len(x)``. The
        input frame is not modified, because mutating the object handed over
        by ``groupby.apply`` is not supported by pandas.
    """
    result = x.copy()
    result['timestamp'] = len(x)
    return result

def get_movielens_data(local_file=None, max_points=650):
    '''Load MovieLens-10M ratings and return a sparse user-by-step matrix.

    Steps:
      1. read ``ml-10M100K/ratings.dat`` from the zip archive (downloaded with
         ``wget`` when ``local_file`` is not given);
      2. renumber user ids and movie ids to 0..n-1 in order of first appearance;
      3. drop rows that repeat a (userid, timestamp) pair;
      4. replace each user's timestamps by their chronological rank 0..k-1;
      5. keep only users with fewer than ``max_points`` remaining rows.

    Args:
        local_file: path to a local copy of ``ml-10m.zip``; when falsy the
            archive is downloaded first.
        max_points: users with this many rows or more are removed.

    Returns:
        scipy CSR matrix of dtype float64 and shape
        (max userid + 1, max step + 1) whose entry [user, step] is the
        renumbered id of the movie rated by ``user`` at ``step``.
    '''
    if not local_file:
        print('Downloading data...')
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip'
        zip_contents = wget.download(zip_file_url)
        print('Done.')
    else:
        zip_contents = local_file

    print('Loading data into memory...')
    with zipfile.ZipFile(zip_contents) as zfile:
        zdata = zfile.read('ml-10M100K/ratings.dat').decode()
    delimiter = ';'
    zdata = zdata.replace('::', delimiter) # makes data compatible with pandas c-engine
    ml_data = pd.read_csv(StringIO(zdata), sep=delimiter, header=None, engine='c',
                          names=['userid', 'movieid', 'rating', 'timestamp'],
                          usecols=['userid', 'movieid', 'rating', 'timestamp'])

    print("Normalizing indices to avoid gaps")
    # pd.factorize numbers values in order of first appearance, which is what
    # the private groupby(..., sort=False).grouper.group_info[0] used to give.
    ml_data['movieid'] = pd.factorize(ml_data['movieid'])[0]
    ml_data['userid'] = pd.factorize(ml_data['userid'])[0]

    # Remove time duplicates
    print("Removing time duplicates")
    ml_data = ml_data.drop_duplicates(['userid', 'timestamp'])

    # Normalize time for users: rank(method='first') is 1-based, hence the -1.
    # A plain argsort would yield the sorting permutation rather than each
    # row's rank and therefore scramble the order of the steps.
    print("Normalizing stars of time for users")
    steps = ml_data.groupby('userid')['timestamp'].rank(method='first').astype(np.int64) - 1
    ml_data = ml_data.assign(timestamp=steps)

    print("Removing data with more then %d points" % max_points)
    points_per_user = ml_data['userid'].map(ml_data['userid'].value_counts())
    ml_data = ml_data[points_per_user < max_points]

    # build sparse user-step matrix; the shape must be a plain tuple of ints
    data_shape = (int(ml_data['userid'].max()) + 1,
                  int(ml_data['timestamp'].max()) + 1)

    # NOTE(review): movie ids start at 0, so the first movie is stored as an
    # explicit zero and cannot be told apart from an empty cell once the
    # matrix is densified. Shifting ids by one would change the returned
    # values, so it is only flagged here.
    data_matrix = sp.sparse.csr_matrix((ml_data['movieid'].values,
                                        (ml_data['userid'].values,
                                         ml_data['timestamp'].values)),
                                       shape=data_shape, dtype=np.float64)

    print('Done.')
    return data_matrix

def split_data(data, test_ratio=0.2, seed=None):
    '''Randomly splits the rows of data into training and testing datasets.
    Default ratio is 80%/20%. Returns datasets in namedtuple format for
    convenience. Usage:

    train_data, test_data = split_data(data_matrix)

    or

    movielens_data = split_data(data_matrix)

    and later in code:

    do smth with movielens_data.train
    do smth with movielens_data.test

    Args:
        data: 2-D array-like supporting ``data[mask, :]`` row selection with a
            boolean mask (e.g. a numpy array or a scipy CSR matrix).
        test_ratio: fraction of rows assigned to the test set; the number of
            test rows is ``int(test_ratio * n_rows)``.
        seed: optional seed for a reproducible split. When None (default) the
            global numpy random state is used.

    Returns:
        namedtuple ``MovieLens10M(train, test)``.
    '''
    num_users = data.shape[0]
    num_test = int(test_ratio * num_users)

    # A private RandomState keeps a seeded split independent of (and without
    # side effects on) the global numpy random state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sel = rng.choice(num_users, num_test, replace=False)

    idx = np.zeros((num_users,), dtype=bool)
    idx[sel] = True

    Movielens_data = namedtuple('MovieLens10M', ['train', 'test'])
    return Movielens_data(train=data[~idx, :], test=data[idx, :])

# Build the user-by-step matrix from the archive already present next to the
# notebook, so nothing is downloaded.
Data = get_movielens_data(local_file="ml-10m.zip")

Loading data into memory...
Normalizing indices to avoid gaps
Removing time duplicates
Normalizing stars of time for users
Removing data with less then 650 points
Done.


In [127]:
# The sparse matrix reports its own shape; densifying it first would allocate
# the full user-by-step float64 array only to throw it away.
Data.shape

(69878, 649)

In [108]:
# Scratch run of the get_movielens_data() preprocessing steps that keeps the
# intermediate frames (`ml_data`, `lc`) around for inspection.
print('Loading data into memory...')
with zipfile.ZipFile('ml-10m.zip') as zfile:
    zdata = zfile.read('ml-10M100K/ratings.dat').decode()
delimiter = ';'
zdata = zdata.replace('::', delimiter) # makes data compatible with pandas c-engine
ml_data = pd.read_csv(StringIO(zdata), sep=delimiter, header=None, engine='c',
                      names=['userid', 'movieid', 'rating', 'timestamp'],
                      usecols=['userid', 'movieid', 'rating', 'timestamp'])

print("Normalizing indices to avoid gaps")
# pd.factorize numbers values in order of first appearance, which is what the
# private groupby(..., sort=False).grouper.group_info[0] used to give.
ml_data['movieid'] = pd.factorize(ml_data['movieid'])[0]
ml_data['userid'] = pd.factorize(ml_data['userid'])[0]

# Unlike get_movielens_data(), repeated (userid, timestamp) pairs are NOT
# dropped in this cell -- presumably so the per-user counts below include
# them (TODO confirm this is intended).

# Normalize time for users: rank(method='first') is 1-based, hence the -1;
# ties are ranked in order of appearance. A plain argsort would give the
# sorting permutation instead of each row's rank.
print("Normalizing stars of time for users")
steps = ml_data.groupby('userid')['timestamp'].rank(method='first').astype(np.int64) - 1
ml_data = ml_data.assign(timestamp=steps)

# The filter below keeps users with fewer than 650 rows, so the message now
# says so (it used to announce "less then 20 points").
print("Removing data with more than 650 points")

# `lc` mirrors ml_data but stores the per-user row count in 'timestamp';
# the cells further down read the counts from that column.
lc = ml_data.assign(timestamp=ml_data['userid'].map(ml_data['userid'].value_counts()))
ml_data = ml_data[lc['timestamp'] < 650]

Loading data into memory...
Normalizing indices to avoid gaps
Normalizing stars of time for users
Removing data with less then 20 points


In [125]:
# Average number of rows per user among users below the 650-row cut. In `lc`
# the 'timestamp' column holds the per-user row count, so one row per user
# is enough to average over users.
(
    lc.loc[lc['timestamp'] < 650]
      .drop_duplicates('userid')
      .loc[:, 'timestamp']
      .mean()
)

114.00543855759994

In [117]:
# Rows belonging to users whose row count (stored in 'timestamp') is below 21.
lc.loc[lc['timestamp'] < 21]

Unnamed: 0,userid,movieid,rating,timestamp
22,1,22,5.0,20
23,1,23,3.0,20
24,1,24,5.0,20
25,1,25,3.0,20
26,1,16,3.0,20
27,1,26,5.0,20
28,1,27,2.0,20
29,1,28,3.0,20
30,1,29,3.0,20
31,1,30,3.0,20
