In [1]:
%matplotlib inline

In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [80]:
ratings_table = pd.read_csv('ratings.csv')
ratings_table["userId"] = np.unique(ratings_table["userId"], return_inverse=True)[1]
ratings_table["movieId"] = np.unique(ratings_table["movieId"], return_inverse=True)[1]
ratings_table["mean_of_user"] = ratings_table.groupby('userId')["rating"].transform('mean')
ratings_table["scaled_rating"] = (ratings_table["rating"] - ratings_table["mean_of_user"])
ratings_table

Unnamed: 0,userId,movieId,rating,timestamp,mean_of_user,scaled_rating
0,0,0,4.0,964982703,4.366379,-0.366379
1,0,2,4.0,964981247,4.366379,-0.366379
2,0,5,4.0,964982224,4.366379,-0.366379
3,0,43,5.0,964983815,4.366379,0.633621
4,0,46,5.0,964982931,4.366379,0.633621
...,...,...,...,...,...,...
100831,609,9416,4.0,1493848402,3.688556,0.311444
100832,609,9443,5.0,1493850091,3.688556,1.311444
100833,609,9444,5.0,1494273047,3.688556,1.311444
100834,609,9445,5.0,1493846352,3.688556,1.311444


In [5]:
from zero.sgd_temporal import MangakiSGDTemporal
import random

users = np.array(ratings_table[['userId']])[:,0]
items =np.array(ratings_table[['movieId']])[:,0]
ratings =np.array(ratings_table[['rating']])[:,0]
timestamps =np.array(ratings_table[['timestamp']])[:,0]
mean_of_user = np.array(ratings_table[['mean_of_user']])[:,0]
scaled_ratings = np.array(ratings_table[['scaled_rating']])[:,0]

def add_in_ordered_list_of_3_uplets(list, item):
    (a,b,c) = item
    i=0
    while i<len(list) and c > list[i][2] :
        i+=1
    list.insert(i, item)

def remove_duplicates(l):
    newl = []
    for x in l :
        if x not in newl:
            newl.append(x)
    return newl

def list_into_dictionnaries(users, items, ratings, timestamps):
    users_dict = {}
    test_dict = {}
    users_individual = remove_duplicates(users)
    random.shuffle(users_individual)
    nb_users = len(users_individual)
    training_users = users_individual[:math.floor(0.8*nb_users)]
    testing_users = users_individual[math.floor(0.8*nb_users):]
    for (i,j,r, t) in zip(users, items, ratings, timestamps) :
        if i in training_users:
            if i in users_dict :
                add_in_ordered_list_of_3_uplets(users_dict[i],(j,r,t))
            else :
                users_dict[i] = [(j,r,t)]
        else :
            if i in test_dict :
                add_in_ordered_list_of_3_uplets(test_dict[i],(j,r,t))
            else :
                test_dict[i] = [(j,r,t)]
    return (users_dict, test_dict)

In [6]:
users_dict,test_dict = list_into_dictionnaries(users, items, ratings, timestamps)

In [7]:
sequences_list = []

for i in users_dict.keys() :
    for (movie, rating, t) in users_dict[i]:
        if rating > 3.0:
            sequences_list.append(str(movie) + "+")
        else :
            sequences_list.append(str(movie) + "-")
    sequences_list.append(".")

#sequences_list

In [8]:
sequences_string = ""

for x in sequences_list:
    sequences_string = sequences_string + x + " "

#sequences_string

In [9]:
'''import torchtext
TEXT = torchtext.data.Field(init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)


TEXT.build_vocab(sequences_list)
train_txt = TEXT.numericalize(sequences_list)
val_txt = test_txt = train_txt
print(TEXT.numericalize(sequences_string))
print(len(sequences_list))
print(train_txt.shape)
print('long', len(TEXT.vocab.stoi))'''

"import torchtext\nTEXT = torchtext.data.Field(init_token='<sos>',\n                            eos_token='<eos>',\n                            lower=True)\n\n\nTEXT.build_vocab(sequences_list)\ntrain_txt = TEXT.numericalize(sequences_list)\nval_txt = test_txt = train_txt\nprint(TEXT.numericalize(sequences_string))\nprint(len(sequences_list))\nprint(train_txt.shape)\nprint('long', len(TEXT.vocab.stoi))"

In [10]:
'''
train_txt = torch.transpose(train_txt, 0, 1)
train_txt = torch.reshape(train_txt, (100,84133))
val_txt = train_txt
test_txt = train_txt'''

'\ntrain_txt = torch.transpose(train_txt, 0, 1)\ntrain_txt = torch.reshape(train_txt, (100,84133))\nval_txt = train_txt\ntest_txt = train_txt'

In [11]:
'''converter(train_txt)'''

'converter(train_txt)'

In [12]:
'''print(TEXT.numericalize(train))
print('long', len(TEXT.vocab.stoi))'''

"print(TEXT.numericalize(train))\nprint('long', len(TEXT.vocab.stoi))"

In [13]:
import torchtext
from torchtext.data.utils import get_tokenizer
TEXT = torchtext.data.Field(init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)
train_txt, val_txt, test_txt = sequences_list, sequences_list, sequences_list
TEXT.build_vocab([sequences_list])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def batchify(data, bsz):
    data = TEXT.numericalize([data])
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    print(nbatch)
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 100
eval_batch_size = 100
train_data = batchify(train_txt, batch_size)
#train_data = torch.transpose(train_data, 0, 1)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)


827
827
827


In [14]:
train_data.shape

torch.Size([827, 100])

In [15]:

#train_data = torch.reshape(train_data, (84133,100))
#train_data.shape
#val_data = train_data
#test_data = train_data

In [16]:
#train_data = torch.transpose(train_data, 0, 1)

Functions to generate input and target sequence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




In [17]:
'''dico_file = open("dico_fr.txt", 'r')
dico_string = dico_file.read()
dico_list = dico_string.split('\n')
lists_of_letters = []
for x in dico_list:
    word_list = [ord(i)-ord(' ') for i in x.rjust(25)]
    lists_of_letters.append(word_list)    
train_data = torch.tensor(lists_of_letters[:100])
val_data = train_data
test_data = train_data
print(train_data)'''

'dico_file = open("dico_fr.txt", \'r\')\ndico_string = dico_file.read()\ndico_list = dico_string.split(\'\n\')\nlists_of_letters = []\nfor x in dico_list:\n    word_list = [ord(i)-ord(\' \') for i in x.rjust(25)]\n    lists_of_letters.append(word_list)    \ntrain_data = torch.tensor(lists_of_letters[:100])\nval_data = train_data\ntest_data = train_data\nprint(train_data)'

``get_batch()`` function generates the input and target sequence for
the transformer model. It subdivides the source data into chunks of
length ``bptt``. For the language modeling task, the model needs the
following words as ``Target``. For example, with a ``bptt`` value of 2,
we’d get the following two Variables for ``i`` = 0:

![](../_static/img/transformer_input_target.png)


It should be noted that the chunks are along dimension 0, consistent
with the ``S`` dimension in the Transformer model. The batch dimension
``N`` is along dimension 1.




In [18]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

Initiate an instance
--------------------




The model is set up with the hyperparameter below. The vocab size is
equal to the length of the vocab object.




In [19]:
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

Run the model
-------------




`CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
is applied to track the loss and
`SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
implements stochastic gradient descent method as the optimizer. The initial
learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
applied to adjust the learn rate through epochs. During the
training, we use
`nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
function to scale all the gradient together to prevent exploding.




In [20]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [21]:
best_val_loss = float("inf")
epochs = 60 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 16.63s | valid loss  8.74 | valid ppl  6262.78
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 15.86s | valid loss  8.67 | valid ppl  5842.31
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 17.22s | valid loss  8.62 | valid ppl  5525.52
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   4 | time: 17.76s | valid loss  8.52 | valid ppl  4990.71
--------------------------------------------------------------------------

-----------------------------------------------------------------------------------------
| end of epoch  34 | time: 28.01s | valid loss  1.96 | valid ppl     7.10
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch  35 | time: 18.32s | valid loss  1.91 | valid ppl     6.73
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch  36 | time: 26.69s | valid loss  1.86 | valid ppl     6.40
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch  37 | time: 21.97s | valid loss  1.81 | valid ppl     6.12
--------------------------------------------------------------------------

In [22]:
best_model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=20

In [23]:
#train_data = torch.transpose(train_data,0,1)

In [24]:
'''mot = train_data[:,0:1].clone()
mot
for x in converter(mot):
    for i in x:
        print(i)'''

'mot = train_data[:,0:1].clone()\nmot\nfor x in converter(mot):\n    for i in x:\n        print(i)'

In [25]:
converter = np.vectorize(lambda x: TEXT.vocab.itos[x])
unconverter = np.vectorize(lambda x: TEXT.vocab.stoi[x])

In [26]:
mot = train_data[:,14:15].clone()
mot.shape

torch.Size([827, 1])

In [27]:
mot = mot[:3000]

In [28]:
for x in converter(mot):
    for i in x:
        print(i)

3097+
2912+
3191+
3409+
3358-
2444+
2459+
2610-
2775+
2338+
1913+
1855+
1889-
1822+
1877+
1680+
1733+
1668+
1393+
1262+
1479+
1434+
996+
794-
955+
904+
829+
715+
518+
470+
384+
203+
80+
66+
200+
5250+
4192+
4606+
4068+
3981+
3391+
3393+
2034-
.
2822+
910+
328-
2531-
2802+
2395-
2144+
2030+
1873-
855+
2466+
2257+
2031+
2013-
2473+
2262-
2189+
2083-
2021-
1831-
190+
2410+
2042-
1915-
1959+
831+
2379+
2224+
2035+
199+
2391+
2037+
2775+
2670+
1945+
1724+
620-
2803-
2390+
2380+
1790-
2696-
2043-
2419+
1882-
1871-
1958-
2019+
1930+
2324+
2077+
2027+
1796+
1795+
2134-
2029+
1741-
2832+
1960+
316-
1289+
.
509-
508+
337-
257+
123-
506+
307+
302+
138-
126-
277+
197-
510+
378-
134-
398+
217-
249+
43+
20-
395+
322+
35-
229-
133-
314+
418-
376-
274-
145-
507+
485+
202-
383-
334-
308-
254-
157-
172-
144-
131-
287-
364-
472-
436+
504+
190+
203-
136-
484-
412+
5-
22-
473-
2+
630-
.
510+
0+
418+
314+
1938+
6388+
733+
277+
913+
921+
895+
4170+
2224+
694+
257+
902+
602+
461+
659+
898+
6693+
3617+
3979+
3

In [29]:
for i in range(0,700):
    #print("mot",converter(mot))
    prediction = best_model(mot).argmax(axis=2)
    print(converter(mot)[i + 1], 'par', converter(prediction)[ i])
    #mot[ i+1] = prediction[ i]
    #print("pred" ,converter(prediction))
    #print("mot",converter(mot))
    

['2912+'] par ['2912+']
['3191+'] par ['3191+']
['3409+'] par ['3409+']
['3358-'] par ['3358-']
['2444+'] par ['3086+']
['2459+'] par ['2459+']
['2610-'] par ['2610-']
['2775+'] par ['2775+']
['2338+'] par ['2338+']
['1913+'] par ['2325-']
['1855+'] par ['1855+']
['1889-'] par ['1889-']
['1822+'] par ['1822+']
['1877+'] par ['1877+']
['1680+'] par ['1680+']
['1733+'] par ['1733+']
['1668+'] par ['1668+']
['1393+'] par ['1393+']
['1262+'] par ['1262+']
['1479+'] par ['1479+']
['1434+'] par ['6985+']
['996+'] par ['4675-']
['794-'] par ['794-']
['955+'] par ['190+']
['904+'] par ['609-']
['829+'] par ['829+']
['715+'] par ['715+']
['518+'] par ['518+']
['470+'] par ['470+']
['384+'] par ['384+']
['203+'] par ['2038-']
['80+'] par ['80+']
['66+'] par ['66+']
['200+'] par ['200+']
['5250+'] par ['5250+']
['4192+'] par ['4352-']
['4606+'] par ['3564+']
['4068+'] par ['649-']
['3981+'] par ['6155-']
['3391+'] par ['3391+']
['3393+'] par ['3393+']
['2034-'] par ['263+']
['.'] par ['4853+']
['

['6045+'] par ['3162+']
['6204+'] par ['4354-']
['786+'] par ['1938+']
['1468+'] par ['1720+']
['5955+'] par ['994+']
['969+'] par ['10+']
['1401+'] par ['1211+']
['1542+'] par ['2391+']
['4350+'] par ['506+']
['184+'] par ['3409+']
['2650+'] par ['503+']
['1051+'] par ['2290+']
['1601+'] par ['2903+']
['942+'] par ['784+']
['2076-'] par ['1882+']
['527+'] par ['942+']
['1584+'] par ['3225+']
['459+'] par ['.']
['831+'] par ['409+']
['2637+'] par ['777-']
['4604+'] par ['1266+']
['789+'] par ['2523+']
['2390+'] par ['2435-']
['1881+'] par ['302+']
['1374+'] par ['2248-']
['7026+'] par ['23+']
['1549+'] par ['1032+']
['2984+'] par ['1548+']
['5722+'] par ['594-']
['3520+'] par ['740+']
['95+'] par ['3635+']
['3086+'] par ['1176+']
['2222+'] par ['2992+']
['6058+'] par ['3241+']
['1660+'] par ['443+']
['53+'] par ['2903+']
['993+'] par ['300+']
['4135+'] par ['285+']
['1558+'] par ['811+']
['2220+'] par ['4791+']
['5370+'] par ['1767+']
['5163+'] par ['385+']
['6046+'] par ['1882+']
['34

In [30]:
test_sequences_list = []

for i in test_dict.keys() :
    for (movie, rating, t) in test_dict[i]:
        if rating > 3.0:
            test_sequences_list.append(str(movie) + "+")
        else :
            test_sequences_list.append(str(movie) + "-")
    test_sequences_list.append(".")
    
manual_test_data = batchify(test_sequences_list, batch_size)

186


In [130]:
test_mot = torch.flatten(manual_test_data.clone()).unsqueeze(-1)
test_mot = test_mot[0:200,:]
test_mot.shape

torch.Size([200, 1])

In [109]:
for x in converter(test_mot):
    for i in x:
        print(i)

277-
320-
<unk>
2610+
910+
2877+
443-
470-
1985+
43+
1398+
1938+
1644+
1585+
6405+
994-
2108-
7493+
3617+
2048-
904+
1437+
4402+
7569+
5686+
897+
1690+
3404-
.
900+
2588+
2803+
6241+
6520+
948+
276+
4131+
1182-
138-
1059-
1971-
3814+
3543-
166-
3910+
6429-
4145+
6994+
6901-
2512+
5150+
2700+
123-
199+
2257+
2625+
989+
3189+
3845+
584-
4007+
455-
6405+
314-
6405+
1208-
986+
302-
<unk>
<unk>
4135+
<unk>
4450+
<unk>
<unk>
1635+
6423+
6517+
3002+
1776+
3460+
2592-
3623+
4789+
1996-
398+
6544-
3814+
95-
6225-
6074-
157+
908-
6465+
1986+
2315+
4-
2514+
190+
3635+
7355+
186-
751+
9203+
508+
3007+
483+
436-
287+
4328+
1428+
224+
5895-
3867+
5938+
1822+
1810-
116-
694+
1035-
887+
512+
1916+
7086+
963+
2654+
1307+
<unk>
509-
1041+
2593+
2390-
92+
5869+
1223-
133+
510+
3136+
4900+
1057+
2067-
2391+
3123+
124-
1544-
7601+
3446+
7733+
2783+
1374+
504+
1032+
506-
333-
2473+
904+
6310+
2832+
4065-
1393+
3872-
203-
3633+
277+
<unk>
484-
98+
3938-
55-
4285+
3202+
1444+
2497+
<unk>
4640+
<unk>
3892+
126

In [110]:
for i in range(0,199):
    prediction = best_model(test_mot).argmax(axis=2)
    print(converter(test_mot)[i + 1], 'par', converter(prediction)[i])
    if converter(test_mot)[i + 1] == converter(prediction)[i] :
        print("YEAH")

['320-'] par ['2224+']
['<unk>'] par ['143-']
['2610+'] par ['7833-']
['910+'] par ['2037-']
['2877+'] par ['83+']
['443-'] par ['5122+']
['470-'] par ['233-']
['1985+'] par ['139+']
['43+'] par ['1576+']
['1398+'] par ['1429+']
['1938+'] par ['1352+']
['1644+'] par ['1644+']
YEAH
['1585+'] par ['2353+']
['6405+'] par ['2257-']
['994-'] par ['6434-']
['2108-'] par ['7591+']
['7493+'] par ['8358+']
['3617+'] par ['8671+']
['2048-'] par ['277+']
['904+'] par ['1431+']
['1437+'] par ['690+']
['4402+'] par ['2832+']
['7569+'] par ['6517+']
['5686+'] par ['5653+']
['897+'] par ['1306+']
['1690+'] par ['931-']
['3404-'] par ['1772+']
['.'] par ['3590-']
['900+'] par ['509-']
['2588+'] par ['705+']
['2803+'] par ['2579+']
['6241+'] par ['2964+']
['6520+'] par ['6729+']
['948+'] par ['6764+']
['276+'] par ['2373+']
['4131+'] par ['702+']
['1182-'] par ['4791+']
['138-'] par ['1157-']
['1059-'] par ['9+']
['1971-'] par ['509-']
['3814+'] par ['5150-']
['3543-'] par ['4791+']
['166-'] par ['910+

In [131]:
#test_mot = mot[0:200,:]
test_mot.shape

torch.Size([200, 1])

In [132]:
from sklearn.metrics import roc_auc_score

#test_mot = mot

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

predictions = []
true_values = []
for i in range(0,199):
    item = converter(test_mot[i+1])[0]
    if item[len(item)-1] == '+' or item[len(item)-1] == '-':
        plus_pred = best_model(test_mot)[i,:,unconverter(item[0:len(item)-1] + "+")]
        minus_pred = best_model(test_mot)[i,:,unconverter(item[0:len(item)-1] + "-")]
        plus_proba = sigmoid((plus_pred - minus_pred).detach())
        predictions.append(plus_proba)
        if item == item[0:len(item)-1] + "+" :
            true_values.append(1)
        else :
            true_values.append(0)

print(roc_auc_score(true_values, predictions))

0.5421235253296321


In [114]:
best_model(test_mot).max()

tensor(21.1451, grad_fn=<MaxBackward1>)

In [115]:
item = converter(test_mot[i+1])[0]
item[0:len(item)-1] + "+"

'314+'

In [118]:
unconverter = np.vectorize(lambda x: TEXT.vocab.stoi[x])

In [119]:
converter(train_data)

array([['910+', '7399+', '217-', ..., '443+', '302+', '2832+'],
       ['632+', '7911+', '2066-', ..., '2370+', '512-', '7355+'],
       ['2125+', '6422-', '1034-', ..., '1544-', '197-', '4354+'],
       ...,
       ['4935+', '1622-', '921-', ..., '1730-', '3136+', '1251+'],
       ['7450+', '1082-', '1497-', ..., '5753+', '6693+', '2761+'],
       ['3673-', '1029-', '6191+', ..., '5705+', '4421+', '1429+']],
      dtype='<U5')

In [38]:
#train_out = best_model(train_data).argmax(axis=2)

In [39]:
#converter(best_model(mot).argmax(axis=2))

In [40]:
#train_out

In [41]:
'''for i, line in enumerate(converter(train_out.numpy())):
    print(''.join(line), pre_train[i], i)'''

"for i, line in enumerate(converter(train_out.numpy())):\n    print(''.join(line), pre_train[i], i)"

Evaluate the model with the test dataset
-------------------------------------

Apply the best model to check the result with the test dataset.



In [133]:
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

RuntimeError: shape '[-1, 14483]' is invalid for input of size 47029500

In [10]:
'''import zero

from zero.als import MangakiALS

users_sorted = ratings_table.groupby("userId").count().sort_values('rating')
users_sorted_list = users_sorted.index.to_list()
users_training_set = users_sorted_list[:math.floor(len(users_sorted_list)*0.9)]
nb_users = len(users_sorted_list)
movies_sorted = ratings_table.groupby("movieId").count().sort_values('rating')
movies_sorted_list = movies_sorted.index.to_list()
nb_works = len(movies_sorted)
users_test_set = users_sorted_list[math.floor(len(users_sorted_list)*0.9) :]
training_ratings_table = ratings_table.query('userId in @users_training_set')
X_train = np.array(training_ratings_table[['userId', 'movieId']])
y_train = training_ratings_table['rating']
als = MangakiALS(20)
als.set_parameters(nb_users, nb_works)
als.fit(X_train, y_train)'''



Computing M: (610 × 9724)
Shapes (610, 20) (20, 9724)


In [120]:
ratings_table["binary_ratings"] = ratings_table["rating"].transform(lambda x : 1 if x>3 else 0)
ratings_table["mean_of_movie"] = ratings_table.groupby('movieId')["rating"].transform('mean')
ratings_table["mean_of_movie_binary"] = ratings_table.groupby('movieId')["binary_ratings"].transform('mean')
ratings_table

Unnamed: 0,userId,movieId,rating,timestamp,mean_of_user,scaled_rating,is_in_train,binary_ratings,mean_of_movie,mean_of_movie_binary
0,0,0,4.0,964982703,4.366379,-0.366379,1,1,3.920930,0.767442
1,0,2,4.0,964981247,4.366379,-0.366379,1,1,3.259615,0.442308
2,0,5,4.0,964982224,4.366379,-0.366379,1,1,3.946078,0.745098
3,0,43,5.0,964983815,4.366379,0.633621,1,1,3.975369,0.773399
4,0,46,5.0,964982931,4.366379,0.633621,1,1,4.237745,0.852941
...,...,...,...,...,...,...,...,...,...,...
100831,609,9416,4.0,1493848402,3.688556,0.311444,1,1,3.333333,0.666667
100832,609,9443,5.0,1493850091,3.688556,1.311444,1,1,4.142857,0.857143
100833,609,9444,5.0,1494273047,3.688556,1.311444,1,1,3.633333,0.533333
100834,609,9445,5.0,1493846352,3.688556,1.311444,1,1,4.280000,0.920000


In [121]:
ratings_table["is_in_train"] = ratings_table["userId"].transform(lambda x : 1 if x in users_dict else 0)
training_table = ratings_table[ratings_table.is_in_train == 1]
training_table["mean_of_movie"] = training_table.groupby('movieId')["rating"].transform('mean')
training_table["mean_of_movie_binary"] = training_table.groupby('movieId')["binary_ratings"].transform('mean')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [124]:
predictions = []
true_values = []
for i in range(0,199):
    item = converter(test_mot[i+1])[0]
    movieId = item[0:len(item)-1]
    if item[len(item)-1] == '+' or item[len(item)-1] == '-':
        plus_proba = list(training_table[training_table.movieId == int(movieId)]['mean_of_movie_binary'])[0]
        predictions.append(plus_proba)
        if item == movieId + "+" :
            true_values.append(1)
        else :
            true_values.append(0)

print(roc_auc_score(true_values, predictions))

0.7609299097848716
