In [32]:
import torch
import pickle
import io
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader
import numpy as np
import time
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch.optim as optim
import pdb
import math

In [33]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
cuda0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [34]:
# Location of the pickled reviews under the Google Drive mount point (/content/drive).
reviews_path = '/content/drive/MyDrive/ExplainableRecommedation/reviews.pickle'

In [35]:
# Mount Google Drive at /content/drive so the files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Load the pickled reviews. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you trust.
with open(reviews_path, 'rb') as reviews_file:
    reviews = pickle.load(reviews_file)

In [None]:
# Display the first review record to see which fields it carries.
reviews[0]

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
def load_data(data_path, max_word_num, data_size):
    """Build the vocabulary and the user/item index maps from the first reviews.

    The function reads the module-level ``reviews`` sequence; ``data_path`` is kept
    in the signature for compatibility but is not used here.

    Users and items are numbered in order of first appearance. The original code
    numbered them by iterating a ``set``, whose order for strings can change between
    Python sessions -- that silently breaks the correspondence between a saved
    checkpoint's embedding rows and the ids they were trained for.

    Args:
        data_path: unused (see above).
        max_word_num: maximum number of vocabulary words, forwarded to
            ``get_word2index``.
        data_size: number of leading reviews to read.

    Returns:
        ``(word2index, index2word, word_list, user2index, item2index, user_list,
        item_list, feature_set)`` where ``feature_set`` holds the lower-cased first
        element of every entry found under a review's ``'sentence'`` key.
    """
    from itertools import islice

    doc_list = []          # sentence strings the vocabulary is fitted on
    user2index = {}
    item2index = {}
    feature_set = set()

    for review in islice(reviews, max(data_size, 0)):
        # setdefault evaluates len(...) before inserting, so a new id receives the
        # next free index and an already-known id keeps the one it has.
        user2index.setdefault(review['user'], len(user2index))
        item2index.setdefault(review['item'], len(item2index))

        if 'text' in review:
            # Only the part after the first newline is used, split on '.'.
            body = review['text'].split('\n')[1]
            doc_list.extend(body.lower().split('.'))
        if 'sentence' in review:
            feature_set.update(entry[0].lower() for entry in review['sentence'])

    # dicts preserve insertion order, so position in the list equals the index.
    user_list = list(user2index)
    item_list = list(item2index)

    word2index, word_list, index2word = get_word2index(doc_list, max_word_num)
    return word2index, index2word, word_list, user2index, item2index, user_list, item_list, feature_set


def get_word2index(doc_list, max_word_num):
    """Fit a bag-of-words vocabulary on ``doc_list`` and add the special tokens.

    Args:
        doc_list: iterable of strings; each one is tokenised by splitting on a
            single space.
        max_word_num: ``max_features`` passed to ``CountVectorizer``.

    Returns:
        ``(word2index, word_list, index2word)``. ``word_list`` starts with
        ``'<PAD>'`` (index 0), continues with the fitted vocabulary and ends with
        ``'<UNK>'``, ``'<GO>'``, ``'<EOS>'``.
    """
    def split_words_by_space(text):
        return text.split(' ')

    vectorizer = CountVectorizer(max_features=max_word_num, analyzer=split_words_by_space)
    vectorizer.fit(doc_list)

    # get_feature_names() was removed in scikit-learn 1.2; its replacement
    # get_feature_names_out() returns a NumPy array, which must be turned into a
    # plain list before it can be concatenated with the special tokens.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        vocabulary = list(vectorizer.get_feature_names())

    word_list = ['<PAD>'] + vocabulary + ['<UNK>', '<GO>', '<EOS>']
    word2index = {w: i for i, w in enumerate(word_list)}
    index2word = {i: w for w, i in word2index.items()}

    return word2index, word_list, index2word

In [39]:
def format_data(data_size, max_sentences=3, max_words=100):
    """Encode the first ``data_size`` reviews as index sequences.

    Reads the module-level ``reviews``, ``user2index``, ``item2index``,
    ``word2index``, ``word_list`` and ``feature_set``. Reviews without a
    ``'sentence'`` key are skipped.

    For every kept review, the text after the first newline is split on '.', and
    each sentence with at least two space-separated tokens is encoded as
    ``[<GO>, w_1, ..., w_h, <EOS>, beta]``. ``h`` is capped at ``max_words`` and
    ``beta`` is the number of distinct feature words in the truncated sentence
    divided by ``len(word_list) - 1``. At most ``max_sentences`` sentences are kept
    per review.

    Args:
        data_size: number of leading reviews to read.
        max_sentences: maximum number of encoded sentences per review (default 3,
            the value that used to be hard-coded).
        max_words: maximum number of words kept per sentence before <GO>/<EOS> are
            added (default 100, the value that used to be hard-coded).

    Returns:
        ``(tuple_list, seq_len_list, max_num_of_sentences, max_num_of_words)``:
        ``tuple_list`` holds ``(user_index, item_index, rating, encoded_sentences)``;
        ``seq_len_list`` holds, per review, the encoded length of every sentence
        (including <GO> and <EOS>, excluding beta); the last two values are the
        maxima observed over all kept reviews.
    """
    from itertools import islice

    tuple_list = []
    seq_len_list = []
    max_num_of_words = 0
    max_num_of_sentences = 0

    go_index = word2index['<GO>']
    eos_index = word2index['<EOS>']
    unk_index = word2index['<UNK>']
    beta_denominator = len(word_list) - 1

    for rev in islice(reviews, max(data_size, 0)):
        if 'sentence' not in rev:
            continue
        u = user2index[rev['user']]
        i = item2index[rev['item']]
        r = rev['rating']

        sentences = [sen.lower().strip().split(' ') for sen in rev['text'].split('\n')[1].split('.')]
        sen_indexes = []
        sen_lengths = []
        for sen in sentences:
            if len(sen_indexes) >= max_sentences:
                break
            if len(sen) < 2:
                continue
            words = sen[:max_words]
            sen_lengths.append(len(words) + 2)  # <GO> and <EOS>
            n_feature_words = len(set(words).intersection(feature_set))
            w_list = [go_index] + [word2index.get(w, unk_index) for w in words] + [eos_index]
            # beta rides along as the last element; the caller strips it off again.
            w_list.append(n_feature_words / beta_denominator)
            sen_indexes.append(w_list)

        # A review may contain no usable sentence at all; max() on an empty list
        # would raise, so such a review contributes a length of 0.
        n_w = max(sen_lengths, default=0)
        max_num_of_words = max(max_num_of_words, n_w)
        max_num_of_sentences = max(max_num_of_sentences, len(sen_indexes))

        seq_len_list.append(sen_lengths)
        tuple_list.append((u, i, r, sen_indexes))
    return tuple_list, seq_len_list, max_num_of_sentences, max_num_of_words

In [40]:
# Vocabulary cap and number of reviews to read.
max_vocab_len = 10000
data_size = 9000
(word2index, index2word, word_list,
 user2index, item2index, user_list, item_list,
 feature_set) = load_data(reviews_path, max_vocab_len, data_size)

In [41]:
# Vocabulary size, including the four special tokens (<PAD>, <UNK>, <GO>, <EOS>).
len(word2index)

10004

In [42]:
# Encode the same reviews; each tuple is (user index, item index, rating, sentences),
# and every sentence is a list of word indices with its beta weight appended last.
tuple_list, seq_len_list, max_num_of_sentences, max_num_of_words = format_data(data_size) # [u,i,r,sen;beta]

In [43]:
# Number of distinct feature words, most sentences kept for a review, and the
# longest encoded sentence (its length counts <GO> and <EOS>).
len(feature_set),max_num_of_sentences, max_num_of_words #, m_s_rev

(513, 3, 102)

In [44]:
# Word indices for every (review, sentence, position); unused slots keep 0, the <PAD> index.
rev_data = np.zeros(
    shape=(len(tuple_list), max_num_of_sentences, max_num_of_words),
    dtype=np.int64,
)
# One beta weight per (review, sentence).
beta_data = np.zeros(rev_data.shape[:2], dtype=np.float32)

In [45]:
# Per-review user indices, item indices and ratings (ratings stored as a column vector).
u, i = (np.zeros(len(tuple_list), dtype=np.int64) for _ in range(2))
r = np.zeros((len(tuple_list), 1), dtype=np.float32)
# Vocabulary index of every feature word (<UNK> when the word is out of vocabulary),
# placed on the compute device.
feature_indexes = torch.tensor(
    [word2index.get(w, word2index['<UNK>']) for w in feature_set],
    dtype=torch.int64,
    device=cuda0,
)

In [46]:
# Copy every encoded review into the pre-allocated arrays.
for row, (user_idx, item_idx, rating, sentences) in enumerate(tuple_list):
    u[row], i[row], r[row] = user_idx, item_idx, rating
    for k, encoded in enumerate(sentences):
        # The last element of an encoded sentence is its beta weight; everything
        # before it is the word-index sequence.
        rev_data[row, k, :seq_len_list[row][k]] = encoded[:-1]
        beta_data[row, k] = encoded[-1]

In [47]:
# Release the references to the raw tuples and the feature-word set.
tuple_list = None
feature_set = None

In [48]:
# Sanity check: the largest user index should be one less than the number of users.
np.max(u), len(user2index)

(2975, 2976)

In [49]:
# Mini-batch size used by both the training and the validation DataLoader.
Batch_size = 50

In [50]:
# Length of the word list; this is the value later passed to the model as vocab_len.
len(word_list)

10004

In [51]:
# Move every array to the compute device and bundle them so that one index yields
# (user, item, rating, review word indices, beta weights).
dataset = TensorDataset(
    *(torch.from_numpy(arr).to(cuda0) for arr in (u, i, r, rev_data, beta_data))
)
# Hold out a third of the examples for validation; the split is seeded.
train_data, validation_data = train_test_split(dataset, test_size=.33, random_state=0)
train_loader, val_loader = (
    DataLoader(split, batch_size=Batch_size, shuffle=True)
    for split in (train_data, validation_data)
)

$$C_0 = \phi\left(W^{u}_{c_0}\,u + W^{v}_{c_0}\,v + b_{c_0}\right)$$
$$h^{w}_{n,0} = W_{n,2}\,\phi\left(W_{n,1}\,[C_n;\, u;\, v;\, o_n] + b_{n,1}\right) + b_{n,2}$$

Model Building

In [52]:
class Recommender(nn.Module):
    """Joint rating predictor and review-sentence generator.

    The rating module is an MLP over the concatenated user and item embeddings.
    The text module is a two-level GRU decoder: a context GRU carries a state
    ``C`` from sentence to sentence, and a word GRU, initialised from ``C``, the
    user/item embeddings and an attention read-out over the feature-word
    embeddings, emits one vocabulary-sized score vector per word position.

    Args:
        num_layers: number of stacked layers in both GRUs.
        u_dim, i_dim: user and item embedding sizes.
        hidden_dim: hidden size of the MLPs and GRUs.
        word_dim: word embedding size.
        vocab_len: vocabulary size (width of the output scores).
        feature_indexes: 1-D long tensor with the vocabulary indices of the
            feature words used by the attention.
        max_sentences: number of sentences decoded per review.
        max_words: sentence length including <GO> and <EOS>; ``max_words - 1``
            positions are decoded.
        n_user, n_item: number of users and items.
        pad_index: vocabulary index of <PAD>. Defaults to the module-level
            ``word2index['<PAD>']``.
        go_index: vocabulary index of <GO>. Defaults to the module-level
            ``word2index['<GO>']``.
    """

    def __init__(self, num_layers, u_dim, i_dim, hidden_dim, word_dim, vocab_len, feature_indexes,
                 max_sentences, max_words, n_user, n_item, pad_index=None, go_index=None):
        super(Recommender, self).__init__()
        if pad_index is None:
            pad_index = word2index['<PAD>']
        if go_index is None:
            go_index = word2index['<GO>']
        self.pad_index = pad_index
        self.go_index = go_index

        # The Rating Module... i/p: u,i; o/p:rating
        # No padding_idx here: index 0 is a real user / item. Passing the <PAD> word
        # index (0) froze that user's and item's embedding at zero.
        self.u_embed = nn.Embedding(n_user, u_dim)
        self.i_embed = nn.Embedding(n_item, i_dim)
        self.f1 = nn.Linear(in_features=u_dim+i_dim, out_features=hidden_dim)
        self.f2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.f3 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.f4 = nn.Linear(in_features=hidden_dim, out_features=1)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.feature_indexes = feature_indexes
        self.vocab_len = vocab_len

        # The NLP Module... i/p: u,i, o/p: set of sentences
        self.num_layers = num_layers
        self.word = nn.Embedding(vocab_len, word_dim, padding_idx=pad_index)
        self.c_0_layer = nn.Linear(u_dim+i_dim, hidden_dim)
        self.atten1 = nn.Linear(hidden_dim + word_dim, hidden_dim)
        self.atten2 = nn.Linear(hidden_dim,1)
        self.initial_hid_gru_word_layer_1 = nn.Linear(hidden_dim + u_dim + i_dim + word_dim, hidden_dim)
        self.initial_hid_gru_word_layer_2 = nn.Linear(hidden_dim, hidden_dim)
        # The word GRU is fed the output of p_word_embed_3, which has hidden_dim
        # features, so its input size must be hidden_dim (declaring word_dim only
        # worked when the two sizes happened to be equal).
        self.Gru_word = nn.GRU(hidden_dim, hidden_dim, num_layers = num_layers)
        self.Gru_context = nn.GRU(hidden_dim, hidden_dim, num_layers = num_layers)
        self.max_sentences = max_sentences
        self.max_words = max_words
        self.final_layer_1 = nn.Linear(hidden_dim, vocab_len//2)
        self.final_layer_2 = nn.Linear(vocab_len//2, vocab_len)
        self.p_word_embed_1 = nn.Linear(word_dim+u_dim+i_dim, hidden_dim)
        self.p_word_embed_2 = nn.Linear(hidden_dim, hidden_dim)
        self.p_word_embed_3 = nn.Linear(hidden_dim, hidden_dim)

    def attention(self,h):
        """Return softmax attention weights over the feature words.

        ``h`` is ``[num_layers, batch_size, hidden_dim]``; the result is
        ``[num_layers*batch_size, n_features]``. As a side effect the tiled
        feature embeddings are stored in ``self.feature_embed`` for
        ``getInitialHiddenState``.
        """
        hid_dim = h.shape[-1]
        h = h.reshape(-1,hid_dim)
        # [num_layers*batch_size, hidden_dim]
        n_rows = h.shape[0]
        # Use the indices given to the constructor (not a notebook global) and make
        # sure they live on the same device as the activations.
        feature_indexes = self.feature_indexes.to(h.device)
        n_features = feature_indexes.shape[0]
        self.feature_embed = self.word(feature_indexes).repeat(n_rows,1,1) # [num_layers*batch_size, n_features, word_dim]
        h = h.unsqueeze(1).repeat(1,n_features,1) # [num_layers*batch_size, n_features, hidden_dim]
        x = self.relu(self.atten1(torch.cat((h,self.feature_embed), dim =2)))
        # [num_layers*batch_size, n_features, hidden_dim]
        x = self.atten2(x).squeeze(2)
        return F.softmax(x,dim=1) # [num_layers*batch_size, n_features]

    def ratingMod(self):
        """Predict ratings ``[batch_size, 1]`` from ``self.latent_u`` / ``self.latent_v``."""
        x = torch.cat((self.latent_u,self.latent_v),dim =1)
        x = self.tanh(self.f1(x))
        x = self.tanh(self.f2(x))
        x = self.tanh(self.f3(x))
        return self.f4(x)

    def getInitialHiddenState(self,C_0):
        """Compute the word GRU's initial hidden state for one sentence.

        ``C_0`` is the context state ``[num_layers, batch_size, hidden_dim]``; the
        result has the same shape. Implements
        ``h_{n,0} = W_2 relu(W_1 [C_n; u; v; o_n] + b_1) + b_2`` where ``o_n`` is
        the attention-weighted sum of the feature-word embeddings.
        """
        hid_dim = C_0.shape[2]
        atten_score = self.attention(C_0).unsqueeze(1)
        # atten_score = [num_layers*batch_size, 1, n_features]
        latent_u = self.latent_u.repeat(C_0.shape[0],1)
        latent_v = self.latent_v.repeat(C_0.shape[0],1)
        # self.feature_embed = [num_layers*batch_size, n_features, word_dim]
        o_0 = torch.bmm(atten_score,self.feature_embed).squeeze(1)
        # o_0 = [num_layers*batch_size, word_dim]
        x = torch.cat((C_0.reshape(-1,hid_dim),latent_u,latent_v,o_0), dim =1)
        x = self.relu(self.initial_hid_gru_word_layer_1(x))
        hid_dim = x.shape[-1]
        # [num_layers*batch_size, hid_dim]
        return self.initial_hid_gru_word_layer_2(x).view(self.num_layers,-1,hid_dim)

    def _decode_word(self, w_index, h_word):
        """Run one word-GRU step; return ``(scores, gru_output, new_hidden)``.

        ``w_index`` is ``[batch_size]``; ``scores`` is ``[batch_size, vocab_len]``
        (unnormalised); ``gru_output`` is ``[1, batch_size, hidden_dim]``.
        """
        w_embed = self.word(w_index)
        w_embed = self.relu(self.p_word_embed_1(torch.cat((w_embed,self.latent_u,self.latent_v), dim =1)))
        w_embed = self.relu(self.p_word_embed_2(w_embed))
        w_embed = self.p_word_embed_3(w_embed).unsqueeze(0)
        # [1, batch_size, hidden_dim]
        O_w, h_word = self.Gru_word(w_embed,h_word)
        # [1, batch, hidden_size] [num_layers, batch, hidden_size]
        x = self.relu(self.final_layer_1(O_w.squeeze(0)))
        return self.final_layer_2(x), O_w, h_word

    def forward(self, u,i,rev):
        """Predict ratings and word scores.

        Args:
            u, i: ``[batch_size]`` user and item indices.
            rev: ``[batch_size, max_sentences, max_words]`` word indices for
                teacher forcing, or ``None`` to decode greedily starting from <GO>.

        Returns:
            ``(decoder_outputs, ratings)`` with shapes
            ``[batch_size, max_sentences, max_words-1, vocab_len]`` and
            ``[batch_size, 1]``.
        """
        self.latent_u = self.u_embed(u)
        self.latent_v = self.i_embed(i)
        ratings = self.ratingMod()

        x = torch.cat((self.latent_u,self.latent_v),dim =1)
        C_0 = self.relu(self.c_0_layer(x)).unsqueeze(0).repeat(self.num_layers,1,1)
        # [num_layers, batch_size, hidden_dim]
        batch_size = self.latent_u.shape[0]
        decoder_outputs = self.latent_u.new_zeros((batch_size,self.max_sentences, self.max_words-1, self.vocab_len))
        teacher_forcing = rev is not None
        for s in range(self.max_sentences):
            h_0 = self.getInitialHiddenState(C_0)
            # [num_layers, batch_size, hid_dim]
            if not teacher_forcing:
                # Start every generated sentence from <GO>, on the inputs' device.
                w_index = torch.full((batch_size,), self.go_index, dtype=torch.long, device=u.device)
            for j in range(self.max_words-1): # the last token (<EOS>) is never fed as input
                if teacher_forcing:
                    w_index = rev[:,s,j]
                out, O_w, h_0 = self._decode_word(w_index, h_0)
                # [batch_size, vocab_len]
                decoder_outputs[:,s,j,:] = out
                if not teacher_forcing:
                    w_index = torch.argmax(out,dim=1)
            # The last word-GRU output of the sentence advances the context state.
            O_s, C_0 = self.Gru_context(O_w,C_0)
        return decoder_outputs, ratings

In [65]:
# All embedding and hidden sizes are 300; both GRUs are two layers deep.
u_dim = i_dim = hidden_dim = word_dim = 300
num_layers = 2
vocab_len = len(word_list)
model = Recommender(
    num_layers, u_dim, i_dim, hidden_dim, word_dim, vocab_len, feature_indexes,
    max_num_of_sentences, max_num_of_words, len(user2index), len(item2index),
)

In [None]:
# Move the model's parameters to the selected device.
model.to(cuda0)

In [67]:
# Word prediction uses cross-entropy that skips <PAD> targets; ratings use MSE.
Rating_criterion = nn.MSELoss()
NLP_criterion = nn.CrossEntropyLoss(ignore_index=word2index['<PAD>'])
optimizer = optim.Adam(params=model.parameters(), lr=0.005, weight_decay=0.001)

In [68]:
def training(model, iterator, optimizer, NLP_criterion,Rating_criterion, clip, scaling_factor):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: module called as ``model(u, i, rev)`` and returning
            ``(decoder_outputs, ratings)``; ``decoder_outputs`` has the vocabulary
            as its last axis and one position fewer than ``rev`` along the word axis.
        iterator: yields ``(u, i, r, rev, beta)`` batches.
        optimizer: optimizer over ``model.parameters()``.
        NLP_criterion: loss applied to ``(scores [N, vocab], targets [N])``.
        Rating_criterion: loss applied to ``(ratings, r)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        scaling_factor: accepted for interface compatibility; currently unused
            (so is the per-sentence ``beta`` weight in each batch).

    Returns:
        ``(mean word loss, mean rating loss)`` over the batches, or ``(0.0, 0.0)``
        when ``iterator`` yields nothing.
    """
    model.train()
    nlp_epoch_loss = 0.0
    rating_epoch_loss = 0.0
    n_batches = 0
    for u,i,r,rev,beta in iterator:
        # Gradients accumulate across backward() calls; without this reset every
        # step would be taken on the sum of all previous batches' gradients.
        optimizer.zero_grad()

        decoder_outputs, ratings = model(u,i,rev)
        # decoder_outputs = [batch_size, max_sentence, max_words-1, vocab_len]
        # rev = [batch_size, max_sentence, max_words]
        n_classes = decoder_outputs.shape[-1]
        # Position j predicts token j+1, so the targets are the input shifted by one.
        targ = rev[:,:,1:].reshape(-1)
        pred = decoder_outputs.reshape(-1,n_classes)
        loss1 = NLP_criterion(pred,targ)
        loss2 = Rating_criterion(ratings,r)
        loss = loss1+loss2
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        nlp_epoch_loss+=loss1.item()
        rating_epoch_loss+=loss2.item()
        n_batches += 1
    if n_batches == 0:
        return 0.0, 0.0
    return nlp_epoch_loss/n_batches, rating_epoch_loss/n_batches


In [None]:
# Training configuration: gradient-norm clip, (currently unused) loss scale, epochs.
clip = 1
scaling_factor = 10**6
epochs = 5
nlp_loss_list, rating_loss_list = [], []
for epoch in range(epochs):
    start = time.time()
    nlp_loss, rating_loss = training(
        model, train_loader, optimizer, NLP_criterion, Rating_criterion, clip, scaling_factor
    )
    end = time.time()
    # Overwrite the checkpoint after every epoch.
    torch.save(model.state_dict(),'/content/drive/MyDrive/ExplainableRecommedation/C_model.pt')
    nlp_loss_list.append(nlp_loss)
    rating_loss_list.append(rating_loss)
    print('epoch number:',epoch+1,'time per epoch(secs):', end-start,
          'nlp_loss:',nlp_loss, 'rating_loss:',rating_loss, 'total_loss:',nlp_loss + rating_loss)

In [None]:
# Restore the saved weights. map_location remaps the stored tensors onto the device
# selected for this session, so a checkpoint written on a GPU runtime can still be
# loaded when only the CPU is available.
model.load_state_dict(torch.load('/content/drive/MyDrive/ExplainableRecommedation/C_model.pt', map_location=cuda0))

In [None]:
# Validation: average of the per-batch rating RMSE with the model decoding freely
# (rev=None). Gradients are disabled and the loss is accumulated as a Python float,
# so no autograd graph is kept alive from batch to batch.
model.eval()
epoch_loss = 0.0
with torch.no_grad():
    # Batch variables get their own names so the dataset arrays u, i, r are not overwritten.
    for u_batch, i_batch, r_batch, rev_batch, beta_batch in val_loader:
        decoder_outputs, ratings = model(u_batch, i_batch, None)
        epoch_loss += torch.sqrt(Rating_criterion(ratings, r_batch)).item()
print(epoch_loss/len(val_loader))

In [78]:
# Highest-scoring word index at every (review, sentence, position), as a NumPy array on the host.
x = decoder_outputs.argmax(dim=3).cpu().numpy()

In [None]:
# Print the decoded sentences of one randomly chosen row of x, one sentence per line.
sample = np.random.randint(0, x.shape[0])
for sentence_ids in x[sample]:
    print(' '.join(index2word[w] for w in sentence_ids))