In [119]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.utils.data
import scipy.sparse as sp
import torch.nn.functional as F
from rankfm.rankfm import RankFM
import logging
from copy import deepcopy as dp
from sklearn.metrics import ndcg_score
from lightfm.datasets import fetch_movielens

In [2]:
ratings = pd.read_csv('ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')

movie_info = pd.read_csv('movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')

implicit_ratings = ratings.loc[(ratings['rating'] >= 4)]

users = implicit_ratings["user_id"]
movies = implicit_ratings["movie_id"]

### сделаем датасет нормальным (без пропусков айдишников)

In [3]:
l_users = len(np.unique(users))
l_items = len(np.unique(movies))

# прямое соответсвие
user_dict = {i:j for j, i in enumerate(np.unique(users))}
item_dict = {i:j for j, i in enumerate(np.unique(movies))}
# обратное соответсвие
ruser_dict = {i:j for j, i in user_dict.items()}
ritem_dict = {i:j for j, i in item_dict.items()}

df_ = implicit_ratings.loc[:, ['user_id', 'movie_id']]
df_.user_id = df_.user_id.apply(lambda x: user_dict[x])
df_.movie_id = df_.movie_id.apply(lambda x: item_dict[x])

users = df_.user_id.max() + 1
movies = df_.movie_id.max() + 1

In [4]:
train_u, train_i = [], []
test_u, test_i = [], []

dfnp = df_.to_numpy()
all_item_set = set(dfnp[:, 1])
unseens = dict()
min_len_unseens = int(1e5)

for u in range(users):
    seen = dfnp[dfnp[:, 0] == u, 1]
    seen_train = seen[:-1]
    seen_set = set(seen_train)
    unseen_set = all_item_set - seen_set
    unseens[u] = list(unseen_set)
    min_len_unseens = min(len(unseen_set), min_len_unseens)
    seen_test = seen[-1:]
    train_u.extend([u] * len(seen_train))
    train_i.extend(seen_train)
    
    test_u.append(u)
    test_i.append(seen_test)

In [5]:
test_data = np.column_stack((test_u, test_i))
train_data = np.column_stack((train_u, train_i))

In [6]:
# метрики, которые мы будем считать
def calc_metrics(model, unseens, k):
    ndcg_scores = []
    htr_scores = []
    for u in tqdm(range(users), position=0, leave=False, desc='calc metrix...'):
        try:
            seen = test_data[u]
            unseen = np.random.choice(unseens[u], 500, replace=False)
            data = np.column_stack(([u] * 500, unseen))

            pred = model.predict(np.vstack((seen, data)))
            real = np.hstack(([1], [0] * 500))
            
            ndcg_scores.append(ndcg_score([real], [pred], k=k))
            htr_scores.append((np.argsort(pred)[::-1][:k] == 0).sum())
        except:
            pass
    print(f'NDCG_{k} = {np.mean(ndcg_scores)}, HTR_{k} = {np.mean(htr_scores)}')

###  обоснование датасета

Решил взять датасет из первой домашки, потому что уже знаком с ним, на нем реализовал WARP, тем более датасет популярен

### MF (WARP)

In [12]:
model = RankFM(factors=50, loss='warp', max_samples=100, learning_schedule='invscaling')
model.fit(train_data, epochs=50, verbose=False)

In [31]:
# функция для оценки качества 

get_similars = lambda item_id, model : [movie_info[movie_info["movie_id"] == ritem_dict[x]]["name"].to_string() 
                                        for x in model.similar_items(item_id)]

get_user_history = lambda user_id, implicit_ratings : [movie_info[movie_info["movie_id"] == ritem_dict[x]]["name"].to_string() 
                                            for x in implicit_ratings[implicit_ratings[:, 0] == user_id, 1]]

get_recommendations = lambda user_id, model : [movie_info[movie_info["movie_id"] == ritem_dict[x]]["name"].to_string()
                                               for x in model.recommend([user_id]).to_numpy()[0]]

In [32]:
get_similars(0, model)

['3045    Toy Story 2 (1999)',
 '584    Aladdin (1992)',
 '360    Lion King, The (1994)',
 '1838    Mulan (1998)',
 "2286    Bug's Life, A (1998)",
 '591    Beauty and the Beast (1991)',
 "2497    Doug's 1st Movie (1999)",
 '33    Babe (1995)',
 '1526    Hercules (1997)',
 '735    Close Shave, A (1995)']

In [33]:
get_user_history(3, train_data)

['3399    Hustler, The (1961)',
 '2882    Fistful of Dollars, A (1964)',
 '1196    Alien (1979)',
 '1023    Die Hard (1988)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1959    Saving Private Ryan (1998)',
 '476    Jurassic Park (1993)',
 '1180    Raiders of the Lost Ark (1981)',
 '1885    Rocky (1976)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '3349    Thelma & Louise (1991)',
 '3633    Mad Max (1979)',
 '2297    King Kong (1933)',
 '1366    Jaws (1975)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '2623    Run Lola Run (Lola rennt) (1998)',
 '2878    Goldfinger (1964)']

In [34]:
get_recommendations(3, model)

['1180    Raiders of the Lost Ark (1981)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '847    Godfather, The (1972)',
 '1196    Alien (1979)',
 '1366    Jaws (1975)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '585    Terminator 2: Judgment Day (1991)',
 '2502    Matrix, The (1999)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '1023    Die Hard (1988)']

In [200]:
calc_metrics(model, unseens, 11)

                                                                    

NDCG_11 = 0.20874677999109134, HTR_11 = 0.4058674956285215




# NCF

### сперва сделаем датасет

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, dfnp, num_neg):
        # Rating matrix
        self.dfnp = dfnp
       
        self.num_users = dfnp[:, 0].max() + 1
        self.num_items = dfnp[:, 1].max() + 1
        
        self.matrix = np.zeros(shape=(self.num_users, self.num_items))
        # заполняем матрицу
        for u, i in tqdm(self.dfnp, position=0,leave=False):
                self.matrix[u][i] = 1
        
        # Prepare negs
        self.prepare_unseen(self.dfnp, num_neg)
        # Training set
        self.user_input, self.item_input, self.labels = self.get_train_instances(self.dfnp, num_neg)


    def __len__(self):
        return len(self.user_input)

    def __getitem__(self, index):
        'Generates one sample of data.'
        user_input = self.user_input[index]
        item_input = self.item_input[index]
        label = self.labels[index]
        return {
            'user_input': user_input,
            'item_input': item_input,
            'label': label
        }
    
    
    def prepare_unseen(self, dfnp, num_neg):
        self.unseens = dict()
        all_item_set = set(dfnp[:, 1])
        for u in tqdm(set(dfnp[:, 0]), position=0, leave=False, desc='prepare unseen items...'):
            seen = dfnp[dfnp[:, 0] == u, 1]
            seen_set = set(seen)
            unseen_set = all_item_set - seen_set
            self.unseens[u] = list(unseen_set)
        

    def get_train_instances(self, dfnp, num_neg):
        user_input, item_input, labels = [], [], []
        for (u, i) in tqdm(dfnp, position=0, leave=False, desc='prepare dataset ...'):
            user_input.append(u)
            item_input.append(i)
            labels.append(1)
            
            negs = np.random.choice(self.unseens[u], num_neg)
            user_input.extend([u] * num_neg)
            item_input.extend(negs)
            labels.extend([0] * num_neg)           
        return user_input, item_input, labels

### составим датасет

In [103]:
dataset = Dataset(train_data, 6)

                                                                              

### структура MLP

In [105]:
class MLP(nn.Module):
    def __init__(self, num_users, num_items, embed_dim, hidden_size):
        super().__init__()
        self.MLP_user_embedding = nn.Embedding(num_users, embed_dim)
        self.MLP_item_embedding = nn.Embedding(num_items, embed_dim)
        
        self.layers = nn.Sequential(nn.Linear(2 * embed_dim, hidden_size * 2),
                                      nn.ReLU(),
                                      nn.Linear(hidden_size * 2, hidden_size),
                                      nn.ReLU(),
                                      nn.Linear(hidden_size, hidden_size),
                                      nn.ReLU())
        self.output = nn.Linear(hidden_size, 1)
        
    def forward(self, feeddict, train=False):
        user_input = feeddict['user_input']
        item_input = feeddict['item_input']
        # MLP
        MLP_user_embedding = self.MLP_user_embedding(user_input)
        MLP_item_embedding = self.MLP_item_embedding(item_input)
        x = torch.cat([MLP_user_embedding, MLP_item_embedding], -1)
        
        x = self.layers(x)
        if train:
            x = torch.sigmoid(self.output(x))
        return x
    
    def predict(self, data):
        feeddict = dict()
        feeddict['user_input'] = torch.LongTensor(data[:, 0])
        feeddict['item_input'] = torch.LongTensor(data[:, 1])
        with torch.no_grad():
            return self.forward(feeddict, True).numpy().squeeze()
    
    def emb(self):
        self.U = self.MLP_user_embedding.weight.data
        self.I = self.MLP_item_embedding.weight.data

### структура GMF

In [106]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embed_dim):
        super().__init__()
        self.GMF_user_embedding = nn.Embedding(num_users, embed_dim)
        self.GMF_item_embedding = nn.Embedding(num_items, embed_dim)
        
        self.output = nn.Linear(embed_dim, 1)
        
    def forward(self, feeddict, train=False):
        user_input = feeddict['user_input']
        item_input = feeddict['item_input']
        # GMF
        GMF_user_embedding = self.GMF_user_embedding(user_input)
        GMF_item_embedding = self.GMF_item_embedding(item_input)
        x = GMF_user_embedding * GMF_item_embedding
        
        if train:
            x = torch.sigmoid(self.output(x))
        return x
    
    def predict(self, data):
        feeddict = dict()
        feeddict['user_input'] = torch.LongTensor(data[:, 0])
        feeddict['item_input'] = torch.LongTensor(data[:, 1])
        with torch.no_grad():
            return self.forward(feeddict, True).numpy().squeeze()
    
    def emb(self):
        self.U = self.GMF_user_embedding.weight.data
        self.I = self.GMF_item_embedding.weight.data

In [107]:
class Tester:
    def __init__(self, dataset):
        self.dataset = dataset
    
    @staticmethod
    def show_similars(model, item, num):
        model_cos = ((model.I @ model.I[item]) / np.linalg.norm(model.I, axis=1))
        model_samples = model_cos.argsort(descending=True)[:num + 1]
        return [movie_info[movie_info["movie_id"] == ritem_dict[x.item()]]["name"].to_string() 
                for x in model_samples]
    
    
    def show_recomendations(self, model, user, num):
        
        unseen = unseens[user]
        data =  np.column_stack(([user] * len(unseen), unseen))
        predictions = model.predict(data)
        args_r = np.argsort(predictions)[::-1][:num]
        
        return  [movie_info[movie_info["movie_id"] == ritem_dict[x.item()]]["name"].to_string() 
                for x in args_r]
    
T = Tester(dataset)

### претренируем GMF и MLP

In [123]:
mlp = MLP(df_.user_id.max() + 1, df_.movie_id.max() + 1, 32, 32)
gmf = GMF(df_.user_id.max() + 1, df_.movie_id.max() + 1, 32)

mlp_optim = torch.optim.Adam(mlp.parameters(), lr=5e-3)
gmf_optim = torch.optim.Adam(gmf.parameters(), lr=5e-3)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)
criterion = nn.BCELoss()

In [124]:
# Train
loss = np.inf
best_loss = np.inf
for epoch in range(22):
    
    # Training
    for feed_dict in tqdm(train_loader, position=0,leave=False, desc=f'{epoch}: {loss}'):
        for key in feed_dict:
                feed_dict[key] = feed_dict[key].to(dtype = torch.long, device = 'cpu')  
        
        # Forward pass
        mlp_labels = mlp(feed_dict, True)
        gmf_labels = gmf(feed_dict, True)
        
        labels = feed_dict['label'].float().view(mlp_labels.shape)
        mlp_loss = criterion(mlp_labels, labels)
        gmf_loss = criterion(gmf_labels, labels)
        loss = mlp_loss + gmf_loss
        
        
        mlp_optim.zero_grad()
        gmf_optim.zero_grad()
        
        mlp_loss.backward()
        gmf_loss.backward()
        
        mlp_optim.step()
        gmf_optim.step()
        
        if loss < best_loss:
            best_loss = loss.item()
            save_mlp = dp(mlp)
            save_gmf = dp(gmf)
        
    logging.warning(f'{epoch}: current loss: {mlp_loss:.3f} (MLP), {gmf_loss:.3f} (GMF)')
    
mlp.emb()
gmf.emb()



### посмотрим на симиляры

In [212]:
T.show_similars(mlp, 0, 5)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '1180    Raiders of the Lost Ark (1981)',
 '1245    Groundhog Day (1993)',
 '584    Aladdin (1992)',
 '1179    Princess Bride, The (1987)']

In [213]:
T.show_similars(gmf, 0, 5)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '584    Aladdin (1992)',
 '1132    Wrong Trousers, The (1993)',
 '360    Lion King, The (1994)',
 '591    Beauty and the Beast (1991)']

### посмотрим на рекомендации

In [214]:
T.show_recomendations(gmf, 3, 5)

['1277    Real Genius (1985)',
 '2489    Forces of Nature (1999)',
 '955    Outlaw, The (1943)',
 '1174    Madonna: Truth or Dare (1991)',
 '583    Ghost (1990)']

In [215]:
T.show_recomendations(mlp, 3, 5)

['586    Dances with Wolves (1990)',
 '1213    Stalker (1979)',
 "3616    Prizzi's Honor (1985)",
 '1345    Crucible, The (1996)',
 '2863    Days of Heaven (1978)']

### посмотрим на метрики

In [216]:
calc_metrics(mlp, unseens, 11)

                                                                    

NDCG_11 = 0.15358269204087233, HTR_11 = 0.3115269956939384




In [217]:
calc_metrics(gmf, unseens, 11)

                                                                    

NDCG_11 = 0.16074836660115793, HTR_11 = 0.32957933090427294




### структура NCF

In [218]:
class NCF(nn.Module):
    def __init__(self, mlp, gmf, alpha):
        super().__init__()
        self.alpha = alpha
        self.mlp = mlp
        self.gmf = gmf
    
        self.output = nn.Linear(mlp.__dict__['_modules']['output'].in_features
                                + gmf.__dict__['_modules']['output'].in_features, 1)
    
    def forward(self, feeddict):
        user_input = feeddict['user_input']
        item_input = feeddict['item_input']
        
        mlp = self.mlp(feeddict, False)
        gmf = self.gmf(feeddict, False)
        
        r = torch.cat([self.alpha * gmf, (1 - self.alpha) * mlp], dim=-1)
        return torch.sigmoid(self.output(r)) 
    
    def predict(self, data):
        return self.matrix[data[:, 0], data[:, 1]].numpy()
    
    def predict(self, data):
        feeddict = dict()
        feeddict['user_input'] = torch.LongTensor(data[:, 0])
        feeddict['item_input'] = torch.LongTensor(data[:, 1])
        with torch.no_grad():
            return self.forward(feeddict, True).numpy().squeeze()
    
    def emb(self):
        self.I = torch.cat([self.alpha * self.gmf.I, (1 - self.alpha)*self.mlp.I], dim=-1)
        self.U = torch.cat([self.alpha * self.gmf.U, (1 - self.alpha)*self.mlp.U], dim=-1)

In [219]:
ncf = NCF(mlp, gmf, 0.5)
ncf_optim = torch.optim.Adam(ncf.parameters(), lr=5e-3)
ncf.emb()

In [220]:
# Train
loss = np.inf
best_loss = np.inf
for epoch in range(25):
    # Training
    for feed_dict in tqdm(train_loader, position=0,leave=False, desc=f'{epoch}: {loss}'):
        for key in feed_dict:
                feed_dict[key] = feed_dict[key].to(dtype = torch.long, device = 'cpu')  
        
        # Forward pass
        ncf_labels = ncf(feed_dict)
        labels = feed_dict['label'].float().view(ncf_labels.shape)
        loss = nn.BCELoss()(ncf_labels, labels)
        
        
        ncf_optim.zero_grad()
    
        loss.backward()
        
        ncf_optim.step()
        
        if loss < best_loss:
            best_loss = loss.item()
            save_ncf = dp(ncf)
ncf.emb()

                                                                               

### посмотрим на метрики

In [221]:
T.show_similars(ncf, 0, 5)

['0    Toy Story (1995)',
 '584    Aladdin (1992)',
 '3045    Toy Story 2 (1999)',
 '591    Beauty and the Beast (1991)',
 "2286    Bug's Life, A (1998)",
 '360    Lion King, The (1994)']

In [235]:
T.show_recomendations(ncf, 0, 5)

['583    Ghost (1990)',
 '955    Outlaw, The (1943)',
 '1560    Hoodlum (1997)',
 '2681    Radio Days (1987)',
 '2777    Adventures of Milo and Otis, The (1986)']

In [269]:
calc_metrics(ncf, unseens, 11)

                                                                    

NDCG_11 = 0.14741163532204385, HTR_11 = 0.29927128188141766


