In [1]:
import pandas as pd
import torch as torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from sklearn.manifold import TSNE

import math
from torch.utils.data import Dataset
import itertools
import seaborn as sns
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tabulate import tabulate



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# movies_df = pd.read_csv('/kaggle/input/movielens-1m-dataset/movies.dat', sep='::',
#                      names=['movieId','title','genres'],
#                      encoding='latin-1',engine='python')

movies_df = pd.read_csv('/kaggle/input/grouplens-2018/ml-latest/movies.csv')

movies_df['movieId_index'] = movies_df['movieId'].astype('category').cat.codes

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres,movieId_index
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [None]:
# users_df = pd.read_csv('/kaggle/input/movielens-1m-dataset/users.dat',sep='::',
#                        header=None,
#                        names=['userId', 'gender' ,'age','occupation', 'zipcode'],
#                        engine='python')
# users_df['gender_index'] = users_df['gender'].astype('category').cat.codes
# users_df['age_index'] = users_df['age'].astype('category').cat.codes
# users_df['occupation_index'] = users_df['occupation'].astype('category').cat.codes
# users_df['userId_index'] = users_df['userId'].astype('category').cat.codes

In [None]:
# users_df.head()

In [5]:
# ratings=pd.read_csv('/kaggle/input/movielens-1m-dataset/ratings.dat',sep='::',
#                     names=['userId','movieId','rating','time'],engine='python')

ratings=pd.read_csv('/kaggle/input/grouplens-2018/ml-latest/ratings.csv')
ratings=ratings.join(movies_df.set_index('movieId'),on='movieId')
# ratings=ratings.join(users_df.set_index('userId'), on='userId')

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movieId_index
0,1,307,3.5,1256677221,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,304
1,1,481,3.5,1256677456,Kalifornia (1993),Drama|Thriller,477
2,1,1091,1.5,1256677471,Weekend at Bernie's (1989),Comedy,1069
3,1,1257,4.5,1256677460,Better Off Dead... (1985),Comedy|Romance,1229
4,1,1449,4.5,1256677264,Waiting for Guffman (1996),Comedy,1414


In [7]:
feature_columns = ['userId','movieId_index']

In [8]:
n_movie_unique = len(ratings['movieId_index'].unique())
n_user_unique = len(ratings['userId'].unique())
print(n_movie_unique, n_user_unique)

53889 283228


In [9]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movieId_index
0,1,307,3.5,1256677221,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,304
1,1,481,3.5,1256677456,Kalifornia (1993),Drama|Thriller,477
2,1,1091,1.5,1256677471,Weekend at Bernie's (1989),Comedy,1069
3,1,1257,4.5,1256677460,Better Off Dead... (1985),Comedy|Romance,1229
4,1,1449,4.5,1256677264,Waiting for Guffman (1996),Comedy,1414
...,...,...,...,...,...,...,...
27753439,283228,8542,4.5,1379882795,"Day at the Races, A (1937)",Comedy|Musical,7885
27753440,283228,8712,4.5,1379882751,My Favorite Wife (1940),Comedy|Romance,8029
27753441,283228,34405,4.5,1379882889,Serenity (2005),Action|Adventure|Sci-Fi,10304
27753442,283228,44761,4.5,1354159524,Brick (2005),Crime|Drama|Film-Noir|Mystery,10964


In [10]:
features_sizes = {
    'userId': n_user_unique,
    'movieId_index': n_movie_unique,
#     'age_index':len(ratings['age_index'].unique()),
#     'gender_index':len(ratings['gender_index'].unique()),
#     'occupation_index':len(ratings['occupation_index'].unique()),
}

next_offset = 0
features_offsets={}
for k,v in features_sizes.items():
    features_offsets[k] = next_offset
    next_offset += v

In [11]:
features_offsets

{'userId': 0, 'movieId_index': 283228}

In [12]:
for column in feature_columns:
    ratings[column] = ratings[column].apply(lambda c: c + features_offsets[column])   

In [13]:
ratings[[*feature_columns,'rating']].head()

Unnamed: 0,userId,movieId_index,rating
0,1,283532,3.5
1,1,283705,3.5
2,1,284297,1.5
3,1,284457,4.5
4,1,284642,4.5


In [14]:
data_x = torch.tensor(ratings[feature_columns].values)
data_y = torch.tensor(ratings['rating'].values).float()
dataset = data.TensorDataset(data_x, data_y)

In [15]:
bs=1024
train_n = int(len(dataset)*0.9)
valid_n = len(dataset) - train_n
splits = [train_n,valid_n]
assert sum(splits) == len(dataset)
trainset,devset = torch.utils.data.random_split(dataset,splits)
train_dataloader = data.DataLoader(trainset,batch_size=bs,shuffle=True)
dev_dataloader = data.DataLoader(devset,batch_size=bs,shuffle=True)

In [16]:
def trunc_normal_(x, mean=0., std=1.):
    "Truncated normal initialization."
    return x.normal_().fmod_(2).mul_(std).add_(mean)

In [17]:
class FMModel(nn.Module):
    def __init__(self, n, k):
        super().__init__()

        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)

        with torch.no_grad(): trunc_normal_(self.embeddings.weight, std=0.01)
        with torch.no_grad(): trunc_normal_(self.bias.weight, std=0.01)

    def forward(self, X):
        emb = self.embeddings(X)
        pow_of_sum = emb.sum(dim=1).pow(2)
        sum_of_pow = emb.pow(2).sum(dim=1)
        pairwise = (pow_of_sum-sum_of_pow).sum(1)*0.5
        bias = self.bias(X).squeeze().sum(1)
        return torch.sigmoid(self.w0 + bias + pairwise)*5.5

In [18]:
def fit(iterator, model, optimizer, criterion):
    train_loss = 0
    model.train()
    for x,y in iterator:
        optimizer.zero_grad()
        y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item()*x.shape[0]
        loss.backward()
        optimizer.step()
    return train_loss / len(iterator.dataset)

def test(iterator, model, criterion):
    train_loss = 0
    model.eval()
    for x,y in iterator:                    
        with torch.no_grad():
            y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item()*x.shape[0]
    return train_loss / len(iterator.dataset)

In [19]:
def train_n_epochs(model, n, optimizer,scheduler):
    criterion = nn.MSELoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss = fit(train_dataloader, model, optimizer, criterion)
        valid_loss = test(dev_dataloader, model, criterion)
        scheduler.step()
        secs = int(time.time() - start_time)
        print(f'epoch {epoch}. time: {secs}[s]')
        print(f'\ttrain rmse: {(math.sqrt(train_loss)):.4f}')
        print(f'\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}')

In [21]:
model = FMModel(data_x.max()+1, 120).to(device)
wd=1e-5
lr=0.001
epochs=5
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[7], gamma=0.1)
criterion = nn.MSELoss().to(device)
for epoch in range(epochs):
    start_time = time.time()
    train_loss = fit(train_dataloader, model, optimizer, criterion)
    valid_loss = test(dev_dataloader, model, criterion)
    scheduler.step()
    secs = int(time.time() - start_time)
    print(f'epoch {epoch}. time: {secs}[s]')
    print(f'\ttrain rmse: {(math.sqrt(train_loss)):.4f}')
    print(f'\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}')

epoch 0. time: 577[s]
	train rmse: 0.8819
	validation rmse: 0.8555
epoch 1. time: 577[s]
	train rmse: 0.8513
	validation rmse: 0.8493
epoch 2. time: 578[s]
	train rmse: 0.8468
	validation rmse: 0.8464
epoch 3. time: 578[s]
	train rmse: 0.8441
	validation rmse: 0.8448
epoch 4. time: 578[s]
	train rmse: 0.8426
	validation rmse: 0.8437


In [76]:
movies = ratings.drop_duplicates('movieId_index').copy()
movie_embeddings = model.embeddings(torch.tensor(movies['movieId_index'].values,device=device).long())
movies['embedding'] = movie_embeddings.tolist()
movie_biases = model.bias(torch.tensor(movies['movieId_index'].values,device=device).long())
movies['bias'] = movie_biases.cpu().detach().numpy()

NameError: name 'movies' is not defined

In [77]:
movies[['title','movieId_index','embedding','bias']]

Unnamed: 0,title,movieId_index,embedding,bias
0,One Flew Over the Cuckoo's Nest (1975),7216,"[-0.18085111677646637, 0.11375931650400162, -0...",0.416962
1,James and the Giant Peach (1996),6695,"[0.2736254930496216, 0.21331946551799774, -0.0...",-0.045954
2,My Fair Lady (1964),6942,"[0.15543237328529358, 0.11537939310073853, -0....",0.243651
3,Erin Brockovich (2000),9379,"[0.1369326412677765, -0.29257506132125854, -0....",0.331395
4,"Bug's Life, A (1998)",8326,"[0.28984084725379944, -0.03195134922862053, -0...",0.223239
...,...,...,...,...
919876,Modulations (1998),8169,"[0.02889641560614109, -0.07132098078727722, -0...",0.099995
940262,Broken Vessels (1998),8674,"[-0.006507044658064842, 0.0005138540873304009,...",-0.024744
957826,White Boys (1999),8816,"[0.0058865477330982685, -0.0463237464427948, 0...",-0.138961
970914,One Little Indian (1973),9578,"[0.054615385830402374, 0.009685531258583069, -...",0.119665


In [None]:
movies[movies.movieId == 1]

In [78]:
toy_story_index = torch.tensor(6040).to(device)
toy_story_embedding = model.embeddings(toy_story_index)
cosine_similarities = torch.tensor([F.cosine_similarity(toy_story_embedding,i,dim=0) for i in movie_embeddings])
movies.iloc[cosine_similarities.argsort(descending=True).detach().numpy()]['title'].values[:10]

array(['Toy Story (1995)', 'Toy Story 2 (1999)', "Bug's Life, A (1998)",
       'Aladdin (1992)', 'Mulan (1998)', 'Beauty and the Beast (1991)',
       'Babe (1995)', 'Cold Fever (Á köldum klaka) (1994)',
       'My Man Godfrey (1936)', 'Denise Calls Up (1995)'], dtype=object)