In [6]:
# !wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip ml-1m.zip

In [7]:
import pandas as pd
import torch as torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from sklearn.manifold import TSNE

import math
from torch.utils.data import Dataset
import itertools
import seaborn as sns
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tabulate import tabulate
from elasticsearch import Elasticsearch, helpers

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [9]:
movies_df = pd.read_csv(r'E:\DataSet\DataSet\kaggle\get_started\TheMovieLens\ml-1m\movies.dat', sep='::',
                     names=['movieId','title','genres'],
                     encoding='latin-1',engine='python')
movies_df['movieId_index'] = movies_df['movieId'].astype('category').cat.codes

In [10]:
movies_df.head(4)

Unnamed: 0,movieId,title,genres,movieId_index
0,1,Toy Story (1995),Animation|Children's|Comedy,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama,3


In [11]:
users_df = pd.read_csv(r'E:\DataSet\DataSet\kaggle\get_started\TheMovieLens\ml-1m\users.dat',sep='::',
                       header=None,
                       names=['userId', 'gender' ,'age','occupation', 'zipcode'],
                       engine='python')
users_df['gender_index'] = users_df['gender'].astype('category').cat.codes
users_df['age_index'] = users_df['age'].astype('category').cat.codes
users_df['occupation_index'] = users_df['occupation'].astype('category').cat.codes
users_df['userId_index'] = users_df['userId'].astype('category').cat.codes

In [12]:
users_df.head(5)

Unnamed: 0,userId,gender,age,occupation,zipcode,gender_index,age_index,occupation_index,userId_index
0,1,F,1,10,48067,0,0,10,0
1,2,M,56,16,70072,1,6,16,1
2,3,M,25,15,55117,1,2,15,2
3,4,M,45,7,2460,1,4,7,3
4,5,M,25,20,55455,1,2,20,4


In [13]:
ratings=pd.read_csv(r'E:\DataSet\DataSet\kaggle\get_started\TheMovieLens\ml-1m\ratings.dat',sep='::',
                    names=['userId','movieId','rating','time'],engine='python')
ratings=ratings.join(movies_df.set_index('movieId'),on='movieId')
ratings=ratings.join(users_df.set_index('userId'), on='userId')

In [14]:
ratings.head(4)

Unnamed: 0,userId,movieId,rating,time,title,genres,movieId_index,gender,age,occupation,zipcode,gender_index,age_index,occupation_index,userId_index
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1176,F,1,10,48067,0,0,10,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,655,F,1,10,48067,0,0,10,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,902,F,1,10,48067,0,0,10,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,3339,F,1,10,48067,0,0,10,0


In [15]:
feature_columns = ['userId_index','movieId_index','age_index','gender_index','occupation_index']

In [16]:
# This is the width of each feature (number of values)
features_sizes = {
    'userId_index':len(ratings['userId_index'].unique()),
    'movieId_index':len(ratings['movieId_index'].unique()),
    'age_index':len(ratings['age_index'].unique()),
    'gender_index':len(ratings['gender_index'].unique()),
    'occupation_index':len(ratings['occupation_index'].unique()),
}

# calculate offsets.
# Each feature starts from the end of the last one.

next_offset = 0
features_offsets={}
for k,v in features_sizes.items():
    features_offsets[k] = next_offset
    next_offset += v

In [17]:
# map all column indices to start from correct offset
for column in feature_columns:
    ratings[column] = ratings[column].apply(lambda c: c + features_offsets[column])   

In [18]:
#print(tabulate(ratings[[*feature_columns,'rating']].head(4),headers=ratings[[*feature_columns,'rating']].columns,tablefmt="github"))
ratings[[*feature_columns,'rating']].head(5)

Unnamed: 0,userId_index,movieId_index,age_index,gender_index,occupation_index,rating
0,0,7216,9746,9753,9765,5
1,0,6695,9746,9753,9765,3
2,0,6942,9746,9753,9765,3
3,0,9379,9746,9753,9765,4
4,0,8326,9746,9753,9765,5


In [19]:
data_x = torch.tensor(ratings[feature_columns].values)
data_y = torch.tensor(ratings['rating'].values).float()
dataset = data.TensorDataset(data_x, data_y)

In [20]:
bs=1024
train_n = int(len(dataset)*0.9)
valid_n = len(dataset) - train_n
splits = [train_n,valid_n]
assert sum(splits) == len(dataset)
trainset,devset = torch.utils.data.random_split(dataset,splits)
train_dataloader = data.DataLoader(trainset,batch_size=bs,shuffle=True)
dev_dataloader = data.DataLoader(devset,batch_size=bs,shuffle=True)

In [21]:
# copied from fastai: 
def trunc_normal_(x, mean=0., std=1.):
    "Truncated normal initialization."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    return x.normal_().fmod_(2).mul_(std).add_(mean)

In [22]:
class FMModel(nn.Module):
    def __init__(self, n, k):
        super().__init__()

        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)

        # See https://arxiv.org/abs/1711.09160
        with torch.no_grad(): trunc_normal_(self.embeddings.weight, std=0.01)
        with torch.no_grad(): trunc_normal_(self.bias.weight, std=0.01)

    def forward(self, X):
        emb = self.embeddings(X)
        # calculate the interactions in complexity of O(nk) see lemma 3.1 from paper
        pow_of_sum = emb.sum(dim=1).pow(2)
        sum_of_pow = emb.pow(2).sum(dim=1)
        pairwise = (pow_of_sum-sum_of_pow).sum(1)*0.5
        bias = self.bias(X).squeeze().sum(1)
        return torch.sigmoid(self.w0 + bias + pairwise)*5.5

In [23]:
# fit/test functions
def fit(iterator, model, optimizer, criterion):
    train_loss = 0
    model.train()
    for x,y in iterator:
        optimizer.zero_grad()
        y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item()*x.shape[0]
        loss.backward()
        optimizer.step()
    return train_loss / len(iterator.dataset)

def test(iterator, model, criterion):
    train_loss = 0
    model.eval()
    for x,y in iterator:                    
        with torch.no_grad():
            y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item()*x.shape[0]
    return train_loss / len(iterator.dataset)

In [24]:
def train_n_epochs(model, n, optimizer,scheduler):
    criterion = nn.MSELoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss = fit(train_dataloader, model, optimizer, criterion)
        valid_loss = test(dev_dataloader, model, criterion)
        scheduler.step()
        secs = int(time.time() - start_time)
        print(f'epoch {epoch}. time: {secs}[s]')
        print(f'\ttrain rmse: {(math.sqrt(train_loss)):.4f}')
        print(f'\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}')


In [25]:
model = FMModel(data_x.max()+1, 120).to(device)
wd=1e-5
lr=0.001
epochs=10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[7], gamma=0.1)
criterion = nn.MSELoss().to(device)
for epoch in range(epochs):
    start_time = time.time()
    train_loss = fit(train_dataloader, model, optimizer, criterion)
    valid_loss = test(dev_dataloader, model, criterion)
    scheduler.step()
    secs = int(time.time() - start_time)
    print(f'epoch {epoch}. time: {secs}[s]')
    print(f'\ttrain rmse: {(math.sqrt(train_loss)):.4f}')
    print(f'\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}')

epoch 0. time: 13[s]
	train rmse: 0.9427
	validation rmse: 0.9071
epoch 1. time: 13[s]
	train rmse: 0.9001
	validation rmse: 0.8943
epoch 2. time: 13[s]
	train rmse: 0.8771
	validation rmse: 0.8779
epoch 3. time: 13[s]
	train rmse: 0.8524
	validation rmse: 0.8660
epoch 4. time: 13[s]
	train rmse: 0.8283
	validation rmse: 0.8591
epoch 5. time: 13[s]
	train rmse: 0.8030
	validation rmse: 0.8546
epoch 6. time: 13[s]
	train rmse: 0.7755
	validation rmse: 0.8522
epoch 7. time: 13[s]
	train rmse: 0.7308
	validation rmse: 0.8494
epoch 8. time: 13[s]
	train rmse: 0.7249
	validation rmse: 0.8489
epoch 9. time: 13[s]
	train rmse: 0.7205
	validation rmse: 0.8490


## Understand Embeddings

In [26]:
movies = ratings.drop_duplicates('movieId_index').copy()
movie_embeddings = model.embeddings(torch.tensor(movies['movieId_index'].values,device=device).long())
movies['embedding'] = movie_embeddings.tolist()
movie_biases = model.bias(torch.tensor(movies['movieId_index'].values,device=device).long())
movies['bias'] = movie_biases.cpu().detach().numpy()

In [27]:
movies[['title','movieId_index','embedding','bias']]

Unnamed: 0,title,movieId_index,embedding,bias
0,One Flew Over the Cuckoo's Nest (1975),7216,"[-0.13714545965194702, -0.16459739208221436, -...",2.592745e-01
1,James and the Giant Peach (1996),6695,"[-0.1304452121257782, 0.046994879841804504, -0...",-2.247100e-02
2,My Fair Lady (1964),6942,"[0.03056049533188343, -0.012383323162794113, -...",1.213250e-01
3,Erin Brockovich (2000),9379,"[0.0037924121133983135, 0.024567672982811928, ...",2.170747e-01
4,"Bug's Life, A (1998)",8326,"[-0.00697118928655982, 0.10853516310453415, -0...",1.396654e-01
...,...,...,...,...
919876,Modulations (1998),8169,"[-0.014871002174913883, -0.008648834191262722,...",8.294313e-02
940262,Broken Vessels (1998),8674,"[1.4926911501680816e-40, -3.1310753016720145e-...",-3.635613e-40
957826,White Boys (1999),8816,"[-0.02360401675105095, 0.010345765389502048, 0...",-1.329537e-01
970914,One Little Indian (1973),9578,"[-0.023722853511571884, 0.04373824968934059, 0...",1.034661e-01


In [28]:
movies_subset = movies[movies['genres'].str.contains('Children\'s|Horror|Documentary')].copy()
X = np.stack(movies_subset['embedding'].values)
ldr = TSNE(n_components=2, random_state=0)
Y = ldr.fit_transform(X)
movies_subset['x'] = Y[:, 0]
movies_subset['y'] = Y[:, 1]

In [None]:
def single_genre(g):
    for i in ['Children\'s', 'Horror', 'Documentary']:
        if i in g: return i
        
movies_subset['genres']=movies_subset['genres'].apply(single_genre)
plt.figure(figsize=(12, 12))
ax = sns.scatterplot(x="x", y="y", hue='genres',data=movies_subset)
plt.savefig('movie_emb.png', bbox_inches='tight')

In [None]:
star_wars_5_index=torch.tensor(6297,device=device) 	
star_war_embeddings = model.embeddings(star_wars_5_index)
cosine_similarities = torch.tensor([F.cosine_similarity(star_war_embeddings,i,dim=0) for i in movie_embeddings])
movies.iloc[cosine_similarities.argsort(descending=True).detach().numpy()]['title'].values[:10]

In [None]:
toy_story_index=torch.tensor(6040,device=device)
toy_story_embeddings = model.embeddings(toy_story_index)
cosine_similarities = torch.tensor([F.cosine_similarity(toy_story_embeddings,i,dim=0) 
                                    for i in movie_embeddings])
[i for i in movies.iloc[cosine_similarities.argsort(descending=True).detach().numpy()]['title'].values[:10]]

## Recommending

In [None]:
man_embedding = model.embeddings(torch.tensor(9754,device=device))
age18_25_embedding = model.embeddings(torch.tensor(9747,device=device))
metadata_embedding = man_embedding+age18_25_embedding
rankings = movie_biases.squeeze()+(metadata_embedding*movie_embeddings).sum(1)
[i for i in movies.iloc[rankings.argsort(descending=True).cpu()]['title'].values][:10]

In [None]:
woman_embedding = model.embeddings(torch.tensor(9753,device=device))
age50_56_embedding = model.embeddings(torch.tensor(9751,device=device))
metadata_embedding = woman_embedding+age50_56_embedding
rankings = movie_biases.squeeze()+(metadata_embedding*movie_embeddings).sum(1)
[i for i in movies.iloc[rankings.argsort(descending=True).cpu()]['title'].values][:10]

## Elasticsearch


In [None]:
es = Elasticsearch()

In [None]:
def generate_movie_docs():
    for i, movie in movies.iterrows():
        yield {
            '_index': 'recsys',
            '_id': f'movie_{movie["movieId"]}',
            '_source': {'embedding':movie['embedding'],
                    'bias':movie['bias'],
                    'feature_type':'movie',
                    'title':movie['title']
                   }
        }
helpers.bulk(es,generate_movie_docs())

In [None]:
users = ratings.drop_duplicates('userId_index').copy()
users['embedding'] = model.embeddings(torch.tensor(users['userId_index'].values,device=device)).tolist()
users['bias'] = model.bias(torch.tensor(users['userId_index'].values,device=device)).detach().numpy()

def generate_user_docs():
    for i, user in users.iterrows():
        yield {
            '_index': 'recsys',
            '_id': f'user_{user["userId"]}',
            '_source': {'embedding':user['embedding'],
                    'bias':user['bias'],
                    'feature_type':'user',
                   }
        }
helpers.bulk(es,generate_user_docs())

In [None]:
ages = [i+features_offsets['age_index'] for i in range(features_sizes['age_index'])]
def generate_age_docs():
    for age_index in ages:
        yield {
            '_index': 'recsys',
            '_id': f'age_{age_index}',
            '_source': {'embedding':model.embeddings(torch.tensor(age_index,device=device)).tolist(),
                    'bias':model.bias(torch.tensor(age_index,device=device)).item(),
                    'feature_type':'age',
                   }
        }
helpers.bulk(es,generate_age_docs())

In [None]:
genders = [i+features_offsets['gender_index'] for i in range(features_sizes['gender_index'])]
def generate_gender_docs():
    for gender_index in genders:
        yield {
            '_index': 'recsys',
            '_id': f'gender_{gender_index}',
            '_source': {'embedding':model.embeddings(torch.tensor(gender_index,device=device)).tolist(),
                    'bias':model.bias(torch.tensor(gender_index,device=device)).item(),
                    'feature_type':'gender',
                   }
        }
helpers.bulk(es,generate_gender_docs())

In [None]:
occupations = [i+features_offsets['occupation_index'] for i in range(features_sizes['occupation_index'])]
def generate_occupation_docs():
    for occupation_index in occupations:
        yield {
            '_index': 'recsys',
            '_id': f'occupation_{occupation_index}',
            '_source': {'embedding':model.embeddings(torch.tensor(occupation_index,device=device)).tolist(),
                    'bias':model.bias(torch.tensor(occupation_index,device=device)).item(),
                    'feature_type':'occupation',
                   }
        }
helpers.bulk(es,generate_occupation_docs())

In [None]:
metadata = es.mget({"docs":[
    {
        "_index" : "recsys",
        "_id" : "age_9747"},
    {
        "_index" : "recsys",
        "_id" : "gender_9754"}]})
embeddings = [doc['_source']['embedding'] for doc in metadata['docs']]
v_metadata = [sum(pair) for pair in zip(*embeddings)]


In [None]:
search_body = {
    "query": {
    "script_score": {
      "query" : {
          "bool" : {
          "filter" : {
            "term" : {
              "feature_type" : "movie" 
            }
          }
        }
      },
      "script": {
        "source": "dotProduct(params.query_vector, \u0027embedding\u0027) + doc[\u0027bias\u0027].value", 
        "params": {
          "query_vector": v_metadata
        }
      }
    }
  }
}
[hit['_source']['title'] 
 for hit in es.search(search_body,index='recsys',_source_includes='title')['hits']['hits']]
