In [2]:
!pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable




In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the movie catalogue from a path relative to the notebook's working directory.
# The rendered output shows the columns movieId, title and genres.
movies = pd.read_csv('data/movies.csv')
# Bare last expression so the notebook renders the (truncated) frame.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
# Build the text that gets embedded for each movie:
#   "Title: <title>, Genre: <genres>"
# str.cat joins the two columns element-wise with the separator in between.
movies['description'] = (
    'Title: '
    + movies['title'].str.cat(movies['genres'], sep=', Genre: ')
)
movies

Unnamed: 0,movieId,title,genres,description
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"Title: Toy Story (1995), Genre: Adventure|Anim..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"Title: Jumanji (1995), Genre: Adventure|Childr..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"Title: Grumpier Old Men (1995), Genre: Comedy|..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"Title: Waiting to Exhale (1995), Genre: Comedy..."
4,5,Father of the Bride Part II (1995),Comedy,"Title: Father of the Bride Part II (1995), Gen..."
...,...,...,...,...
62418,209157,We (2018),Drama,"Title: We (2018), Genre: Drama"
62419,209159,Window of the Soul (2001),Documentary,"Title: Window of the Soul (2001), Genre: Docum..."
62420,209163,Bad Poems (2018),Comedy|Drama,"Title: Bad Poems (2018), Genre: Comedy|Drama"
62421,209169,A Girl Thing (2001),(no genres listed),"Title: A Girl Thing (2001), Genre: (no genres ..."


In [6]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Tensor with one entry per token; it is broadcast over
            the embedding axis and used as a multiplicative weight, so entries
            equal to 0 drop the corresponding token from the average.

    Returns:
        Tensor with the sequence axis (dim 1) reduced away: the mask-weighted
        sum of token embeddings divided by the mask total.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape so
    # it can be multiplied element-wise.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# Dedicated handle on the raw Hugging Face encoder. The global name `model` is
# rebound to a SentenceTransformer in a later cell; get_embeddings must keep
# calling the AutoModel loaded here, so it uses this private name instead of
# whatever `model` happens to point at when it is invoked.
_hf_model = model

def get_embeddings(sentences):
    """Embed sentences with the Hugging Face tokenizer/encoder pair loaded above.

    Args:
        sentences: Text input passed straight to the tokenizer (padding and
            truncation are enabled, tensors are returned as PyTorch tensors).

    Returns:
        Tensor of sentence embeddings: mean-pooled token embeddings, L2-normalised
        along dim 1.
    """
    encoded_input = tokenizer(sentences, padding = True, truncation = True, return_tensors = 'pt')
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        model_output = _hf_model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)
    return sentence_embeddings

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [7]:
# Load the sentence-transformers wrapper for the same checkpoint and embed every
# movie description in one call. Note that this rebinds the global name `model`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Row i of movies_embeddings corresponds to row i (by position) of `movies`.
movies_embeddings = model.encode(list(movies['description']))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
def get_recommendations(query, embeddings, df, top_n = 5):
    """Return the rows of `df` whose embeddings are most similar to `query`.

    The query is encoded with the global `model` and compared against
    `embeddings` by cosine similarity.

    Args:
        query: Free-text search string.
        embeddings: 2-D array of row embeddings; row i must describe the row at
            position i of `df`, because the result is selected with `df.iloc`.
        df: DataFrame to pick the recommended rows from.
        top_n: Maximum number of rows to return. 0 yields an empty frame; values
            larger than the number of embeddings return every row.

    Returns:
        DataFrame slice of `df`, ordered from most to least similar.

    Raises:
        ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)
    # Reverse to descending order first, then truncate. Slicing with
    # [-top_n:] before reversing would select *every* index when top_n == 0,
    # because -0 is 0 and [0:] is the whole array.
    top_indices = similarities[0].argsort()[::-1][:top_n]
    return df.iloc[top_indices]

In [14]:
# Single-keyword query: a bare name with no genre or year hint.
query = "Dylan"
recommendations = get_recommendations(
    query=query,
    embeddings=movies_embeddings,
    df=movies,
)
# Show only the human-readable columns.
recommendations.loc[:, ['title', 'genres']]

Unnamed: 0,title,genres
32204,Cas & Dylan (2013),Comedy|Drama
40600,Dylan Moran: Off The Hook (2015),Comedy
16779,Dylan Dog: Dead of Night (2010),Comedy|Horror|Mystery|Thriller
10284,No Direction Home: Bob Dylan (2005),Documentary
60370,Rolling Thunder Revue: A Bob Dylan Story by Ma...,Documentary


In [10]:
# Natural-language query naming two genres at once.
query = "Movies falling under the comedy and horror genres"
recommendations = get_recommendations(
    query=query,
    embeddings=movies_embeddings,
    df=movies,
)
# Show only the human-readable columns.
recommendations.loc[:, ['title', 'genres']]

Unnamed: 0,title,genres
46122,Range 15 (2016),Comedy|Horror
32261,"Visit, The (2015)",Comedy|Horror
41423,Scare Campaign (2016),Comedy|Horror
7399,Versus (2000),Action|Comedy|Fantasy|Horror
31265,Stalled (2013),Comedy|Horror


In [11]:
# Loosely worded query: a mood ("funny") plus a genre.
query = "Funny comedies to watch"
recommendations = get_recommendations(
    query=query,
    embeddings=movies_embeddings,
    df=movies,
)
# Show only the human-readable columns.
recommendations.loc[:, ['title', 'genres']]

Unnamed: 0,title,genres
60050,Funny story (2018),Comedy|Drama
49032,Elementary (2017),Comedy|Drama
11653,"TV Set, The (2006)",Comedy|Drama
20284,"To Do List, The (2013)",Comedy
19348,21 and Over (2013),Comedy


In [12]:
# Query combining a genre with a release decade.
# NOTE(review): the saved output includes 2010 and 2018 titles, so similarity
# search alone does not appear to act as a year filter — confirm before relying on it.
query = "Romantic comedies released in the 1990s"
recommendations = get_recommendations(
    query=query,
    embeddings=movies_embeddings,
    df=movies,
)
# Show only the human-readable columns.
recommendations.loc[:, ['title', 'genres']]

Unnamed: 0,title,genres
15552,"Romantics, The (2010)",Comedy|Drama|Romance
7992,Funny About Love (1990),Comedy|Romance
55735,The New Romantic (2018),Comedy|Drama
184,Nine Months (1995),Comedy|Romance
7772,Book of Love (1990),Comedy|Romance
