# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [2]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikitasenyatkin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.1. Helper functions to avoid copypaste

In [3]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [4]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

# 2. Main

## 2.1. Data Preparation

In [5]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [6]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [7]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [8]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

In [9]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [10]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [11]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_doc

[['happily',
  'circumstances',
  'differences',
  'place',
  'buzz',
  'scene',
  'room',
  'learns',
  'woody',
  'brings',
  'afraid',
  'onto',
  'losing',
  'toys',
  'led',
  'aside',
  'heart',
  'duo',
  'put',
  'lightyear',
  'eventually',
  'plots',
  'andy',
  'owner',
  'separate',
  'birthday',
  'live'],
 ['running',
  'world',
  'adult',
  'living',
  'opens',
  'freedom',
  'hope',
  'discover',
  'game',
  'monkeys',
  'giant',
  'room',
  'terrifying',
  'board',
  'magical',
  'door',
  'alan',
  'find',
  'evil',
  'finish',
  'proves',
  'risky',
  'enchanted',
  'three',
  'siblings',
  'invite',
  'creatures',
  'trapped',
  'judy',
  'peter',
  'rhinoceroses',
  'inside',
  'years',
  'unwittingly'],
 ['bait',
  'opens',
  'time',
  'max',
  'local',
  'fish',
  'ancient',
  'locals',
  'meanwhile',
  'italian',
  'neighbors',
  'sultry',
  'less',
  'door',
  'cooking',
  'scare',
  'interested',
  'family',
  'worry',
  'reignites',
  'next',
  'buddies',
  '

In [12]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [13]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['running', 'world', 'adult', 'living', 'opens', 'freedom', 'hope', 'discover', 'game', 'monkeys', 'giant', 'room', 'terrifying', 'board', 'magical', 'door', 'alan', 'find', 'evil', 'finish', 'proves', 'risky', 'enchanted', 'three', 'siblings', 'invite', 'creatures', 'trapped', 'judy', 'peter', 'rhinoceroses', 'inside', 'years', 'unwittingly'], tags=['1'])

# 2.2. Model Training and Evaluation

In [14]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [15]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [16]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [17]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [18]:
# get id
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [19]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [20]:
movie_embeddings = movies_vectors[movie_id]

In [21]:
movie_embeddings

array([-0.01260352, -0.17648646,  0.31885964, -0.00316406, -0.00963075,
        0.05991915, -0.43649006, -0.03407767, -0.3402733 ,  0.11028342,
       -0.2848994 , -0.01092427, -0.03176834, -0.27425322,  0.10205461,
       -0.1492908 ,  0.26643065, -0.258164  ,  0.1517608 , -0.07475427,
        0.08462706,  0.14358222,  0.00954648,  0.12048364,  0.13801584,
        0.16123997, -0.05570374, -0.07482784,  0.2728512 , -0.2857733 ,
        0.12358125, -0.01469085, -0.00721742, -0.03931078, -0.23107149,
        0.1643544 ,  0.12541144, -0.11195131,  0.27103552,  0.2093133 ,
        0.11221762,  0.01719217,  0.0029151 ,  0.09990697, -0.02530034,
       -0.29180762,  0.18225749, -0.1321083 , -0.08278154,  0.4370748 ],
      dtype=float32)

In [22]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,7772,0.957393
2,13835,0.950422
3,5713,0.949719
4,43461,0.949436


In [23]:
# reverse values and indices to map names in dataframe
name_mapper = {v: k for k, v in movies_inv_mapper.items()}

In [24]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,7772,0.957393,this island earth
2,13835,0.950422,k2
3,5713,0.949719,rollover
4,43461,0.949436,megafault
5,30134,0.948247,spy
6,18294,0.945416,the darkest hour
7,27658,0.945238,giperboloid inzhenera garina
8,1045,0.944958,sleeper
9,9883,0.944288,kaijū daisensō


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

In [25]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')


    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc


In [26]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model

## TO DO work

In [27]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [28]:
new_sample = movies_metadata[['id', 'original_title', 'overview', 'genres', 'tagline']]

In [29]:
new_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
 3   genres          45466 non-null  object
 4   tagline         20412 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [30]:
new_sample.loc[new_sample['overview'].isnull(), 'overview'] = new_sample.loc[new_sample['overview'].isnull(), 'original_title']
new_sample['tagline'] = new_sample['tagline'].fillna(new_sample['original_title'])

In [31]:
new_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        45466 non-null  object
 3   genres          45466 non-null  object
 4   tagline         45466 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [32]:
new_sample = new_sample.reset_index().rename(columns = {'index': 'model_index'})
new_sample['model_index'] = new_sample['model_index'].astype(str)
new_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   model_index     45466 non-null  object
 1   id              45466 non-null  object
 2   original_title  45466 non-null  object
 3   overview        45466 non-null  object
 4   genres          45466 non-null  object
 5   tagline         45466 non-null  object
dtypes: object(6)
memory usage: 2.1+ MB


In [33]:
movies_inv_mapper = dict(zip(new_sample['original_title'].str.lower(), new_sample['model_index'].astype(int)))

In [34]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_cols = ['overview', 'original_title', 'tagline']):
    '''text preprocessing
    '''
    # concatenate the text from all columns of interest
    tags_corpus = agg_tags[text_cols].apply(lambda x: ' '.join(x), axis=1).values
    
    # preprocess the text
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc

In [35]:
tags_array = get_clean_tags_array(new_sample)

In [36]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model

In [37]:
model = train_embeddings(tags_array)

In [38]:
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [39]:
movie_vectors = model.dv.vectors[movie_id]

In [40]:
movie_embeddings = movies_vectors[movie_id]

In [41]:
movie_embeddings

array([-0.01260352, -0.17648646,  0.31885964, -0.00316406, -0.00963075,
        0.05991915, -0.43649006, -0.03407767, -0.3402733 ,  0.11028342,
       -0.2848994 , -0.01092427, -0.03176834, -0.27425322,  0.10205461,
       -0.1492908 ,  0.26643065, -0.258164  ,  0.1517608 , -0.07475427,
        0.08462706,  0.14358222,  0.00954648,  0.12048364,  0.13801584,
        0.16123997, -0.05570374, -0.07482784,  0.2728512 , -0.2857733 ,
        0.12358125, -0.01469085, -0.00721742, -0.03931078, -0.23107149,
        0.1643544 ,  0.12541144, -0.11195131,  0.27103552,  0.2093133 ,
        0.11221762,  0.01719217,  0.0029151 ,  0.09990697, -0.02530034,
       -0.29180762,  0.18225749, -0.1321083 , -0.08278154,  0.4370748 ],
      dtype=float32)

In [42]:
similars = model.docvecs.most_similar(positive=[movie_embeddings], topn=20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,34282,0.915982
1,32667,0.912129
2,13779,0.910338
3,24762,0.908032
4,45022,0.907549


In [43]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output

Unnamed: 0,model_index,model_score,title_name
0,34282,0.915982,kshanbhar vishranti
1,32667,0.912129,some voices
2,13779,0.910338,imagine that
3,24762,0.908032,the returned
4,45022,0.907549,ماجرای نیمروز
5,22855,0.906911,dug's special mission
6,34199,0.906633,der rest ist schweigen
7,34896,0.906432,recep i̇vedik 2
8,19771,0.905578,stolen
9,43461,0.905525,megafault


As we can see, model with more attributes works worse, it doesn't have even batman by itself

In [None]:
from tensorflow.summary import FileWriter
from tensorflow.summary import TensorSummary




In [None]:
# assume that the embeddings are stored in a variable named "embeddings"
# and the metadata is stored in a variable named "metadata"
# each row of the metadata file contains the name and link of a film, separated by a tab character
# for example, "The Shawshank Redemption\thttps://www.imdb.com/title/tt0111161/"

# create a summary writer that writes to a log directory named "logs"
writer = FileWriter("logs")

# create a tensor summary for the embeddings
tensor_summary = TensorSummary(
    tensor=embeddings,
    metadata=metadata,
    name="film_embeddings"
)

# add the tensor summary to the writer
writer.add_summary(tensor_summary)

# close the writer
writer.close()

As we can see, model with more attributes works worse, also, I don't know, why there is new model_indexes, that are not in new_sample_df