# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [2]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikitasenyatkin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.1. Helper functions to avoid copypaste

In [3]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [4]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

# 2. Main

## 2.1. Data Preparation

In [5]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [6]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [7]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [8]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

In [9]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [10]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [11]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_doc

[['differences',
  'circumstances',
  'room',
  'andy',
  'lightyear',
  'woody',
  'separate',
  'brings',
  'owner',
  'duo',
  'afraid',
  'buzz',
  'losing',
  'learns',
  'onto',
  'happily',
  'place',
  'live',
  'aside',
  'birthday',
  'heart',
  'led',
  'toys',
  'eventually',
  'scene',
  'plots',
  'put'],
 ['creatures',
  'board',
  'room',
  'opens',
  'running',
  'world',
  'unwittingly',
  'monkeys',
  'terrifying',
  'years',
  'finish',
  'find',
  'giant',
  'evil',
  'freedom',
  'hope',
  'three',
  'peter',
  'invite',
  'game',
  'magical',
  'discover',
  'siblings',
  'door',
  'alan',
  'risky',
  'enchanted',
  'rhinoceroses',
  'judy',
  'inside',
  'living',
  'proves',
  'trapped',
  'adult'],
 ['neighbors',
  'worry',
  'wedding',
  'hot',
  'max',
  'family',
  'fishing',
  'seafood',
  'opens',
  'buddies',
  'less',
  'italian',
  'fish',
  'cooking',
  'away',
  'bait',
  'alarming',
  'sultry',
  'shop',
  'next',
  'ancient',
  'reignites',
  'loc

In [12]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [13]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['creatures', 'board', 'room', 'opens', 'running', 'world', 'unwittingly', 'monkeys', 'terrifying', 'years', 'finish', 'find', 'giant', 'evil', 'freedom', 'hope', 'three', 'peter', 'invite', 'game', 'magical', 'discover', 'siblings', 'door', 'alan', 'risky', 'enchanted', 'rhinoceroses', 'judy', 'inside', 'living', 'proves', 'trapped', 'adult'], tags=['1'])

# 2.2. Model Training and Evaluation

In [14]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [15]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [16]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [17]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [18]:
# get id
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [19]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [26]:
movie_embeddings = movies_vectors[movie_id]

In [27]:
movie_embeddings

array([-7.0131160e-02, -3.3835763e-01,  3.0831149e-01,  1.3182771e-02,
       -5.7961393e-02,  1.2275443e-05, -4.0449625e-01,  2.0281950e-03,
       -3.7153825e-01, -9.2607411e-03, -3.3908996e-01, -6.4845763e-02,
        3.1051576e-02, -1.4093252e-01,  1.9463678e-01, -3.7134260e-02,
        2.1508460e-01, -1.4130819e-01,  4.6053428e-02, -1.7604569e-01,
        1.3203999e-01,  4.5756478e-02,  7.5088665e-02,  2.7475566e-01,
        1.6631313e-01,  2.3973122e-01, -5.5711973e-02, -1.2349634e-01,
        2.0342937e-01, -1.7663518e-01,  5.8568440e-02,  5.8960021e-02,
       -1.1412838e-02,  1.4288864e-01, -3.0591339e-01,  2.1894985e-01,
        4.1225806e-02, -1.2414209e-01,  2.0681736e-01,  2.5548238e-01,
        6.2928066e-02,  1.6401391e-02, -9.3745425e-02,  1.5540864e-01,
        9.4812810e-03, -3.0466881e-01, -4.8220363e-02, -4.6033621e-02,
       -1.4343587e-01,  3.3848348e-01], dtype=float32)

In [21]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,5713,0.960702
2,7772,0.958968
3,13835,0.957286
4,18294,0.955941


In [22]:
# reverse values and indices to map names in dataframe
name_mapper = {v: k for k, v in movies_inv_mapper.items()}

In [23]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,5713,0.960702,rollover
2,7772,0.958968,this island earth
3,13835,0.957286,k2
4,18294,0.955941,the darkest hour
5,43461,0.954918,megafault
6,43165,0.954657,the zookeeper's wife
7,44366,0.954133,"abraxas, guardian of the universe"
8,44339,0.952415,the underground world
9,8916,0.950602,killer klowns from outer space


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

In [24]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')


    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc


In [25]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model