# 0. Configuration

In [62]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'

# 1. Modules and functions

In [63]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikitasenyatkin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.1. Helper functions to avoid copypaste

In [64]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [65]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

# 2. Main

## 2.1. Data Preparation

In [66]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [67]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [68]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [69]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

In [70]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [71]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [72]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_doc

[['heart',
  'losing',
  'led',
  'toys',
  'happily',
  'aside',
  'owner',
  'birthday',
  'afraid',
  'live',
  'plots',
  'lightyear',
  'onto',
  'eventually',
  'scene',
  'differences',
  'woody',
  'andy',
  'circumstances',
  'buzz',
  'learns',
  'place',
  'brings',
  'room',
  'put',
  'duo',
  'separate'],
 ['opens',
  'discover',
  'rhinoceroses',
  'game',
  'judy',
  'magical',
  'freedom',
  'finish',
  'invite',
  'hope',
  'find',
  'running',
  'three',
  'inside',
  'proves',
  'siblings',
  'peter',
  'evil',
  'adult',
  'creatures',
  'door',
  'monkeys',
  'unwittingly',
  'risky',
  'years',
  'giant',
  'living',
  'board',
  'room',
  'enchanted',
  'alan',
  'terrifying',
  'world',
  'trapped'],
 ['opens',
  'locals',
  'family',
  'restaurant',
  'reignites',
  'wedding',
  'scare',
  'seafood',
  'shop',
  'hot',
  'fishing',
  'worry',
  'john',
  'feud',
  'meanwhile',
  'less',
  'bait',
  'away',
  'interested',
  'max',
  'fish',
  'ancient',
  'bud

In [73]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [74]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['opens', 'discover', 'rhinoceroses', 'game', 'judy', 'magical', 'freedom', 'finish', 'invite', 'hope', 'find', 'running', 'three', 'inside', 'proves', 'siblings', 'peter', 'evil', 'adult', 'creatures', 'door', 'monkeys', 'unwittingly', 'risky', 'years', 'giant', 'living', 'board', 'room', 'enchanted', 'alan', 'terrifying', 'world', 'trapped'], tags=['1'])

# 2.2. Model Training and Evaluation

In [75]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [76]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [77]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [78]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [79]:
# get id
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [80]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [81]:
movie_embeddings = movies_vectors[movie_id]

In [82]:
movie_embeddings

array([-0.07718373, -0.18148568,  0.22810401,  0.10211275, -0.04539826,
        0.07921068, -0.34359732,  0.0384021 , -0.2953704 ,  0.05182046,
       -0.3360866 , -0.01197015, -0.02433519, -0.22660376,  0.1927586 ,
       -0.13426839,  0.19194816, -0.15456964,  0.07777856, -0.12952073,
        0.15212601,  0.03392074,  0.0230126 ,  0.19480014,  0.13041466,
        0.29545382, -0.10453822, -0.08730748,  0.2807249 , -0.290508  ,
        0.03351963,  0.05963235, -0.05358792, -0.00446036, -0.30415437,
        0.32883778,  0.11766003, -0.07146677,  0.2174833 ,  0.23573527,
        0.13671444, -0.01057564, -0.03829711,  0.07598588, -0.00501741,
       -0.34062085,  0.08706772, -0.02256046, -0.08752392,  0.4326323 ],
      dtype=float32)

In [83]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,7772,0.959072
2,43165,0.958035
3,13835,0.957089
4,11256,0.949659


In [84]:
# reverse values and indices to map names in dataframe
name_mapper = {v: k for k, v in movies_inv_mapper.items()}

In [85]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,7772,0.959072,this island earth
2,43165,0.958035,the zookeeper's wife
3,13835,0.957089,k2
4,11256,0.949659,sun faa sau si
5,5713,0.949088,rollover
6,2175,0.94899,
7,29872,0.948594,angels die hard
8,27681,0.948251,l'umanoide
9,44339,0.947688,the underground world


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

## Making personal rekkos (building get_recommendations() function that will give personal rekkos for each user with filter on watched films)

In [86]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'overview'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')


    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc


In [87]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model

In [88]:
tags_doc = get_clean_tags_array(sample)


In [89]:
model = train_embeddings(tags_doc)

lets load interactions data

In [91]:
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions['movieId'] = interactions['movieId'].astype(str)
movies_metadata.rename(columns = {'id': 'movieId'}, inplace = True)
interactions_filtered = interactions.loc[interactions['movieId'].isin(sample['model_index'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (88299, 4)


In [92]:
# create users input
users = interactions_filtered[['userId']].drop_duplicates().reset_index(drop = True)

In [93]:
known_items = interactions_filtered.groupby('userId')['movieId'].apply(list).to_dict()

users['watched_movies'] = users['userId'].map(known_items)
users.head(5)

Unnamed: 0,userId,watched_movies
0,1,"[31, 1029, 1061, 1129, 1172, 1263, 1287, 1293,..."
1,2,"[10, 17, 39, 47, 50, 52, 62, 110, 144, 150, 15..."
2,3,"[60, 110, 247, 267, 296, 318, 355, 356, 377, 5..."
3,4,"[10, 34, 112, 141, 153, 173, 185, 260, 289, 29..."
4,5,"[3, 39, 104, 141, 150, 231, 277, 344, 356, 364..."


In [94]:
movie_vectors = model.dv.vectors

In [162]:
# Defining a function to get the embeddings of a list of movies
def get_movie_embeddings(movie_list):
    embeddings = [movie_vectors[int(movie_id)] for movie_id in movie_list]
    # Remove None values
    embeddings = [x for x in embeddings if x is not None]
    return embeddings


users['watched_movies_embeddings'] = users['watched_movies'].apply(get_movie_embeddings)


In [147]:
users.head(5)

Unnamed: 0,userId,watched_movies,watched_movies_embeddings,mean_embeddings,rekkos
0,1,"[31, 1029, 1061, 1129, 1172, 1263, 1287, 1293,...","[[-0.044266038, -0.1109356, 0.017569203, 0.026...","[-0.059348322, -0.14135118, 0.21070437, -0.054...","[The Unholy, La vie moderne, The Dentist, The ..."
1,2,"[10, 17, 39, 47, 50, 52, 62, 110, 144, 150, 15...","[[-0.1283832, -0.2998937, 0.25938413, -0.01427...","[-0.06124544, -0.09482918, 0.23931415, -0.0656...","[Save the Date, Herbie Goes Bananas, He's Just..."
2,3,"[60, 110, 247, 267, 296, 318, 355, 356, 377, 5...","[[-0.12485384, 0.00032843807, 0.12745522, -0.0...","[-0.07713403, -0.08571106, 0.22542897, -0.0427...","[He's Just Not That Into You, Szabadgyalog, Th..."
3,4,"[10, 34, 112, 141, 153, 173, 185, 260, 289, 29...","[[-0.1283832, -0.2998937, 0.25938413, -0.01427...","[-0.07082486, -0.105662175, 0.2270135, -0.0546...","[Eloise at Christmastime, Passed Away, Emperor..."
4,5,"[3, 39, 104, 141, 150, 231, 277, 344, 356, 364...","[[-0.027566599, -0.084369525, 0.31476548, -0.0...","[-0.06818351, -0.11244293, 0.2296152, -0.04997...","[Passed Away, Valami Amerika 2, Perfectly norm..."


In [165]:
## lets calculate mean embeddings for each user:

mean_embeddings = []
for embedding_list in users['watched_movies_embeddings']:
    embeddings = np.array(embedding_list)
    # Take the mean of the embeddings for movies that the user has watched
    user_embedding = np.mean(embeddings, axis=0)
    mean_embeddings.append(user_embedding)
    
# Add the mean_embeddings column to dataframe
users['mean_embeddings'] = mean_embeddings

[array([-0.05934832, -0.14135118,  0.21070437, -0.05467386, -0.07932346,
         0.11498956, -0.42841253, -0.00120192, -0.20487177,  0.02575624,
        -0.15949331, -0.02418745, -0.06264637, -0.2982904 ,  0.18258403,
        -0.06319727,  0.3136001 , -0.13809238,  0.00241211, -0.11222684,
         0.00062604,  0.10961665,  0.11611342,  0.09398754,  0.18695354,
         0.1618193 , -0.14968374, -0.09610676,  0.28605992, -0.17437947,
         0.0755507 ,  0.0675517 , -0.05029527,  0.13834767, -0.22809768,
         0.1239226 ,  0.01039495, -0.14472273,  0.17945644,  0.19830924,
         0.05647413,  0.10553993, -0.03804951,  0.13185215,  0.00495507,
        -0.34700793, -0.06672442, -0.25628147, -0.01818395,  0.3785563 ],
       dtype=float32)]

In [151]:
#mapping titles and ids for furher function
name_mapper = dict(zip(sample['model_index'], sample['original_title']))

## Function that creates recommendations for each user

In [155]:
def rekkos(data: pd.DataFrame, number_of_samples: int):
    similars = model.docvecs.most_similar(positive=data, topn = number_of_samples)
    return [name_mapper.get(movie_id[0], movie_id[0]) for movie_id in similars]

let's calculate maximum films that we should predict to recommend every user fresh films:

In [175]:
lst_max = []
for row in users.index:
    lst_max.append(len(users['watched_movies'][row]))

max_films = max(lst_max)+20
max_films


1888

In [183]:
def watched_films_filter(df: pd.DataFrame, number_of_samples: int):
    """
    calculates mean rating to define popular titles with taking to the account watched films
    """
    popular_titles = df['rekkos']
    personal_rekkos = {}
    for key in known_items.keys():
        popular_titles = df['rekkos'][key-1]
        list_of_unknown_films = [x for x in popular_titles if x not in known_items[key]]
        personal_rekkos[key] = list(list_of_unknown_films[:number_of_samples])
    return personal_rekkos


## FINAL FUNCTION 

In [184]:
def get_recommendatios(data: pd.DataFrame, 
                       number_of_samples: int,
                       item_column: str):
    
    '''function that returns dataframe with recommended films for each user based on their previous history of watches.
    :data : pd.DataFrame 
    :number_of_samples : number of films needed to be predicted,
    :item_column : column with watched films in df.
    '''

    # creating embeddings for each movie
    data['watched_movies_embeddings'] = data[item_column].apply(get_movie_embeddings)

    #counting mean embeddings for watched_films
    mean_embeddings = []
    for embedding_list in data['watched_movies_embeddings']:
        embeddings = np.array(embedding_list)
        # Take the mean of the embeddings for movies that the user has watched
        user_embedding = np.mean(embeddings, axis=0)
        mean_embeddings.append(user_embedding)
    
    # Adding the mean_embeddings column to dataframe
    data['mean_embeddings'] = mean_embeddings

    #receiving rekkos
    data['rekkos'] = data['mean_embeddings'].apply(rekkos, number_of_samples=max_films)

    #filtering watched films 
    personal_rekkos = watched_films_filter(data, number_of_samples=number_of_samples)
    data['rekkos'] = data['userId'].map(personal_rekkos)
    
    return data[['userId', 'rekkos']]

In [185]:
recommended_df = get_recommendatios(data= users,number_of_samples= 20, item_column='watched_movies')
recommended_df.head(5)

Unnamed: 0,userId,rekkos
0,1,"[The Unholy, La vie moderne, The Dentist, The ..."
1,2,"[Save the Date, Herbie Goes Bananas, He's Just..."
2,3,"[He's Just Not That Into You, Szabadgyalog, Th..."
3,4,"[Eloise at Christmastime, Passed Away, Emperor..."
4,5,"[Passed Away, Valami Amerika 2, Perfectly norm..."


## Adding more features into the model (this model works worse that first one, that is why I used first in recommendations)

In [27]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [28]:
new_sample = movies_metadata[['id', 'original_title', 'overview', 'genres', 'tagline']]

In [29]:
new_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
 3   genres          45466 non-null  object
 4   tagline         20412 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [30]:
new_sample.loc[new_sample['overview'].isnull(), 'overview'] = new_sample.loc[new_sample['overview'].isnull(), 'original_title']
new_sample['tagline'] = new_sample['tagline'].fillna(new_sample['original_title'])

In [31]:
new_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        45466 non-null  object
 3   genres          45466 non-null  object
 4   tagline         45466 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [32]:
new_sample = new_sample.reset_index().rename(columns = {'index': 'model_index'})
new_sample['model_index'] = new_sample['model_index'].astype(str)
new_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   model_index     45466 non-null  object
 1   id              45466 non-null  object
 2   original_title  45466 non-null  object
 3   overview        45466 non-null  object
 4   genres          45466 non-null  object
 5   tagline         45466 non-null  object
dtypes: object(6)
memory usage: 2.1+ MB


In [33]:
movies_inv_mapper = dict(zip(new_sample['original_title'].str.lower(), new_sample['model_index'].astype(int)))

In [34]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_cols = ['overview', 'original_title', 'tagline']):
    '''text preprocessing
    '''
    # concatenate the text from all columns of interest
    tags_corpus = agg_tags[text_cols].apply(lambda x: ' '.join(x), axis=1).values
    
    # preprocess the text
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc

In [35]:
tags_array = get_clean_tags_array(new_sample)

In [36]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model

In [37]:
model = train_embeddings(tags_array)

In [38]:
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [19]:
movie_vectors = model.dv.vectors

array([-0.0895271 ,  0.03324347,  0.26425728, -0.01878192,  0.02332089,
        0.0397878 , -0.5016296 ,  0.00375795, -0.04347954,  0.08193494,
       -0.00681862, -0.02294026,  0.03649666, -0.36086872,  0.23730916,
       -0.05900694,  0.18090206, -0.14913467,  0.01987115, -0.01842449,
       -0.07024898,  0.15909122,  0.11066394,  0.06038789,  0.30639258,
        0.09609788, -0.19722821, -0.21473187,  0.29956174, -0.12605098,
       -0.03314768,  0.07171424, -0.08256987,  0.15433526, -0.3212227 ,
        0.11035433,  0.09680413, -0.1320987 ,  0.16997814,  0.20738117,
       -0.04640359,  0.05173053, -0.05761185,  0.15216777, -0.01633531,
       -0.32691956, -0.0368297 , -0.22638334, -0.03992472,  0.2614981 ],
      dtype=float32)

In [40]:
movie_embeddings = movies_vectors[movie_id]

In [41]:
movie_embeddings

array([-0.01260352, -0.17648646,  0.31885964, -0.00316406, -0.00963075,
        0.05991915, -0.43649006, -0.03407767, -0.3402733 ,  0.11028342,
       -0.2848994 , -0.01092427, -0.03176834, -0.27425322,  0.10205461,
       -0.1492908 ,  0.26643065, -0.258164  ,  0.1517608 , -0.07475427,
        0.08462706,  0.14358222,  0.00954648,  0.12048364,  0.13801584,
        0.16123997, -0.05570374, -0.07482784,  0.2728512 , -0.2857733 ,
        0.12358125, -0.01469085, -0.00721742, -0.03931078, -0.23107149,
        0.1643544 ,  0.12541144, -0.11195131,  0.27103552,  0.2093133 ,
        0.11221762,  0.01719217,  0.0029151 ,  0.09990697, -0.02530034,
       -0.29180762,  0.18225749, -0.1321083 , -0.08278154,  0.4370748 ],
      dtype=float32)

In [42]:
similars = model.docvecs.most_similar(positive=[movie_embeddings], topn=20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,34282,0.915982
1,32667,0.912129
2,13779,0.910338
3,24762,0.908032
4,45022,0.907549


In [43]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output

Unnamed: 0,model_index,model_score,title_name
0,34282,0.915982,kshanbhar vishranti
1,32667,0.912129,some voices
2,13779,0.910338,imagine that
3,24762,0.908032,the returned
4,45022,0.907549,ماجرای نیمروز
5,22855,0.906911,dug's special mission
6,34199,0.906633,der rest ist schweigen
7,34896,0.906432,recep i̇vedik 2
8,19771,0.905578,stolen
9,43461,0.905525,megafault


As we can see, model with more attributes works worse, it doesn't have even batman by itself