In [60]:
import pandas as pd
import numpy as np
np.random.seed(2020)
from pprint import pprint

# used for pre-processing and modeling
import gensim
from gensim import models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

# used for natural language processing {NLTK: Natural Language Tool-Kit} 
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
import spacy

# Visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.spatial import distance

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [92]:
# Load dataset
plots = pd.read_csv("wiki_movie_plots_deduped.csv")
plots

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


## Model 1:

1. WordNetLemmatizer
2. TF-IDF
3. LDAMulticore  

In [3]:
# lemmatizer
lemmatizer = WordNetLemmatizer()

# spacy model
nlp = spacy.load("en")

In [4]:
# function to lemmatize a given word
def lemmatize(text):
    return lemmatizer.lemmatize(text, pos = "n")

# function to preprocess the text
def preprocess(text):
    result = []
    for word in gensim.utils.simple_preprocess(text):
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3:
            l = lemmatize(word)
            if len(l) > 3:
                result.append(l)
    return result

In [5]:
plots_processed = plots['Plot'].map(preprocess)
plots_processed

0        [bartender, working, saloon, serving, drink, c...
1        [moon, painted, smiling, face, hang, park, nig...
2        [film, minute, long, composed, shot, girl, sit...
3        [lasting, second, consisting, shot, shot, wood...
4        [earliest, known, adaptation, classic, fairyta...
5        [alice, follows, large, white, rabbit, rabbit,...
6        [film, open, bandit, breaking, railroad, teleg...
7        [film, family, suburb, hoping, quiet, life, th...
8        [opening, scene, show, interior, robber, wall,...
9        [scene, introduced, line, poem, santa, claus, ...
10       [rarebit, fiend, gorge, welsh, rarebit, restau...
11       [film, feature, train, traveling, rockies, hol...
12       [irish, villager, kathleen, tenant, captain, c...
13       [boone, daughter, befriends, indian, maiden, b...
14       [heading, baseball, game, nearby, ballpark, sp...
15       [plot, black, woman, going, dentist, toothache...
16       [beautiful, summer, father, mother, daughter, .

In [6]:
# Dictionary for LDA model
plots_dict = Dictionary(plots_processed)
len(plots_dict)

118500

In [7]:
# Filter dictonary
plots_dict.filter_extremes(no_below = 2000, no_above = 0.5) #948
len(plots_dict)

303

In [8]:
# function that provides frequency of words in the dictionary
def freq_list(dict):
    l = []
    for k, v in dict.iteritems():
        l.append(v)
    
    freq_dict = dict.dfs
    freq_df = pd.DataFrame.from_dict(freq_dict, orient = 'index', columns = ['Frequency']).reset_index()
    freq_df = freq_df.sort_values(by = 'index').reset_index()
    freq_df.drop(columns = ['level_0'], inplace = True)
    freq_df.insert(1, 'word', l)
    
    freq = freq_df.sort_values(by = 'Frequency')
    
    return freq

In [9]:
# Bag of words and TFIDF
bag_of_words = [plots_dict.doc2bow(plot) for plot in plots_processed]
tfidf = models.TfidfModel(bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

In [10]:
# LDA Multicore model using the tf-idf corpus
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=plots_dict, passes=10,
                                       workers = 7, random_state = 2020)

for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.049*"gang" + 0.021*"police" + 0.016*"money" + 0.015*"leader" + 0.013*"kill" + 0.013*"brother" + 0.013*"criminal" + 0.012*"member" + 0.010*"fight" + 0.009*"escape"

Topic: 1 
Words: 0.050*"school" + 0.036*"student" + 0.025*"john" + 0.018*"high" + 0.012*"father" + 0.012*"girl" + 0.012*"college" + 0.012*"mother" + 0.011*"year" + 0.011*"friend"

Topic: 2 
Words: 0.063*"village" + 0.021*"land" + 0.019*"power" + 0.016*"people" + 0.011*"story" + 0.010*"form" + 0.010*"father" + 0.010*"life" + 0.009*"save" + 0.009*"love"

Topic: 3 
Words: 0.013*"kill" + 0.010*"escape" + 0.010*"body" + 0.010*"group" + 0.010*"attack" + 0.008*"killed" + 0.008*"attempt" + 0.008*"house" + 0.008*"head" + 0.007*"water"

Topic: 4 
Words: 0.030*"army" + 0.020*"american" + 0.020*"town" + 0.016*"battle" + 0.012*"country" + 0.011*"force" + 0.010*"officer" + 0.010*"state" + 0.010*"order" + 0.010*"attack"

Topic: 5 
Words: 0.029*"story" + 0.029*"love" + 0.018*"college" + 0.017*"life" + 0.016*"girl" + 0.016

In [11]:
# Model measure of performance
cm = CoherenceModel(model = lda_tfidf, texts = plots_processed, dictionary = plots_dict)
cm.get_coherence()

0.39250017417614014

In [12]:
# Visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_tfidf, corpus_tfidf, plots_dict)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 2:

using spacy for lemmatizing

In [5]:
def lemmatize_nlp(doc, pos = ['NOUN', 'ADJ', 'ADV', 'VERB']):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-' and token.pos_ in pos]
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

def remove_stopwords(doc):
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

nlp.add_pipe(lemmatize_nlp, name='lemmatizer', after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [6]:
plots_processed_spacy = plots['Plot'].str.lower().map(nlp)

In [7]:
# Dictionary for LDA model
plots_dict_spacy = Dictionary(plots_processed_spacy)
len(plots_dict_spacy)

71514

In [8]:
# Filter dictonary
plots_dict_spacy.filter_extremes(no_below = 1500, no_above = 0.5) # 200
len(plots_dict_spacy)

479

In [9]:
# Bag of words and TFIDF
bow = [plots_dict_spacy.doc2bow(plot) for plot in plots_processed_spacy]
tfidf_mod = models.TfidfModel(bow)
corpus = tfidf_mod[bow]

In [18]:
# LDA Multicore model using the tf-idf corpus
lda_mod = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=plots_dict_spacy, passes=10, workers=7, 
                                       random_state = 2020)

for idx, topic in lda_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.026*"film" + 0.021*"school" + 0.020*"student" + 0.018*"story" + 0.014*"play" + 0.012*"life" + 0.012*"team" + 0.012*"movie" + 0.011*"character" + 0.010*"teacher"

Topic: 1 
Words: 0.010*"car" + 0.008*"house" + 0.008*"tell" + 0.008*"kill" + 0.007*"police" + 0.007*"room" + 0.007*"find" + 0.006*"apartment" + 0.006*"leave" + 0.006*"man"

Topic: 2 
Words: 0.012*"ship" + 0.010*"destroy" + 0.010*"kill" + 0.009*"human" + 0.009*"attack" + 0.009*"power" + 0.009*"use" + 0.008*"escape" + 0.007*"group" + 0.007*"world"

Topic: 3 
Words: 0.010*"company" + 0.010*"money" + 0.009*"job" + 0.009*"win" + 0.007*"work" + 0.007*"business" + 0.007*"play" + 0.007*"big" + 0.007*"star" + 0.006*"dance"

Topic: 4 
Words: 0.010*"relationship" + 0.008*"mother" + 0.008*"love" + 0.008*"letter" + 0.008*"friend" + 0.007*"write" + 0.007*"life" + 0.007*"tell" + 0.007*"book" + 0.007*"day"

Topic: 5 
Words: 0.019*"police" + 0.019*"murder" + 0.013*"kill" + 0.012*"case" + 0.010*"officer" + 0.010*"jail" + 0.01

In [19]:
cm_spacy = CoherenceModel(model = lda_mod, texts = plots_processed_spacy, dictionary = plots_dict_spacy)
cm_spacy.get_coherence()

0.43750702086131527

In [20]:
# Visualization
pyLDAvis.enable_notebook()
vis_spacy = pyLDAvis.gensim.prepare(lda_mod, corpus, plots_dict_spacy)
vis_spacy

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 3:
Using Bigrams

In [21]:
bigrams = gensim.models.Phrases(plots_processed, min_count=10, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigrams)

In [22]:
def create_bigrams(text):
    return bigram_mod[text]

In [23]:
bigrams_processed = plots_processed.map(create_bigrams)
bigrams_spacy = plots_processed_spacy.map(create_bigrams)

In [24]:
# Dictionary for LDA model
bigrams_dict = Dictionary(bigrams_processed)
print(len(bigrams_dict))

bigrams_dict_spacy = Dictionary(bigrams_spacy)
print(len(bigrams_dict_spacy))

122071
73531


In [25]:
# Filter dictonary
bigrams_dict.filter_extremes(no_below = 1700, no_above = 0.5)
print(len(bigrams_dict))

bigrams_dict_spacy.filter_extremes(no_below = 2000, no_above = 0.5)
print(len(bigrams_dict_spacy))

359
329


In [26]:
f = freq_list(bigrams_dict)
f_spacy = freq_list(bigrams_dict_spacy)

#### Bigram model with general lemmatizer

In [27]:
# Bag of words and TFIDF
bow_bigrams = [bigrams_dict.doc2bow(plot) for plot in bigrams_processed]
tfidf_bigrams = models.TfidfModel(bow_bigrams)
corpus_bigrams = tfidf_bigrams[bow_bigrams]

In [28]:
# LDA Multicore model using the tf-idf corpus
lda_bigram = gensim.models.LdaMulticore(corpus_bigrams, num_topics=10, id2word=bigrams_dict, passes=10, workers = 7, 
                                        random_state = 2020)

for idx, topic in lda_bigram.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.025*"army" + 0.021*"village" + 0.021*"soldier" + 0.016*"battle" + 0.014*"american" + 0.010*"attack" + 0.010*"force" + 0.009*"fight" + 0.008*"people" + 0.008*"kill"

Topic: 1 
Words: 0.040*"team" + 0.024*"game" + 0.021*"agent" + 0.021*"captain" + 0.020*"mission" + 0.009*"officer" + 0.008*"play" + 0.008*"member" + 0.007*"secret" + 0.007*"kill"

Topic: 2 
Words: 0.032*"murder" + 0.018*"case" + 0.016*"woman" + 0.016*"husband" + 0.015*"murdered" + 0.015*"wife" + 0.014*"doctor" + 0.014*"police" + 0.012*"young" + 0.012*"plot"

Topic: 3 
Words: 0.011*"tell" + 0.010*"house" + 0.009*"home" + 0.009*"mother" + 0.008*"night" + 0.008*"apartment" + 0.008*"room" + 0.008*"father" + 0.007*"friend" + 0.007*"child"

Topic: 4 
Words: 0.020*"love" + 0.018*"village" + 0.018*"marriage" + 0.016*"father" + 0.015*"family" + 0.015*"marry" + 0.015*"daughter" + 0.014*"married" + 0.014*"sister" + 0.013*"brother"

Topic: 5 
Words: 0.061*"film" + 0.022*"star" + 0.022*"movie" + 0.021*"london" + 0.018

In [29]:
cm_bigrams = CoherenceModel(model = lda_bigram, texts = bigrams_processed, dictionary = bigrams_dict)
cm_bigrams.get_coherence()

0.3960279064232234

In [30]:
# Visualization
pyLDAvis.enable_notebook()
vis_bigram = pyLDAvis.gensim.prepare(lda_bigram, corpus_bigrams, bigrams_dict)
vis_bigram

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### Bigram model with spacy lemmatizer

In [31]:
# Bag of words and TFIDF
bow_bigrams_spacy = [bigrams_dict_spacy.doc2bow(plot) for plot in bigrams_spacy]
tfidf_bigrams_spacy = models.TfidfModel(bow_bigrams_spacy)
corpus_bigrams_spacy = tfidf_bigrams_spacy[bow_bigrams_spacy]

In [32]:
# LDA Multicore model using the tf-idf corpus
lda_bigram_mod = gensim.models.LdaMulticore(corpus_bigrams_spacy, num_topics=10, id2word=bigrams_dict_spacy, passes=10, workers = 7, 
                                        random_state = 2020)

for idx, topic in lda_bigram_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.040*"student" + 0.035*"school" + 0.026*"college" + 0.024*"team" + 0.012*"girl" + 0.011*"friend" + 0.011*"win" + 0.010*"play" + 0.009*"love" + 0.009*"father"

Topic: 1 
Words: 0.018*"love" + 0.014*"wedding" + 0.013*"marry" + 0.012*"marriage" + 0.012*"friend" + 0.012*"parent" + 0.011*"family" + 0.011*"father" + 0.010*"meet" + 0.010*"life"

Topic: 2 
Words: 0.027*"village" + 0.020*"love" + 0.019*"marry" + 0.019*"son" + 0.017*"brother" + 0.017*"family" + 0.016*"father" + 0.016*"daughter" + 0.016*"sister" + 0.015*"marriage"

Topic: 3 
Words: 0.017*"money" + 0.008*"job" + 0.008*"pay" + 0.008*"win" + 0.008*"offer" + 0.008*"tell" + 0.007*"work" + 0.007*"sell" + 0.007*"steal" + 0.007*"buy"

Topic: 4 
Words: 0.017*"murder" + 0.015*"police" + 0.013*"kill" + 0.012*"body" + 0.011*"car" + 0.010*"case" + 0.009*"find" + 0.009*"house" + 0.009*"man" + 0.009*"dead"

Topic: 5 
Words: 0.036*"town" + 0.023*"boy" + 0.016*"small" + 0.014*"train" + 0.014*"local" + 0.014*"girl" + 0.014*"young

In [33]:
cm_bigrams_spacy = CoherenceModel(model = lda_bigram_mod, texts = bigrams_spacy, dictionary = bigrams_dict_spacy)
cm_bigrams_spacy.get_coherence()

0.4122192899496212

In [34]:
# Visualization
pyLDAvis.enable_notebook()
vis_bigram_spacy = pyLDAvis.gensim.prepare(lda_bigram_mod, corpus_bigrams_spacy, bigrams_dict_spacy)
vis_bigram_spacy

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Optimizing number of topics for best model (model #2)

In [49]:
def model_eval(corpus, dictionary, texts, limit=30, start=2, step=2):
    coherence = []
    models = []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=7, 
                                       random_state = 2020)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary)
        models.append(model)
        coherence.append(coherencemodel.get_coherence())

    return models, coherence

In [None]:
models_list, coherence_vals = model_eval(corpus=corpus, dictionary=plots_dict_spacy, 
                                         texts = plots_processed_spacy, limit=16, start=2, step=1)

In [None]:
limit=16
start=2
step=1

x = range(start, limit, step)

plt.plot(x, coherence_vals)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.legend(("C values"), loc='best')
plt.title("Optimizing no. of topics for model #2")
plt.show()

In [None]:
coherence_vals

In [26]:
# LDA Multicore model using the tf-idf corpus
final_mod = gensim.models.LdaMulticore(corpus, num_topics=8, id2word=plots_dict_spacy, passes=10, workers=7, 
                                       random_state = 2020)

for idx, topic in final_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.027*"film" + 0.020*"story" + 0.016*"school" + 0.016*"student" + 0.014*"play" + 0.013*"life" + 0.012*"movie" + 0.011*"character" + 0.010*"team" + 0.009*"role"

Topic: 1 
Words: 0.009*"car" + 0.008*"tell" + 0.008*"house" + 0.007*"room" + 0.007*"night" + 0.006*"leave" + 0.006*"find" + 0.006*"hotel" + 0.006*"apartment" + 0.006*"home"

Topic: 2 
Words: 0.010*"kill" + 0.009*"ship" + 0.009*"attack" + 0.008*"soldier" + 0.007*"battle" + 0.007*"escape" + 0.007*"destroy" + 0.007*"use" + 0.007*"war" + 0.007*"power"

Topic: 3 
Words: 0.028*"gang" + 0.014*"money" + 0.012*"town" + 0.010*"big" + 0.008*"win" + 0.008*"boss" + 0.007*"star" + 0.007*"play" + 0.006*"local" + 0.006*"job"

Topic: 4 
Words: 0.010*"love" + 0.009*"family" + 0.009*"mother" + 0.009*"father" + 0.009*"child" + 0.008*"marry" + 0.008*"friend" + 0.007*"life" + 0.007*"parent" + 0.007*"relationship"

Topic: 5 
Words: 0.020*"village" + 0.016*"love" + 0.014*"marry" + 0.014*"son" + 0.013*"brother" + 0.013*"marriage" + 0.0

In [27]:
cm_final = CoherenceModel(model = final_mod, texts = plots_processed_spacy, dictionary = plots_dict_spacy)
cm_final.get_coherence()

0.44155065607560084

In [28]:
# Visualization
pyLDAvis.enable_notebook()
visual = pyLDAvis.gensim.prepare(final_mod, corpus, plots_dict_spacy)
visual

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [32]:
def assign_topic_dist(text, model = final_mod, dictionary = plots_dict_spacy):
    bag_of_words = dictionary.doc2bow(nlp(text))
    
    prob_list = []
    indlist = []
    
    for i, score in model[bag_of_words]:
        indlist.append(i)
        prob_list.append(round(score,4))
    
    n_topics = model.print_topics(-1)[-1][0] + 1
    diff = list(set(list(range(0,n_topics))).difference(indlist))

    while len(prob_list) < 8:
        for pos in diff:
            prob_list.insert(pos, 0.0)
        
    return prob_list



def show_topic_dist(text, model = final_mod, dictionary = plots_dict_spacy):
    bag_of_words = dictionary.doc2bow(nlp(text))
    
    for i, score in sorted(model[bag_of_words], key = lambda x: -1*x[1]):
        print("\nScore: {}\nTopic {}: {}".format(round(score,4), i, model.print_topic(i,10)))

In [33]:
t = plots['Plot'][15629]

In [34]:
print(assign_topic_dist(t))
show_topic_dist(t)

[0.0, 0.3418, 0.419, 0.0, 0.0, 0.0, 0.0, 0.2353]

Score: 0.4180999994277954
Topic 2: 0.010*"kill" + 0.009*"ship" + 0.009*"attack" + 0.008*"soldier" + 0.007*"battle" + 0.007*"escape" + 0.007*"destroy" + 0.007*"use" + 0.007*"war" + 0.007*"power"

Score: 0.3411000072956085
Topic 1: 0.009*"car" + 0.008*"tell" + 0.008*"house" + 0.007*"room" + 0.007*"night" + 0.006*"leave" + 0.006*"find" + 0.006*"hotel" + 0.006*"apartment" + 0.006*"home"

Score: 0.23690000176429749
Topic 7: 0.019*"murder" + 0.018*"police" + 0.013*"kill" + 0.011*"killer" + 0.010*"crime" + 0.010*"case" + 0.009*"prison" + 0.009*"officer" + 0.008*"criminal" + 0.008*"arrest"


In [35]:
plots.iloc[15629]

Release Year                                                     2009
Title                                                  Fast & Furious
Origin/Ethnicity                                             American
Director                                                   Justin Lin
Cast                Vin Diesel, Paul Walker, Michelle Rodriguez, J...
Genre                                                          action
Wiki Page           https://en.wikipedia.org/wiki/Fast_%26_Furious...
Plot                Five years after the first film, Dominic Toret...
Name: 15629, dtype: object

In [36]:
# plots[(plots['Release Year']>=1990) & (plots['Release Year']<=2018) & (plots['Origin/Ethnicity']=='American') & 
#       (plots['Genre']=='mystery')][0:50]

#plots[plots['Title'].str.contains('Avenger')]

In [37]:
# drama/bio: 12361, 15935, 15410, 15776, 12307, 12191
# comedy: 12353, 14591, 14672, 14734
# action: 14792, 15127, 15629, 15158, 14852, 12280
# thriller: 14872, 15107, 12552, 13464
# family: 15663, 15948, 12232
# fantasy: 14667, 15405, 15225, 14303
# adventure: 14696, 15737
# crime: 15178, 13917, 15035, 16238, 8795
# Sci-fi: 15855, 9428, 10298, 10322, 16301, 13509
# mystery: 16112, 15582

In [38]:
# t0: drama, 
# t1: bio-drama, comedy, action, family, fantasy, adventure, crime, sci-fi, mystery
# t2: drama, comedy, action, family, fantasy, sci-fi, 
# t3: drama, action
# t4:

### Recommendation system

In [39]:
prob_vectors = plots['Plot'].map(assign_topic_dist)

In [87]:
prob_vectors[15629]

[0.0, 0.3407, 0.4187, 0.0, 0.0, 0.0, 0.0, 0.2367]

In [88]:
def find_similar(movie, vectors):
    similarity = []

    for i, v in enumerate(vectors):
        s = distance.jensenshannon(assign_topic_dist(movie), v)

        if len(similarity) < 5:
            similarity.append([i,s])

        else:
            case = []
            new = [i,s]

            for ind, j in enumerate(similarity):
                case.append(j[1])

            if new[1] < max(case):
                similarity[case.index(max(case))] = new
    
    similarity = sorted(similarity, key = lambda x: -1*x[1])
    return similarity

In [89]:
x = find_similar(t, prob_vectors)
x

[[11021, 0.030193520778288133],
 [15281, 0.029971609453100387],
 [9353, 0.02088843903844027],
 [7040, 0.019877648197289793],
 [15629, 0.0006034568891839021]]

In [93]:
for i in x:
    print(plots['Title'][i[0]])

The Naked Gun: From the Files of Police Squad!
The Mother of Tears
The Domino Principle
I Bury the Living
Fast & Furious
