In [539]:
import pandas as pd
import numpy as np
import random
import re
from pprint import pprint
import sys
import pickle

# used for pre-processing and modeling
import gensim
from gensim import models
from gensim.utils import simple_preprocess
from gensim.utils import tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
from gensim.models.phrases import SentenceAnalyzer

# used for natural language processing {NLTK: Natural Language Tool-Kit} 
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
import nltk
from nltk.tag import pos_tag
nltk.download('wordnet')
import spacy

# Visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.spatial import distance

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharthsuresh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
path_to_mallet_binary = "~/mallet-2.0.8/bin/mallet"

In [378]:
# Load dataset
plots_full = pd.read_csv("wiki_movie_plots_deduped.csv")
plots_full['Plot'] = plots_full['Plot'].apply(lambda x: x.strip())
plots_full['Title'] = plots_full['Title'].apply(lambda x: x.strip())
plots_full.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....


In [379]:
plots = plots_full[plots_full["Origin/Ethnicity"].isin(['American', 'British', 'Australian', 'Canadian', 'Bollywood'])].reset_index(drop=True)
# plots = plots[plots['Release Year'] >= 1980].reset_index(drop=True)
plots.loc[:,'Title'] = plots.loc[:,'Title'].apply(lambda x: re.sub(r'.*TheThe', 'The', x))
# plots = plots.drop_duplicates(subset=['Title', 'Release Year'], keep='last').reset_index()
plots = plots.drop_duplicates(subset=['Title', 'Plot'], keep='last').reset_index(drop=True)
plots.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....


In [380]:
plots['Decade'] = ["1900s" if x in range(1900,1910) else "1910s" if x in range(1910, 1920) else \
                   "1920s" if x in range(1920,1930) else "1930s" if x in range(1930,1940) else "1940s" if x in range(1940, 1950) else \
                   "1950s" if x in range(1950,1960) else "1960s" if x in range(1960,1970) else "1970s" if x in range(1970, 1980) else \
                   "1980s" if x in range(1980,1990) else "1990s" if x in range(1990,2000) else "2000s" if x in range(2000, 2010) else "2010s" \
                   for x in plots['Release Year']]
plots = plots.sample(frac=1).reset_index(drop=True)
plots

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Decade
0,2007,"Romulus, My Father",Australian,Richard Roxburgh,"Eric Bana, Kodi Smit-McPhee",drama,"https://en.wikipedia.org/wiki/Romulus,_My_Fath...","The film tells the story of Romulus Gaiţă, a R...",2000s
1,1945,Quiet Please!,American,Hanna-Barbera,Tom and Jerry,animated,https://en.wikipedia.org/wiki/Quiet_Please!,"Tom's nemesis, Spike, is trying to take a nap,...",1940s
2,1933,Bombshell,American,Victor Fleming,"Jean Harlow, Lee Tracy, Franchot Tone","comedy, drama",https://en.wikipedia.org/wiki/Bombshell_(film),Movie star Lola Burns (Jean Harlow) is angry w...,1930s
3,1951,Duck and Cover,American,Anthony Rizzo,,propaganda,https://en.wikipedia.org/wiki/Duck_and_Cover_(...,"The film starts with an animated sequence, sho...",1950s
4,1938,Man-Proof,American,Karl Freund,"Myrna Loy, Rosalind Russell, Walter Pidgeon","comedy, drama",https://en.wikipedia.org/wiki/Man-Proof,The daughter of wealthy and famous novelist Me...,1930s
...,...,...,...,...,...,...,...,...,...
24908,1958,Onionhead,American,Norman Taurog,"Andy Griffith, Walter Matthau, Joey Bishop, Fe...","drama, comedy",https://en.wikipedia.org/wiki/Onionhead,"In the spring of 1941, Al Woods quits an Oklah...",1950s
24909,1979,Promises in the Dark,American,Jerome Hellman,"Marsha Mason, Ned Beatty",drama,https://en.wikipedia.org/wiki/Promises_in_the_...,"Numbed by career demands and a recent divorce,...",1970s
24910,1973,Hitler: The Last Ten Days,British,Ennio De Concini,"Alec Guinness, Simon Ward",historical,https://en.wikipedia.org/wiki/Hitler:_The_Last...,"The film opens with Hitler's 56th birthday, on...",1970s
24911,1998,A Bug's Life,American,John Lasseter,"Voices of Dave Foley, Kevin Spacey, Julia Loui...",comedy,https://en.wikipedia.org/wiki/A_Bug%27s_Life,Ant Island is a colony of ants led by the Quee...,1990s


## Model 1:

1. WordNetLemmatizer
2. TF-IDF
3. LDAMulticore  

In [381]:
# lemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

In [382]:
# function to lemmatize a given word
def lemmatize(text):
    lemmatized_text_noun = lemmatizer.lemmatize(text, pos = "n")
    lemmatized_text = lemmatizer.lemmatize(lemmatized_text_noun, pos = "v")
    return lemmatized_text

# function to preprocess the text
def preprocess(text):
    result = []
    for word in list(tokenize(text, deacc=True)):
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3 and pos_tag(word.split())[0][1] != 'NNP':
            l = lemmatize(word)
            if len(l) > 3:
                result.append(l)
    result_list = [word.lower() for word in result]
    return result_list

In [383]:
%%time
plots_processed = plots['Plot'].map(preprocess)
plots_processed

CPU times: user 9min 58s, sys: 1min 15s, total: 11min 14s
Wall time: 11min 15s


0        [film, tell, story, romulus, romanian, immigra...
1        [nemesis, spike, awake, chase, jerry, mouse, a...
2        [movie, star, burns, jean, harlow, angry, stud...
3        [film, start, animate, sequence, show, anthrop...
4        [daughter, wealthy, famous, novelist, swift, m...
                               ...                        
24908    [spring, woods, quit, oklahoma, college, join,...
24909    [numbed, career, demand, recent, divorce, kend...
24910       [film, open, hitler, birthday, later, suicide]
24911    [island, colony, queen, daughter, princess, at...
24912    [scotland, world, phyllis, young, scottish, ho...
Name: Plot, Length: 24913, dtype: object

In [384]:
# Dictionary for LDA model
plots_dict = Dictionary(plots_processed)
len(plots_dict)

82106

In [385]:
# Filter dictonary
plots_dict.filter_extremes(no_below = 300, no_above = 0.5) #948
len(plots_dict)

1805

In [171]:
# function that provides frequency of words in the dictionary
def freq_list(dict):
    l = []
    for k, v in dict.iteritems():
        l.append(v)
    
    freq_dict = dict.dfs
    freq_df = pd.DataFrame.from_dict(freq_dict, orient = 'index', columns = ['Frequency']).reset_index()
    freq_df = freq_df.sort_values(by = 'index').reset_index()
    freq_df.drop(columns = ['level_0'], inplace = True)
    freq_df.insert(1, 'word', l)
    
    freq = freq_df.sort_values(by = 'Frequency', ascending = False)
    
    return freq

In [386]:
df_freq = freq_list(plots_dict)
df_freq

Unnamed: 0,index,word,Frequency
146,146,take,10525
161,161,when,9973
16,16,tell,9880
94,94,leave,9365
198,198,return,9314
...,...,...,...
835,835,withdraw,301
1701,1701,christopher,301
137,137,somewhat,300
1304,1304,emotionally,300


In [387]:
# Bag of words and TFIDF
bag_of_words = [plots_dict.doc2bow(plot) for plot in plots_processed]
tfidf = models.TfidfModel(bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

In [391]:
%%time
# LDA Multicore model using the tf-idf corpus
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=plots_dict, workers = 5,
                                       random_state = 2020)

for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.006*"jerry" + 0.004*"kill" + 0.004*"island" + 0.003*"family" + 0.003*"ship" + 0.003*"house" + 0.003*"wife" + 0.003*"love" + 0.003*"father" + 0.003*"police"

Topic: 1 
Words: 0.007*"nick" + 0.004*"love" + 0.003*"father" + 0.003*"jane" + 0.003*"family" + 0.003*"mother" + 0.003*"life" + 0.003*"film" + 0.003*"wife" + 0.003*"kill"

Topic: 2 
Words: 0.004*"film" + 0.003*"family" + 0.003*"tommy" + 0.003*"johnny" + 0.003*"woman" + 0.003*"police" + 0.003*"love" + 0.003*"father" + 0.003*"kill" + 0.003*"child"

Topic: 3 
Words: 0.003*"family" + 0.003*"grace" + 0.003*"kill" + 0.003*"wife" + 0.003*"marry" + 0.003*"love" + 0.003*"tell" + 0.003*"friend" + 0.003*"mother" + 0.002*"father"

Topic: 4 
Words: 0.004*"marry" + 0.004*"jeff" + 0.004*"love" + 0.003*"father" + 0.003*"family" + 0.003*"tell" + 0.003*"kill" + 0.003*"murder" + 0.003*"life" + 0.003*"money"

Topic: 5 
Words: 0.003*"marry" + 0.003*"love" + 0.003*"wife" + 0.003*"kill" + 0.003*"daughter" + 0.003*"life" + 0.003*"father

In [392]:
# %%time
# # LDA Mallet implementation
# lda_mallet = LdaMallet(path_to_mallet_binary, corpus=corpus_tfidf, num_topics=10, id2word=plots_dict)

# for idx, topic in lda_mallet.print_topics(-1):
#     print('Topic: {} \nWords: {}\n'.format(idx, topic))

In [393]:
# Model measure of performance
cm = CoherenceModel(model = lda_tfidf, texts = plots_processed, dictionary = plots_dict)
cm.get_coherence()

0.3218820233349272

In [99]:
# Visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_tfidf, corpus_tfidf, plots_dict)
vis

In [17]:
def model_eval(corpus, dictionary, texts, limit=30, start=2, step=2):
    coherence = []
    models = []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, workers=5, 
                                       random_state = 2020)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary)
        models.append(model)
        coherence.append(coherencemodel.get_coherence())

    return models, coherence

In [18]:
# models_eval, coherence_eval = model_eval(corpus_tfidf, plots_dict, plots_processed, 25, 8, 1)

In [19]:
# limit=25
# start=8
# step=1

# x = range(start, limit, step)

# plt.plot(x, coherence_eval)
# plt.xlabel("Number of Topics")
# plt.ylabel("Coherence score")
# plt.legend(("C values"), loc='best')
# plt.title("Optimizing no. of topics")
# plt.show()

### Model 2:

using spacy for lemmatizing

In [394]:
# spacy model
nlp = spacy.load("en", disable=["parser", 'ner'])

def lemmatize_nlp(doc, pos = ['NOUN', 'ADJ', 'ADV', 'VERB']):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-' and token.pos_ in pos]
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

def remove_stopwords(doc):
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

nlp.add_pipe(lemmatize_nlp, name='lemmatizer', after='tagger')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [395]:
%%time
plots_processed_spacy = plots['Plot'].str.lower().map(nlp)

CPU times: user 4min 41s, sys: 11.3 s, total: 4min 52s
Wall time: 4min 53s


In [400]:
# Dictionary for LDA model
plots_dict_spacy = Dictionary(plots_processed_spacy)
len(plots_dict_spacy)

58709

In [402]:
# Filter dictonary
plots_dict_spacy.filter_extremes(no_below = 300, no_above = 0.5) # 200
len(plots_dict_spacy)

1747

In [403]:
df_freq = freq_list(plots_dict_spacy)
df_freq

Unnamed: 0,index,word,Frequency
91,91,leave,11470
317,317,man,10069
18,18,tell,9888
150,150,try,9514
193,193,return,9409
...,...,...,...
1733,1733,eager,301
1264,1264,emotionally,300
135,135,somewhat,300
1412,1412,crazy,300


In [404]:
# Bag of words and TFIDF
bow = [plots_dict_spacy.doc2bow(plot) for plot in plots_processed_spacy]
tfidf_mod = models.TfidfModel(bow)
corpus = tfidf_mod[bow]

In [407]:
%%time
# LDA Multicore model using the tf-idf corpus
lda_mod = gensim.models.LdaMulticore(corpus, num_topics=20, id2word=plots_dict_spacy, workers=5, 
                                       random_state = 2020)

for idx, topic in lda_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.004*"man" + 0.003*"town" + 0.003*"kill" + 0.003*"tell" + 0.003*"life" + 0.003*"love" + 0.003*"film" + 0.002*"woman" + 0.002*"mother" + 0.002*"home"

Topic: 1 
Words: 0.003*"race" + 0.003*"man" + 0.003*"kill" + 0.003*"horse" + 0.003*"son" + 0.003*"father" + 0.003*"film" + 0.003*"love" + 0.003*"family" + 0.003*"town"

Topic: 2 
Words: 0.003*"love" + 0.003*"tell" + 0.003*"father" + 0.003*"man" + 0.003*"woman" + 0.003*"marry" + 0.003*"leave" + 0.003*"friend" + 0.003*"police" + 0.003*"car"

Topic: 3 
Words: 0.003*"kill" + 0.003*"woman" + 0.003*"man" + 0.003*"love" + 0.003*"father" + 0.003*"tell" + 0.003*"diamond" + 0.003*"brother" + 0.003*"family" + 0.003*"house"

Topic: 4 
Words: 0.004*"kill" + 0.003*"island" + 0.003*"father" + 0.003*"man" + 0.003*"tell" + 0.003*"ship" + 0.003*"game" + 0.003*"use" + 0.002*"escape" + 0.002*"team"

Topic: 5 
Words: 0.004*"kill" + 0.003*"murder" + 0.003*"man" + 0.003*"police" + 0.003*"film" + 0.003*"father" + 0.003*"mother" + 0.003*"wife" +

In [408]:
cm_spacy = CoherenceModel(model = lda_mod, texts = plots_processed_spacy, dictionary = plots_dict_spacy)
cm_spacy.get_coherence()

0.33073503378313224

In [402]:
# Visualization
pyLDAvis.enable_notebook()
vis_spacy = pyLDAvis.gensim.prepare(lda_mod, corpus, plots_dict_spacy)
vis_spacy

### Model 3:
Using Bigrams

In [483]:
bigrams = gensim.models.Phrases(plots_processed_spacy, min_count=10, threshold=1)
bigram_mod = gensim.models.phrases.Phraser(bigrams)

In [477]:
def create_bigrams(text):
    return bigram_mod[text]

In [486]:
#bigrams_processed = plots_processed.map(create_bigrams)
bigrams_spacy = plots_processed_spacy.map(create_bigrams)

In [487]:
# Dictionary for LDA model
# bigrams_dict = Dictionary(bigrams_processed)
# print(len(bigrams_dict))

bigrams_dict_spacy = Dictionary(bigrams_spacy)
print(len(bigrams_dict_spacy))

73536


In [489]:
# Filter dictonary
# bigrams_dict.filter_extremes(no_below = 600, no_above = 0.5)
# print(len(bigrams_dict))

bigrams_dict_spacy.filter_extremes(no_below = 300, no_above = 0.5)
print(len(bigrams_dict_spacy))

1552


In [494]:
# f = freq_list(bigrams_dict)
f_spacy = freq_list(bigrams_dict_spacy)
f_spacy

Unnamed: 0,index,word,Frequency
151,151,find,10136
78,78,leave,9330
171,171,tell,7922
355,355,man,7453
292,292,kill,6906
...,...,...,...
992,992,pet,301
1479,1479,addition,301
493,493,territory,300
524,524,tank,300


#### Bigram model with general lemmatizer

In [354]:
# # Bag of words and TFIDF
# bow_bigrams = [bigrams_dict.doc2bow(plot) for plot in bigrams_processed]
# tfidf_bigrams = models.TfidfModel(bow_bigrams)
# corpus_bigrams = tfidf_bigrams[bow_bigrams]

In [355]:
# # LDA Multicore model using the tf-idf corpus
# lda_bigram = gensim.models.LdaMulticore(corpus_bigrams, num_topics=10, id2word=bigrams_dict, workers = 5, 
#                                         random_state = 2020)

# for idx, topic in lda_bigram.print_topics(-1):
#     print('Topic: {} \nWords: {}\n'.format(idx, topic))

In [356]:
# cm_bigrams = CoherenceModel(model = lda_bigram, texts = bigrams_processed, dictionary = bigrams_dict)
# cm_bigrams.get_coherence()

In [357]:
# # Visualization
# pyLDAvis.enable_notebook()
# vis_bigram = pyLDAvis.gensim.prepare(lda_bigram, corpus_bigrams, bigrams_dict)
# vis_bigram

#### Bigram model with spacy lemmatizer

In [495]:
# Bag of words and TFIDF
bow_bigrams_spacy = [bigrams_dict_spacy.doc2bow(plot) for plot in bigrams_spacy]
tfidf_bigrams_spacy = models.TfidfModel(bow_bigrams_spacy)
corpus_bigrams_spacy = tfidf_bigrams_spacy[bow_bigrams_spacy]

In [496]:
# LDA Multicore model using the tf-idf corpus
lda_bigram_mod = gensim.models.LdaMulticore(corpus_bigrams_spacy, num_topics=12, id2word=bigrams_dict_spacy, workers = 5, 
                                        random_state = 2020)

for idx, topic in lda_bigram_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.004*"kill" + 0.003*"family" + 0.003*"man" + 0.003*"find" + 0.003*"mother" + 0.003*"life" + 0.003*"son" + 0.003*"child" + 0.003*"father" + 0.003*"earth"

Topic: 1 
Words: 0.003*"man" + 0.003*"play" + 0.003*"find" + 0.003*"film" + 0.003*"work" + 0.003*"boy" + 0.003*"leave" + 0.003*"mother" + 0.003*"life" + 0.003*"love"

Topic: 2 
Words: 0.003*"kill" + 0.003*"money" + 0.003*"gangster" + 0.003*"family" + 0.003*"find" + 0.003*"father" + 0.003*"work" + 0.003*"tell" + 0.003*"man" + 0.003*"meet"

Topic: 3 
Words: 0.004*"kill" + 0.003*"find" + 0.003*"man" + 0.003*"house" + 0.003*"father" + 0.003*"family" + 0.003*"love" + 0.003*"leave" + 0.003*"marry" + 0.003*"friend"

Topic: 4 
Words: 0.003*"kill" + 0.003*"man" + 0.003*"tell" + 0.003*"find" + 0.003*"wife" + 0.003*"leave" + 0.003*"gold" + 0.003*"woman" + 0.002*"life" + 0.002*"friend"

Topic: 5 
Words: 0.004*"father" + 0.004*"family" + 0.003*"mother" + 0.003*"tell" + 0.003*"find" + 0.003*"girl" + 0.003*"love" + 0.003*"man" + 0.

In [497]:
cm_bigrams_spacy = CoherenceModel(model = lda_bigram_mod, texts = bigrams_spacy, dictionary = bigrams_dict_spacy)
cm_bigrams_spacy.get_coherence()

0.30677093719219334

In [34]:
# Visualization
pyLDAvis.enable_notebook()
vis_bigram_spacy = pyLDAvis.gensim.prepare(lda_bigram_mod, corpus_bigrams_spacy, bigrams_dict_spacy)
vis_bigram_spacy

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Final Model

In [512]:
# LDA Multicore model using the tf-idf corpus
final_mod = gensim.models.LdaMulticore(corpus, num_topics=12, id2word=plots_dict_spacy, passes=5, workers=5, 
                                       random_state = 2020)

for idx, topic in final_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.013*"village" + 0.006*"girl" + 0.006*"boy" + 0.006*"family" + 0.006*"villager" + 0.005*"father" + 0.005*"love" + 0.005*"young" + 0.005*"story" + 0.005*"child"

Topic: 1 
Words: 0.006*"war" + 0.006*"german" + 0.006*"soldier" + 0.005*"british" + 0.005*"ship" + 0.005*"pilot" + 0.005*"officer" + 0.005*"army" + 0.004*"aircraft" + 0.004*"mission"

Topic: 2 
Words: 0.005*"love" + 0.005*"tell" + 0.004*"marry" + 0.004*"mother" + 0.004*"father" + 0.004*"home" + 0.004*"leave" + 0.003*"wedding" + 0.003*"friend" + 0.003*"ask"

Topic: 3 
Words: 0.006*"mother" + 0.006*"family" + 0.006*"love" + 0.005*"father" + 0.005*"marry" + 0.005*"life" + 0.005*"child" + 0.005*"son" + 0.004*"daughter" + 0.004*"husband"

Topic: 4 
Words: 0.004*"house" + 0.004*"dog" + 0.003*"cat" + 0.003*"kill" + 0.003*"island" + 0.003*"run" + 0.003*"car" + 0.003*"chase" + 0.003*"tell" + 0.003*"boat"

Topic: 5 
Words: 0.006*"kill" + 0.005*"earth" + 0.005*"human" + 0.005*"alien" + 0.004*"scientist" + 0.004*"creature

In [513]:
cm_final = CoherenceModel(model = final_mod, texts = plots_processed_spacy, dictionary = plots_dict_spacy)
cm_final.get_coherence()

0.4476811335584956

In [151]:
# Visualization
pyLDAvis.enable_notebook()
visual = pyLDAvis.gensim.prepare(final_mod, corpus, plots_dict_spacy)
visual

In [501]:

def show_topic_dist(text, model = final_mod, dictionary = plots_dict_spacy):
    bag_of_words = dictionary.doc2bow(nlp(text))
    
    for i, score in sorted(model[bag_of_words], key = lambda x: -1*x[1]):
        print("\nScore: {}\nTopic {}: {}".format(round(score,4), i, model.print_topic(i,10)))

In [416]:
# plots[(plots['Release Year']>=1990) & (plots['Release Year']<=2018) & (plots['Origin/Ethnicity']=='American') & 
#       (plots['Genre']=='mystery')][0:50]

#plots[plots['Title'].str.contains('Avenger')]

### Recommendation system

In [516]:
%%time
prob_vectors = plots['Plot'].map(assign_topic_dist)

CPU times: user 5min 12s, sys: 11.6 s, total: 5min 23s
Wall time: 5min 23s


In [574]:
def assign_topic_dist(text, model = final_mod, dictionary = plots_dict_spacy):
    bag_of_words = dictionary.doc2bow(nlp(text))

    prob_list = []
    indlist = []
    
    for i, score in model[bag_of_words]:
        indlist.append(i)
        prob_list.append(round(score,1))
    
    n_topics = model.print_topics(-1)[-1][0] + 1
    diff = list(set(list(range(0,n_topics))).difference(indlist))

    while len(prob_list) < 12:
        for pos in diff:
            prob_list.insert(pos, 0.0)
        
    return prob_list


def find_similar(movie, year, num, vectors = prob_vectors):
    similarity = []
    
    movie_index = plots[(plots.Title == movie) & (plots['Release Year'] == year)].index[0]
    movie_plot = plots[(plots.Title == movie) & (plots['Release Year'] == year)].Plot.values[0]
    
    assigned_movie_topic_dist = assign_topic_dist(movie_plot)

    for i, v in enumerate(vectors):
        dist = distance.jensenshannon(assigned_movie_topic_dist, v)
        similarity.append([i,dist])
    
    similarity = [x for x in similarity if x[0] != movie_index]
    similarity = sorted(similarity, key = lambda x: x[1])[:num]
    return similarity


def recommend_movies():
    
    num = int(input("Enter the number of movies to be recommended (upto 100): ")) # user inputs the number of recommendations
    
    movie_input = input("Enter the full name of the movie (initials capitalized): ") # user inputs the movie
    
    if movie_input in list(plots.Title):
        
        movie_options = plots[plots.Title == movie_input].iloc[:,:8].reset_index(drop = True) 
        display(movie_options) # offer all movies with that name entered by user
        year_input = int(input("Confirm your movie (enter number on the left): ")) # ask user to select the right movie
        
        # handles erroneous values
        if year_input > len(movie_options)-1 or year_input < 0:
            print("Wrong input, try again!")
            year_input = int(input("Confirm your movie (enter number on the left): "))
            
            if year_input > len(movie_options)-1 or year_input < 0:
                print("Sorry! Please start again")
                return
            else:
                year = movie_options.iloc[year_input,:]['Release Year']
            
        else:
            year = movie_options.iloc[year_input,:]['Release Year'] # gets the year of the movie to identify the exact match

        x = find_similar(movie_input, year, num) # finds similar movies

        print("\n=== Top " + str(num) + " movies similar to " + "\"" + movie_input + "\"" + " ===\n")
        
        similar_movies_dict = {"Movie": [], "Genre": [], "Year": [], "Decade": []} # dictionary to save similar movies
        
        # append values to the dictionary
        for v in x:
            similar_movies_dict['Movie'].append(plots['Title'][v[0]])
            similar_movies_dict['Genre'].append(plots['Genre'][v[0]])
            similar_movies_dict['Year'].append(plots['Release Year'][v[0]])
            similar_movies_dict['Decade'].append(plots['Decade'][v[0]])
        
        # convert dictionary to dataframe for display
        similar_movies_df = pd.DataFrame(similar_movies_dict)
        display(similar_movies_df)
        
        # ask for recommendations across different decades
        decade_rec = input("Would you like to get recommendations across different decades? (Yes/No): ")
        
        if decade_rec == "Yes" or decade_rec == "yes" or decade_rec == "y" or decade_rec == "Y":
            similar_movies_df = similar_movies_df.groupby("Decade").first().reset_index() # returns most similar movies across different decades
            return similar_movies_df
        elif decade_rec == "No" or decade_rec == "no" or decade_rec == "n" or decade_rec == "N":
            return "ENJOY!"
        else:
            print("Invalid ")
    
    else: # handles errors
        print("Sorry! Check the spelling and/or completeness of the movie.\n")
        print("Note: Some movies aren't available in the database\n")
        
        exit_option = input("Do you wish to exit? ")
        
        if exit_option == "Yes" or exit_option == "yes" or exit_option == "y" or exit_option == "Y":
            return
            
        elif exit_option == "No" or exit_option == "no" or exit_option == "n" or exit_option == "N":
            return recommend_movies(num = 10)
        
        else:
            print("Thank you for using the movie recommender! Try another movie")
            return
    
    return similar_movies_df

In [577]:
recommend_movies()

Enter the number of movies to be recommended (upto 100):  9
Enter the full name of the movie (initials capitalized):  King Kong


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1976,King Kong,American,John Guillermin,"Jeff Bridges, Jessica Lange, Charles Grodin",adventure,https://en.wikipedia.org/wiki/King_Kong_(1976_...,"In 1976, Fred Wilson, an executive of the Petr..."
1,2005,King Kong,American,Peter Jackson,"Naomi Watts, Jack Black, Adrien Brody, Kyle Ch...",adventure,https://en.wikipedia.org/wiki/King_Kong_(2005_...,"In 1933, during the Great Depression, New York..."
2,1933,King Kong,American,"Merian C. Cooper, Ernest B. Schoedsack","Fay Wray, Robert Armstrong, Bruce Cabot",adventure horror,https://en.wikipedia.org/wiki/King_Kong_(1933_...,"In New York Harbor, filmmaker Carl Denham, fam..."


Confirm your movie (enter number on the left):  0



=== Top 9 movies similar to "King Kong" ===



Unnamed: 0,Movie,Genre,Year,Decade
0,Squanto: A Warrior's Tale,adventure,1994,1990s
1,The Glass Bottom Boat,comedy,1966,1960s
2,Big Game,unknown,2014,2010s
3,Ghost Ship,horror,2002,2000s
4,Retreat,unknown,2011,2010s
5,Ravenous,horror,1999,1990s
6,Beyond the Poseidon Adventure,action,1979,1970s
7,"Tomorrow, When the War Began",war,2010,2010s
8,Alive,"drama, biography",1993,1990s


Would you like to get recommendations across different decades? (Yes/No):  y


Unnamed: 0,Decade,Movie,Genre,Year
0,1960s,The Glass Bottom Boat,comedy,1966
1,1970s,Beyond the Poseidon Adventure,action,1979
2,1990s,Squanto: A Warrior's Tale,adventure,1994
3,2000s,Ghost Ship,horror,2002
4,2010s,Big Game,unknown,2014


Movies to test:


* drama/bio: Apollo 13, The Social Network, The Curious Case of Benjamin Button, 127 Hours, The Shawshank Redemption, Forrest Gump
* comedy: Ace Ventura: When Nature Calls, The 40-Year-Old Virgin, Hitch, The Pacifier, The Hangover
* action: xXx: State of the Union, 300, Fast & Furious, The Bourne Ultimatum, Casino Royale, Pulp Fiction
* thriller: The Da Vinci Code, V for Vendetta, Seven, Fight Club, Angels & Demons
* family/animation: Ice Age: Dawn of the Dinosaurs, Toy Story 3, The Lion King
* fantasy: Harry Potter and the Goblet of Fire, The Chronicles of Narnia: Prince Caspian, Harry Potter and the Order of the Phoenix, The Lord of the Rings: The Return of the King
* adventure: King Kong
* crime: The Condemned, Ocean's Eleven, The Prestige, The Dark Knight Rises, The Godfather
* Sci-fi: Inception, Star Wars Episode IV: A New Hope (aka Star Wars), The Terminator, Back to the Future, The Hunger Games, The Matrix
* mystery: Sherlock Holmes: A Game of Shadows, Sherlock Holmes

## To do:
1. Clean the plots folder (maybe try keeping only US movies) - Done
2. Figure out a way to test out which topics are being allocated - Done
3. Try bigrams model - Done, but not too great
4. Use nlp pipeline - Done
5. Try running the model with entire dictionary - Tried but no use
6. Experiment with number of topics - 12 seems to be the right number
7. See if more passes over the corpus helps - 5 passes is pretty good

### Prospective models:

    final_mod = gensim.models.LdaMulticore(corpus, num_topics=12, id2word=plots_dict_spacy, passes=5, workers=5, 
                                       random_state = 2020)

where, 

    bow = [plots_dict_spacy.doc2bow(plot) for plot in plots_processed_spacy]
    tfidf_mod = models.TfidfModel(bow)
    corpus = tfidf_mod[bow]

and,

    plots_dict_spacy.filter_extremes(no_below = 300, no_above = 0.5)

and,

    nlp = spacy.load("en", disable=["parser", 'ner'])

    def lemmatize_nlp(doc, pos = ['NOUN', 'ADJ', 'ADV', 'VERB']):
        doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-' and token.pos_ in pos]
        doc = u' '.join(doc)
        return nlp.make_doc(doc)

    def remove_stopwords(doc):
        doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
        return doc

    nlp.add_pipe(lemmatize_nlp, name='lemmatizer', after='tagger')
    nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [540]:
# plots.to_csv("Movie-Plots.csv", index=False)

In [541]:
# filename = "Topic_model.sav"
# pickle.dump(final_mod, open(filename, 'wb'))