In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

# used for pre-processing and modeling
import gensim
from gensim import models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

# used for natural language processing {NLTK: Natural Language Tool-Kit} 
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
import spacy

# Visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load dataset
plots = pd.read_csv("wiki_movie_plots_deduped.csv")

## Model 1:

1. WordNetLemmatizer
2. TF-IDF
3. LDAMulticore  

In [3]:
# lemmatizer
lemmatizer = WordNetLemmatizer()

# spacy model
nlp = spacy.load("en")

In [4]:
# function to lemmatize a given word
def lemmatize(text):
    return lemmatizer.lemmatize(text, pos = "n")

# function to preprocess the text
def preprocess(text):
    result = []
    for word in gensim.utils.simple_preprocess(text):
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3:
            l = lemmatize(word)
            if len(l) > 3:
                result.append(l)
    return result

In [5]:
plots_processed = plots['Plot'].map(preprocess)
plots_processed

0        [bartender, working, saloon, serving, drink, c...
1        [moon, painted, smiling, face, hang, park, nig...
2        [film, minute, long, composed, shot, girl, sit...
3        [lasting, second, consisting, shot, shot, wood...
4        [earliest, known, adaptation, classic, fairyta...
                               ...                        
34881    [film, begin, world, ended, centre, joshua, co...
34882    [musician, salih, gürkan, described, adventure...
34883    [zafer, sailor, living, mother, döndü, coastal...
34884    [film, centre, young, woman, named, tyler, boo...
34885    [writer, orhan, şahin, return, stanbul, year, ...
Name: Plot, Length: 34886, dtype: object

In [None]:
# Dictionary for LDA model
plots_dict = Dictionary(plots_processed)
len(plots_dict)

In [None]:
# Filter dictonary
plots_dict.filter_extremes(no_below = 5, no_above = 0.5) #948
len(plots_dict)

In [8]:
# function that provides frequency of words in the dictionary
def freq_list(dict):
    l = []
    for k, v in dict.iteritems():
        l.append(v)
    
    freq_dict = dict.dfs
    freq_df = pd.DataFrame.from_dict(freq_dict, orient = 'index', columns = ['Frequency']).reset_index()
    freq_df = freq_df.sort_values(by = 'index').reset_index()
    freq_df.drop(columns = ['level_0'], inplace = True)
    freq_df.insert(1, 'word', l)
    
    freq = freq_df.sort_values(by = 'Frequency')
    
    return freq

In [9]:
# Bag of words and TFIDF
bag_of_words = [plots_dict.doc2bow(plot) for plot in plots_processed]
tfidf = models.TfidfModel(bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

In [10]:
# LDA Multicore model using the tf-idf corpus
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=plots_dict, passes=10,
                                       workers = 1, random_state = 2020)

for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.046*"film" + 0.032*"story" + 0.020*"movie" + 0.019*"role" + 0.019*"character" + 0.018*"life" + 0.016*"played" + 0.014*"music" + 0.014*"director" + 0.013*"based"

Topic: 1 
Words: 0.006*"body" + 0.006*"house" + 0.006*"police" + 0.005*"kill" + 0.005*"tell" + 0.005*"room" + 0.004*"night" + 0.004*"phone" + 0.004*"home" + 0.004*"death"

Topic: 2 
Words: 0.055*"school" + 0.053*"student" + 0.029*"teacher" + 0.029*"team" + 0.023*"college" + 0.021*"high" + 0.020*"game" + 0.017*"david" + 0.014*"class" + 0.011*"peter"

Topic: 3 
Words: 0.013*"village" + 0.013*"love" + 0.012*"family" + 0.011*"father" + 0.010*"marriage" + 0.009*"daughter" + 0.009*"mother" + 0.008*"brother" + 0.008*"married" + 0.008*"marry"

Topic: 4 
Words: 0.205*"island" + 0.122*"ship" + 0.095*"boat" + 0.084*"crew" + 0.068*"pilot" + 0.066*"captain" + 0.058*"frank" + 0.042*"plane" + 0.028*"storm" + 0.022*"board"

Topic: 5 
Words: 0.006*"friend" + 0.006*"love" + 0.006*"relationship" + 0.006*"woman" + 0.005*"mother

In [11]:
# Model measure of performance
cm = CoherenceModel(model = lda_tfidf, texts = plots_processed, dictionary = plots_dict)
cm.get_coherence()

0.48014813022753183

In [12]:
# Visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_tfidf, corpus_tfidf, plots_dict)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 2:

using spacy for lemmatizing

In [254]:
def lemmatize_nlp(doc, pos = ['NOUN', 'ADJ', 'ADV', 'VERB']):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-' and token.pos_ in pos]
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

def remove_stopwords(doc):
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

nlp.add_pipe(lemmatize_nlp, name='lemmatizer', after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [255]:
plots_processed_spacy = plots['Plot'].str.lower().map(nlp)

In [282]:
# Dictionary for LDA model
plots_dict_spacy = Dictionary(plots_processed_spacy)
len(plots_dict_spacy)

71514

In [283]:
# Filter dictonary
plots_dict_spacy.filter_extremes(no_below = 200, no_above = 0.5)
len(plots_dict_spacy)

2874

In [289]:
# Bag of words and TFIDF
bow = [plots_dict_spacy.doc2bow(plot) for plot in plots_processed_spacy]
tfidf_mod = models.TfidfModel(bow)
corpus = tfidf_mod[bow]

In [290]:
# LDA Multicore model using the tf-idf corpus
lda_mod = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=plots_dict_spacy, passes=10, workers=1, 
                                       random_state = 2020)

for idx, topic in lda_mod.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.008*"king" + 0.005*"earth" + 0.004*"power" + 0.004*"kingdom" + 0.004*"human" + 0.004*"alien" + 0.004*"monster" + 0.004*"island" + 0.004*"battle" + 0.003*"kill"

Topic: 1 
Words: 0.037*"music" + 0.034*"film" + 0.030*"band" + 0.026*"song" + 0.025*"role" + 0.024*"play" + 0.022*"singer" + 0.018*"star" + 0.017*"actor" + 0.016*"sing"

Topic: 2 
Words: 0.021*"police" + 0.021*"gang" + 0.017*"murder" + 0.015*"crime" + 0.015*"criminal" + 0.014*"gangster" + 0.013*"prison" + 0.012*"cop" + 0.012*"inspector" + 0.011*"diamond"

Topic: 3 
Words: 0.160*"crux" + 0.072*"molly" + 0.035*"drama" + 0.023*"concern" + 0.022*"duke" + 0.020*"sailor" + 0.020*"irish" + 0.017*"protagonist" + 0.012*"boxer" + 0.011*"rule"

Topic: 4 
Words: 0.005*"kill" + 0.004*"terrorist" + 0.004*"soldier" + 0.004*"officer" + 0.003*"war" + 0.003*"japanese" + 0.003*"police" + 0.003*"agent" + 0.003*"bomb" + 0.003*"korean"

Topic: 5 
Words: 0.129*"mafia" + 0.066*"horse" + 0.041*"shark" + 0.041*"ranch" + 0.032*"cattle"

In [291]:
cm_spacy = CoherenceModel(model = lda_mod, texts = plots_processed_spacy, dictionary = plots_dict_spacy)
cm_spacy.get_coherence()

0.4837585692367334

In [292]:
# Visualization
pyLDAvis.enable_notebook()
vis_spacy = pyLDAvis.gensim.prepare(lda_mod, corpus, plots_dict_spacy)
vis_spacy

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 3:
Using Bigrams

In [13]:
bigrams = gensim.models.Phrases(plots_processed, min_count=10, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigrams)

In [14]:
def create_bigrams(text):
    return bigram_mod[text]

In [15]:
bigrams_processed = plots_processed.map(create_bigrams)

In [58]:
# Dictionary for LDA model
bigrams_dict = Dictionary(bigrams_processed)
len(bigrams_dict)

122071

In [59]:
# Filter dictonary
bigrams_dict.filter_extremes(no_below = 5, no_above = 0.5)
len(bigrams_dict)

37360

In [60]:
f = freq_list(bigrams_dict)
f

Unnamed: 0,index,word,Frequency
21414,21414,oriole,5
32313,32313,auburn,5
32308,32308,floored,5
32304,32304,ángel,5
26788,26788,tolerable,5
...,...,...,...
304,304,tell,11067
611,611,friend,11798
239,239,time,11837
849,849,love,12118


In [61]:
# Bag of words and TFIDF
bow_bigrams = [bigrams_dict.doc2bow(plot) for plot in bigrams_processed]
tfidf_bigrams = models.TfidfModel(bow_bigrams)
corpus_bigrams = tfidf_bigrams[bow_bigrams]

In [62]:
# LDA Multicore model using the tf-idf corpus
lda_bigram = gensim.models.LdaMulticore(corpus_bigrams, num_topics=10, id2word=bigrams_dict, passes=10, 
                                        random_state = 2020)

for idx, topic in lda_bigram.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.033*"raja" + 0.010*"hari" + 0.008*"jong" + 0.005*"wang" + 0.004*"sasha" + 0.003*"chung" + 0.003*"ratan" + 0.003*"dada" + 0.003*"punjab" + 0.002*"shaolin"

Topic: 1 
Words: 0.002*"japanese" + 0.002*"earth" + 0.002*"world" + 0.002*"group" + 0.002*"ship" + 0.002*"king" + 0.001*"soldier" + 0.001*"battle" + 0.001*"army" + 0.001*"island"

Topic: 2 
Words: 0.011*"dong" + 0.009*"chandu" + 0.008*"conan" + 0.005*"rana" + 0.003*"reiko" + 0.003*"malaysia" + 0.002*"zhao" + 0.002*"kogoro" + 0.002*"masha" + 0.002*"yuen"

Topic: 3 
Words: 0.004*"sook" + 0.003*"katya" + 0.002*"jasmine" + 0.002*"saki" + 0.002*"amber" + 0.002*"rhea" + 0.002*"karim" + 0.002*"lara" + 0.001*"rina" + 0.001*"nila"

Topic: 4 
Words: 0.007*"ninja" + 0.007*"chandra" + 0.005*"chun" + 0.003*"tara" + 0.003*"miki" + 0.002*"zack" + 0.002*"manchu" + 0.002*"shim" + 0.002*"tamura" + 0.002*"shocker"

Topic: 5 
Words: 0.012*"ramu" + 0.011*"maya" + 0.007*"vishnu" + 0.004*"kira" + 0.002*"yoko" + 0.002*"suspense" + 0.002*"

In [63]:
cm_bigrams = CoherenceModel(model = lda_bigram, texts = bigrams_processed, dictionary = bigrams_dict)
cm_bigrams.get_coherence()

0.6388484061098878

In [64]:
# Visualization
pyLDAvis.enable_notebook()
vis_bigram = pyLDAvis.gensim.prepare(lda_bigram, corpus_bigrams, bigrams_dict)
vis_bigram

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-64-6b3404ef4290>", line 3, in <module>
    vis_bigram = pyLDAvis.gensim.prepare(lda_bigram, corpus_bigrams, bigrams_dict)
  File "/opt/anaconda3/lib/python3.7/site-packages/pyLDAvis/gensim.py", line 119, in prepare
    return vis_prepare(**opts)
  File "/opt/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py", line 398, in prepare
    topic_info         = _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)
  File "/opt/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py", line 255, in _topic_info
    for ls in _job_chunks(lambda_seq, n_jobs)))
  File "/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 934, in __call__
    self.retrieve()
  File "/opt/anaconda3/lib/python3.7/sit

KeyboardInterrupt: 