In [14]:
import pandas as pd
import numpy as np
from pprint import pprint

# used for pre-processing and modeling
import gensim
from gensim import models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# used for natural language processing {NLTK: Natural Language Tool-Kit} 
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
import spacy

# Visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Load dataset: Wikipedia movie plots CSV, resolved relative to the notebook's working directory
plots = pd.read_csv(filepath_or_buffer="wiki_movie_plots_deduped.csv")

## Model 1:

1. WordNetLemmatizer
2. TF-IDF
3. LDAMulticore  

In [16]:
# Shared WordNet lemmatizer instance; created once here and reused by lemmatize() below
lemmatizer = WordNetLemmatizer()

In [17]:
# function to lemmatize a given word
def lemmatize(text):
    """Return the lemma of a single word, treating the word as a verb.

    Args:
        text: the word to lemmatize.

    Returns:
        Whatever the module-level ``lemmatizer`` returns for ``text`` with ``pos="v"``.
    """
    verb_lemma = lemmatizer.lemmatize(text, pos="v")
    return verb_lemma


# function to preprocess the text
def preprocess(text):
    """Turn a raw text into a list of lemmas suitable for topic modelling.

    Steps, in order:
      1. tokenize with ``gensim.utils.simple_preprocess``;
      2. drop gensim stopwords and tokens of 3 characters or fewer;
      3. lemmatize each surviving token with ``lemmatize``;
      4. drop lemmas of 3 characters or fewer.

    Args:
        text: the document to process.

    Returns:
        A list of lemmas, in the order their tokens appeared in ``text``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    lemmas = (
        lemmatize(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    )
    # Lemmatizing can shorten a word, so the length filter is applied a second time
    return [lemma for lemma in lemmas if len(lemma) > 3]

In [18]:
# Run the preprocessing pipeline on every plot summary
plots_processed = plots['Plot'].map(preprocess)
# Preview only the first rows rather than dumping the entire Series into the notebook output
plots_processed.head()

0        [bartender, work, saloon, serve, drink, customers, fill, stereotypically, irish, bucket, beer, carrie, nation, followers, burst, inside, assault, irish, pull, dump, beer, head, group, begin, wreck, smash, fixtures, mirror, break, cash, register, bartender, spray, seltzer, water, nation, face, group, policemen, appear, order, everybody, leave]                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
1        [moon, paint, smile, face, hang, park, night, young, couple, walk, past, fence, learn, rail, look, moon, smile, embrace, moon, smile, bigger, bench, tree, moon, view, block, cau

In [19]:
# Dictionary for LDA model: built from the preprocessed plots
plots_dict = Dictionary(documents=plots_processed)
# Number of entries in the dictionary before any filtering
len(plots_dict)

115205

In [20]:
# Filter dictionary
# NOTE: filter_extremes modifies plots_dict in place, so the size shown by the
# previous cell no longer applies once this cell has run.
plots_dict.filter_extremes(
    no_below=500,
    no_above=0.5,
)
# Number of entries remaining after filtering
len(plots_dict)

1457

In [21]:
# Bag of words and TFIDF
# Convert every preprocessed plot to its bag-of-words form using the filtered dictionary
bag_of_words = list(map(plots_dict.doc2bow, plots_processed))
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same corpus
tfidf = models.TfidfModel(bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

In [22]:
# LDA Multicore model using the tf-idf corpus
# LDA training is stochastic; random_state is pinned so that restarting the kernel
# and re-running the notebook yields the same topics instead of a new set each time.
lda_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=plots_dict,
    passes=2,
    workers=1,
    random_state=42,
)

# Print every topic (-1 = all topics) with its top weighted words
for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.012*"school" + 0.007*"student" + 0.007*"film" + 0.007*"life" + 0.006*"story" + 0.006*"students" + 0.006*"family" + 0.006*"teacher" + 0.005*"young" + 0.005*"college"

Topic: 1 
Words: 0.012*"king" + 0.009*"earth" + 0.006*"power" + 0.005*"island" + 0.005*"battle" + 0.005*"destroy" + 0.005*"ship" + 0.005*"defeat" + 0.005*"kingdom" + 0.005*"queen"

Topic: 2 
Words: 0.005*"house" + 0.005*"ghost" + 0.005*"tell" + 0.004*"body" + 0.004*"kill" + 0.004*"room" + 0.004*"home" + 0.004*"mother" + 0.004*"night" + 0.004*"leave"

Topic: 3 
Words: 0.011*"marry" + 0.010*"love" + 0.009*"marriage" + 0.008*"family" + 0.008*"father" + 0.007*"daughter" + 0.007*"mother" + 0.005*"sister" + 0.005*"house" + 0.005*"girl"

Topic: 4 
Words: 0.028*"village" + 0.012*"villagers" + 0.008*"lord" + 0.007*"family" + 0.007*"temple" + 0.007*"story" + 0.006*"kill" + 0.006*"brothers" + 0.006*"gang" + 0.006*"father"

Topic: 5 
Words: 0.012*"police" + 0.008*"murder" + 0.008*"kill" + 0.007*"case" + 0.006*"gang"

In [32]:
# Model measure of performance: coherence score of the TF-IDF LDA model
cm = CoherenceModel(
    model=lda_tfidf,
    texts=plots_processed,
    dictionary=plots_dict,
)
cm.get_coherence()

0.44154825620926574

In [None]:
# Visualization: prepare the pyLDAvis data for the TF-IDF model, using the corpus it was trained on
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_tfidf,
    corpus=corpus_tfidf,
    dictionary=plots_dict,
)

## Model 2:

1. WordNetLemmatizer
2. Bag of words
3. LDAMulticore

In [38]:
# LDA Multicore model using the bag-of-words corpus (raw term counts, no TF-IDF weighting)
# LDA training is stochastic; random_state is pinned so that restarting the kernel
# and re-running the notebook yields the same topics instead of a new set each time.
lda_bow = gensim.models.LdaMulticore(
    bag_of_words,
    num_topics=10,
    id2word=plots_dict,
    passes=2,
    workers=1,
    random_state=42,
)

# Print every topic (-1 = all topics) with its top weighted words
for idx, topic in lda_bow.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.019*"film" + 0.011*"work" + 0.010*"company" + 0.009*"story" + 0.009*"singh" + 0.009*"sing" + 0.007*"minister" + 0.007*"people" + 0.007*"life" + 0.007*"time"

Topic: 1 
Words: 0.015*"kill" + 0.011*"body" + 0.009*"house" + 0.008*"time" + 0.008*"leave" + 0.008*"find" + 0.007*"begin" + 0.007*"hospital" + 0.007*"take" + 0.006*"attack"

Topic: 2 
Words: 0.034*"kill" + 0.031*"police" + 0.017*"money" + 0.015*"gang" + 0.011*"murder" + 0.010*"arrest" + 0.010*"brother" + 0.009*"help" + 0.009*"take" + 0.009*"plan"

Topic: 3 
Words: 0.021*"kill" + 0.012*"village" + 0.012*"force" + 0.012*"attack" + 0.011*"army" + 0.011*"soldier" + 0.010*"escape" + 0.009*"fight" + 0.008*"order" + 0.008*"battle"

Topic: 4 
Words: 0.026*"love" + 0.019*"father" + 0.018*"marry" + 0.016*"family" + 0.014*"mother" + 0.013*"come" + 0.011*"leave" + 0.011*"life" + 0.011*"live" + 0.010*"tell"

Topic: 5 
Words: 0.018*"kill" + 0.017*"tell" + 0.016*"police" + 0.012*"leave" + 0.012*"murder" + 0.010*"house" + 0.01

In [None]:
# Model measure of performance: coherence score of the bag-of-words LDA model
cm_bow = CoherenceModel(
    model=lda_bow,
    texts=plots_processed,
    dictionary=plots_dict,
)
cm_bow.get_coherence()

In [None]:
# Visualization
# A model has to be visualised with the corpus it was trained on. lda_bow was fitted on
# bag_of_words, so that corpus is passed here (previously the TF-IDF corpus was passed,
# which would mix TF-IDF weights into a model trained on raw term counts).
vis_bow = pyLDAvis.gensim.prepare(lda_bow, bag_of_words, plots_dict)