In [1]:
# required libraries

import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# used for pre-processing the text data and unsupervised topic modeling
import gensim
from gensim import models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary

# used for natural language processing {NLTK: Natural Language Tool-Kit} 
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
import nltk
# Fetch the WordNet corpus used by the WordNetLemmatizer (reported as already up-to-date when present).
nltk.download('wordnet')

# Seed NumPy's global random generator with 2019 so results can be reproduced later.
np.random.seed(2019)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Initialize the lemmatizer used by the preprocessing helpers.
# The stemmer below is left disabled; only lemmatization is created here.
#stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [8]:
# Show full cell contents instead of truncating long strings.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# Load only the columns at positions 1, 5 and 7 of the CSV
# (other cells read the 'Genre' and 'Plot' columns from this frame).
plots = pd.read_csv("wiki_movie_plots_deduped.csv", usecols = [1, 5, 7])

In [34]:
# Distinct values of the 'Genre' column, followed by how many there are.
t = pd.unique(plots['Genre'])
len(t)

2265

In [15]:
# function to lemmatize a given word
def lemmatize(text):
    """Return the lemma of ``text`` treated as a verb.

    Delegates to the module-level ``lemmatizer`` with ``pos="v"``.
    """
    verb_lemma = lemmatizer.lemmatize(text, pos="v")
    return verb_lemma


# function to preprocess the text
def preprocess(text):
    """Turn raw ``text`` into a list of lemmatized tokens.

    Steps, in order:
      1. tokenize with gensim's ``simple_preprocess``;
      2. keep tokens longer than 3 characters that are not gensim stop words;
      3. lemmatize each remaining token with ``lemmatize``;
      4. keep only lemmas that are still longer than 3 characters.

    Returns the surviving lemmas in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    candidates = (
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    )
    # Lemmatize lazily so each candidate token is lemmatized exactly once.
    lemmas = (lemmatize(token) for token in candidates)
    return [lemma for lemma in lemmas if len(lemma) > 3]

In [16]:
# Run the preprocessing pipeline on every plot summary (one token list per row).
plots_processed = plots['Plot'].map(preprocess)

# Preview only the first rows: column width is unlimited in this notebook,
# so displaying the whole Series floods the output with very long token lists.
plots_processed.head()

0        [bartender, work, saloon, serve, drink, customers, fill, stereotypically, irish, bucket, beer, carrie, nation, followers, burst, inside, assault, irish, pull, dump, beer, head, group, begin, wreck, smash, fixtures, mirror, break, cash, register, bartender, spray, seltzer, water, nation, face, group, policemen, appear, order, everybody, leave]                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
1        [moon, paint, smile, face, hang, park, night, young, couple, walk, past, fence, learn, rail, look, moon, smile, embrace, moon, smile, bigger, bench, tree, moon, view, block, cau

In [17]:
# Build the token <-> integer id mapping from the preprocessed plots.
plots_dict = Dictionary(documents=plots_processed)

In [18]:
# Number of unique tokens currently held in the dictionary.
len(plots_dict)

115205

In [27]:
# Vocabulary pruning thresholds passed to Dictionary.filter_extremes.
MIN_DOC_COUNT = 500      # no_below: absolute document-count threshold
MAX_DOC_FRACTION = 0.5   # no_above: fraction-of-corpus threshold

# Prunes the dictionary in place, then shows the remaining vocabulary size.
plots_dict.filter_extremes(no_below=MIN_DOC_COUNT, no_above=MAX_DOC_FRACTION)
len(plots_dict)

1457

In [28]:
# Convert every preprocessed plot to its bag-of-words form using the dictionary.
bag_of_words = list(map(plots_dict.doc2bow, plots_processed))

In [29]:
# Fit a tf-idf model on the bag-of-words corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(corpus=bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

In [30]:
# LDA model using the tf-idf corpus
#
# random_state is passed explicitly (same seed as the notebook-wide 2019 seed):
# without it the model draws from the global NumPy RNG, whose state depends on
# which cells were executed beforehand, so re-running this cell alone would
# yield different topics.
lda_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=plots_dict,
    passes=2,
    workers=1,
    random_state=2019,
)

In [35]:
# Print every topic of the model (-1 requests all topics) with its top weighted words.
topic_template = 'Topic: {} \nWords: {}\n'
for idx, topic in lda_tfidf.print_topics(num_topics=-1):
    print(topic_template.format(idx, topic))

Topic: 0 
Words: 0.008*"japanese" + 0.006*"singh" + 0.006*"army" + 0.006*"soldier" + 0.006*"kill" + 0.005*"russian" + 0.004*"officer" + 0.004*"military" + 0.004*"bomb" + 0.004*"government"

Topic: 1 
Words: 0.013*"love" + 0.012*"story" + 0.011*"family" + 0.010*"marry" + 0.009*"village" + 0.009*"rich" + 0.009*"revolve" + 0.008*"money" + 0.007*"fall" + 0.007*"young"

Topic: 2 
Words: 0.014*"police" + 0.011*"murder" + 0.009*"kill" + 0.008*"case" + 0.008*"gang" + 0.007*"inspector" + 0.007*"officer" + 0.005*"crime" + 0.005*"arrest" + 0.005*"killer"

Topic: 3 
Words: 0.021*"steve" + 0.021*"race" + 0.016*"arthur" + 0.014*"horse" + 0.011*"thomas" + 0.010*"mary" + 0.009*"driver" + 0.008*"scott" + 0.006*"mark" + 0.006*"aircraft"

Topic: 4 
Words: 0.041*"johnny" + 0.034*"harry" + 0.032*"nick" + 0.025*"peter" + 0.008*"prison" + 0.006*"violent" + 0.005*"sheriff" + 0.005*"money" + 0.005*"frank" + 0.005*"kill"

Topic: 5 
Words: 0.005*"kill" + 0.005*"earth" + 0.005*"ghost" + 0.004*"power" + 0.004*"bod