In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

# used for pre-processing and modeling
import gensim
from gensim import models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# used for natural language processing {NLTK: Natural Language Tool-Kit} 
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
import spacy

# Visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddharth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load dataset: read the movie-plot CSV (path is relative to the working directory).
# The 'Plot' column of this frame is the text that gets preprocessed further down.
plots = pd.read_csv("wiki_movie_plots_deduped.csv")

## Model 1:

1. WordNetLemmatizer
2. TF-IDF
3. LdaMulticore

In [3]:
# lemmatizer: single WordNetLemmatizer instance, used by lemmatize() below
lemmatizer = WordNetLemmatizer()

In [4]:
# function to lemmatize a given word
def lemmatize(text):
    """Return the lemma of ``text``, lemmatized as a verb.

    Args:
        text: A single word (token) to lemmatize.

    Returns:
        Whatever the module-level ``lemmatizer`` returns for ``text``
        when called with ``pos="v"``.
    """
    verb_lemma = lemmatizer.lemmatize(text, pos="v")
    return verb_lemma


# function to preprocess the text
def preprocess(text):
    """Tokenize ``text`` and return its filtered, lemmatized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. A token is
    kept only if it is not a gensim stopword and is longer than three
    characters; the surviving token is lemmatized with ``lemmatize`` and the
    lemma is kept only if it, too, is longer than three characters.

    Args:
        text: The raw text of one document.

    Returns:
        A list of lemmas, in the order their tokens appear in ``text``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    kept_lemmas = []
    for token in gensim.utils.simple_preprocess(text):
        # Discard stopwords and tokens of three characters or fewer.
        if token in stopwords or len(token) <= 3:
            continue
        lemma = lemmatize(token)
        # The lemma can be shorter than the token, so apply the length filter again.
        if len(lemma) <= 3:
            continue
        kept_lemmas.append(lemma)
    return kept_lemmas

In [5]:
# Run preprocess() on every plot; each entry of the resulting Series is a list of tokens
plots_processed = plots['Plot'].map(preprocess)
plots_processed

0        [bartender, work, saloon, serve, drink, custom...
1        [moon, paint, smile, face, hang, park, night, ...
2        [film, minute, long, compose, shots, girl, bas...
3        [last, second, consist, shots, shoot, wood, wi...
4        [earliest, know, adaptation, classic, fairytal...
                               ...                        
34881    [film, begin, world, centre, joshua, connor, r...
34882    [musicians, salih, gürkan, describe, adventure...
34883    [zafer, sailor, live, mother, döndü, coastal, ...
34884    [film, centre, young, woman, name, tyler, book...
34885    [writer, orhan, şahin, return, stanbul, years,...
Name: Plot, Length: 34886, dtype: object

In [164]:
# Dictionary for LDA model, built from the preprocessed plots (one token list per plot)
plots_dict = Dictionary(plots_processed)
# Vocabulary size before any filtering
len(plots_dict)

115205

In [165]:
# Filter dictionary (modifies plots_dict in place): keep only tokens that occur in
# at least 948 documents and in no more than 50% of all documents
plots_dict.filter_extremes(no_below = 948, no_above = 0.5)
# Vocabulary size after filtering
len(plots_dict)

762

In [166]:
# Words of the filtered dictionary, in the order the dictionary yields its (id, word) pairs
l = [word for _token_id, word in plots_dict.iteritems()]

In [167]:
# Per-token-id frequencies recorded by the dictionary (plots_dict.dfs)
freq_dict = plots_dict.dfs
# One row per token id, ordered by id, with a fresh 0..n-1 row index
freq_df = (
    pd.DataFrame.from_dict(freq_dict, orient='index', columns=['Frequency'])
    .reset_index()
    .sort_values(by='index')
    .reset_index(drop=True)
)
# Look every word up by its own token id instead of pairing rows by position with a
# word list built in another cell: the pairing then cannot silently go out of sync
# with the dictionary (stale kernel state, or iteration order differing from id order).
freq_df.insert(1, 'word', [plots_dict[int(token_id)] for token_id in freq_df['index']])

In [168]:
# Order the word table by ascending 'Frequency' (rarest kept words first)
freq = freq_df.sort_values(by = 'Frequency')

In [170]:
# Show the frequency-sorted word table
freq

Unnamed: 0,index,word,Frequency
213,213,judge,948
385,385,retire,949
652,652,completely,951
409,409,obtain,951
130,130,violent,952
...,...,...,...
66,66,kill,12527
124,124,tell,12763
309,309,love,12785
54,54,take,14272


In [171]:
# Bag of words and TFIDF
# Convert every preprocessed plot to its bag-of-words form using the filtered dictionary
bag_of_words = list(map(plots_dict.doc2bow, plots_processed))
# Fit a TF-IDF model on that corpus, then apply it to the same corpus
tfidf = models.TfidfModel(bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

In [172]:
# LDA Multicore model using the tf-idf corpus
lda_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=plots_dict,
    passes=2,
    workers=1,
    random_state=2020,  # fixed seed
)

# Print each topic id together with its weighted-word summary (-1 requests all topics)
for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.008*"affair" + 0.008*"husband" + 0.007*"wife" + 0.007*"david" + 0.007*"divorce" + 0.006*"mother" + 0.006*"family" + 0.006*"love" + 0.006*"relationship" + 0.006*"home"

Topic: 1 
Words: 0.021*"school" + 0.019*"students" + 0.018*"team" + 0.014*"student" + 0.010*"high" + 0.010*"game" + 0.009*"college" + 0.009*"professor" + 0.007*"teacher" + 0.006*"government"

Topic: 2 
Words: 0.017*"police" + 0.012*"kill" + 0.012*"gang" + 0.011*"murder" + 0.009*"officer" + 0.008*"inspector" + 0.008*"case" + 0.007*"jail" + 0.007*"arrest" + 0.007*"prison"

Topic: 3 
Words: 0.008*"kill" + 0.008*"body" + 0.007*"killer" + 0.007*"house" + 0.006*"spirit" + 0.006*"police" + 0.005*"murder" + 0.005*"tell" + 0.005*"woman" + 0.004*"room"

Topic: 4 
Words: 0.013*"love" + 0.012*"marry" + 0.011*"village" + 0.011*"family" + 0.010*"father" + 0.009*"story" + 0.009*"marriage" + 0.008*"mother" + 0.008*"life" + 0.008*"daughter"

Topic: 5 
Words: 0.016*"michael" + 0.012*"peter" + 0.010*"park" + 0.009*"hotel

In [173]:
# Model measure of performance
# Coherence of the TF-IDF LDA model, evaluated against the preprocessed plots
cm = CoherenceModel(
    model=lda_tfidf,
    texts=plots_processed,
    dictionary=plots_dict,
)
cm.get_coherence()

0.43321389786906456

In [42]:
# Visualization
# Prepare the pyLDAvis data for the TF-IDF LDA model ...
vis = pyLDAvis.gensim.prepare(lda_tfidf, corpus_tfidf, plots_dict)
# ... and render it as the cell output. Ending the cell with the bare module name
# `pyLDAvis` would only show the module's repr and never display `vis`.
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
