# Topic Modelling


In [1]:
import gensim
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import nltk
import numpy as np
import re
from pprint import pprint# Gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
#import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
#matplotlib inline

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    text = ' '.join(text.split())
    return text

In [4]:
# Load the dataset 
df = pd.read_csv('../Tom/Movie_database_BritishAmerican2000-2021.csv')

In [5]:
# Viewing dataset
df
df['Plot']

0       Kaisa is a Scot, a successful London lawyer, w...
1       Plagued by endless visions and nightmares, Jen...
2       Garland's novel centers on a young nicotine-ad...
3       In the Bronx, Joe (Sir Billy Connolly), an Iri...
4       A woman who, by a promise made years earlier, ...
                              ...                        
9264    Puss in Boots discovers that his passion for a...
9265                                                  NaN
9266    A tale of outsized ambition and outrageous exc...
9267    A joyous, emotional, heartbreaking celebration...
9268    At West Point Academy in 1830, the calm of an ...
Name: Plot, Length: 9269, dtype: object

In [6]:
#EDA 
print(len(df)) 
print(df[:5])

9269
   IMDbRating                     Title  Year                      Genre  \
0         7.1                  Aberdeen  2000                      Drama   
1         4.1                The Asylum  2000    Drama, Horror, Thriller   
2         6.6                 The Beach  2000  Adventure, Drama, Romance   
3         5.6             Beautiful Joe  2000              Comedy, Drama   
4         6.3  My Best Friend's Wedding  1997     Comedy, Drama, Romance   

                                                Plot  \
0  Kaisa is a Scot, a successful London lawyer, w...   
1  Plagued by endless visions and nightmares, Jen...   
2  Garland's novel centers on a young nicotine-ad...   
3  In the Bronx, Joe (Sir Billy Connolly), an Iri...   
4  A woman who, by a promise made years earlier, ...   

                                           Actors  
0  Stellan Skarsgård, Lena Headey, Jean Johansson  
1        Steffanie Pitt, Nick Waring, Ingrid Pitt  
2   Leonardo DiCaprio, Tilda Swinton, Daniel 

In [7]:
## Stop words 
en_stop = set(nltk.corpus.stopwords.words('english'))

In [8]:
df_new = df[df['Plot'].notnull()]
df_new['Plot'] = df_new['Plot'].apply(lambda x: preprocess_text(x))
print(len(df_new)) 
print(df_new[:5])

9119
   IMDbRating                     Title  Year                      Genre  \
0         7.1                  Aberdeen  2000                      Drama   
1         4.1                The Asylum  2000    Drama, Horror, Thriller   
2         6.6                 The Beach  2000  Adventure, Drama, Romance   
3         5.6             Beautiful Joe  2000              Comedy, Drama   
4         6.3  My Best Friend's Wedding  1997     Comedy, Drama, Romance   

                                                Plot  \
0  kaisa is a scot a successful london lawyer who...   
1  plagued by endless visions and nightmares jenn...   
2  garlands novel centers on a young nicotineaddi...   
3  in the bronx joe sir billy connolly an irish g...   
4  a woman who by a promise made years earlier is...   

                                           Actors  
0  Stellan Skarsgård, Lena Headey, Jean Johansson  
1        Steffanie Pitt, Nick Waring, Ingrid Pitt  
2   Leonardo DiCaprio, Tilda Swinton, Daniel 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Plot'] = df_new['Plot'].apply(lambda x: preprocess_text(x))


In [9]:
# From workshop - setting up lemmatisation and removing stop words 

import nltk
from nltk.corpus import wordnet

lmtzr = nltk.WordNetLemmatizer().lemmatize

## We lookup whether a word is and adjective, verb, noun or adverb here.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    
## This version uses word type. Needs the bigger nltp download ("popular")
def normalize_text(text):
    ## Runs on documents (vector of words)
    word_pos = nltk.pos_tag(nltk.word_tokenize(text))
    lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]

    return [x.lower() for x in lemm_words]

## This version doesn't require the "popular" download
def preprocess(text):
    ## Runs on documents (vector of words)
    lemmatizer = nltk.WordNetLemmatizer()
    return([lemmatizer.lemmatize(i) for i in text.split()])

################
## wordnet version
from nltk.corpus import wordnet as wn
def get_lemma(word):
    ## morphy does a lemma lookup and word standardization
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

## lemmatize
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

## This version is for comparison
def prepare_text_for_lda(text):
    ## Runs on documents (vector of words)
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


In [10]:
print(df_new.iloc[1]['Plot'])

plagued by endless visions and nightmares jenny adams suspects that as a child she was responsible for the brutal murder of her own mother


In [11]:
doc_sample = df_new.iloc[1]['Plot']

from gensim import parsing

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(normalize_text(doc_sample))
print('\n\n simpler tokenized and lemmatized document: ')
print(preprocess(doc_sample))
print('\n\n method removing stop words: ')
print(prepare_text_for_lda(doc_sample))

# The method removing stopwords appears successful but the lemmatisation is pretty bad in all cases 

original document: 
['plagued', 'by', 'endless', 'visions', 'and', 'nightmares', 'jenny', 'adams', 'suspects', 'that', 'as', 'a', 'child', 'she', 'was', 'responsible', 'for', 'the', 'brutal', 'murder', 'of', 'her', 'own', 'mother']


 tokenized and lemmatized document: 
['plague', 'by', 'endless', 'vision', 'and', 'nightmare', 'jenny', 'adam', 'suspect', 'that', 'a', 'a', 'child', 'she', 'be', 'responsible', 'for', 'the', 'brutal', 'murder', 'of', 'her', 'own', 'mother']


 simpler tokenized and lemmatized document: 
['plagued', 'by', 'endless', 'vision', 'and', 'nightmare', 'jenny', 'adam', 'suspect', 'that', 'a', 'a', 'child', 'she', 'wa', 'responsible', 'for', 'the', 'brutal', 'murder', 'of', 'her', 'own', 'mother']


 method removing stop words: 
['plague', 'endless', 'vision', 'nightmare', 'jenny', 'adams', 'suspect', 'child', 'responsible', 'brutal', 'murder', 'mother']


In [12]:
doc_sample = df_new.iloc[2]['Plot']

from gensim import parsing

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(normalize_text(doc_sample))
print('\n\n simpler tokenized and lemmatized document: ')
print(preprocess(doc_sample))
print('\n\n method removing stop words: ')
print(prepare_text_for_lda(doc_sample))

#Again removing stop words looks good
# first lemmatisation seems to be doing better 

original document: 
['garlands', 'novel', 'centers', 'on', 'a', 'young', 'nicotineaddicted', 'traveler', 'named', 'richard', 'an', 'avid', 'popculture', 'buff', 'with', 'a', 'particular', 'love', 'for', 'video', 'games', 'and', 'vietnam', 'war', 'movies', 'while', 'at', 'a', 'hotel', 'in', 'bangkok', 'he', 'finds', 'a', 'map', 'left', 'by', 'his', 'strange', 'whackedout', 'neighbor', 'who', 'just', 'committed', 'suicide', 'the', 'map', 'supposedly', 'leads', 'to', 'a', 'legendary', 'island', 'paradise', 'where', 'some', 'other', 'wayward', 'souls', 'have', 'settled']


 tokenized and lemmatized document: 
['garland', 'novel', 'center', 'on', 'a', 'young', 'nicotineaddicted', 'traveler', 'name', 'richard', 'an', 'avid', 'popculture', 'buff', 'with', 'a', 'particular', 'love', 'for', 'video', 'game', 'and', 'vietnam', 'war', 'movie', 'while', 'at', 'a', 'hotel', 'in', 'bangkok', 'he', 'find', 'a', 'map', 'leave', 'by', 'his', 'strange', 'whackedout', 'neighbor', 'who', 'just', 'commit', 

In [13]:
processed_df = df_new['Plot'].map(prepare_text_for_lda) # with stop words removed
processed_df[:10]


0    [kaisa, successful, london, lawyer, snort, one...
1    [plague, endless, vision, nightmare, jenny, ad...
2    [garland, novel, center, young, nicotineaddict...
3    [bronx, billy, connolly, irish, twice, brain, ...
4    [woman, promise, years, earlier, suppose, marr...
5    [county, durham, endless, violent, strike, mar...
6    [intersect, story, daniel, plainview, sunday, ...
7    [brendan, behan, sixteen, yearold, republican,...
8    [quickwitted, young, woman, come, mexican, bor...
9    [beautiful, psychiatrist, befriend, abuse, pat...
Name: Plot, dtype: object

In [14]:
dictionary = gensim.corpora.Dictionary(processed_df)

count = 0
for k,v  in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 aberdeen
1 agree
2 airline
3 alcoholic
4 along
5 ban
6 begging
7 call
8 clive
9 collect
10 complicate


In [15]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# gets rid of too common or uncommon words

In [16]:
# Create Corpus 
# Term Document Frequency 
corpus = [dictionary.doc2bow(text) for text in processed_df ]  
# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]


In [17]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.032*"night" + 0.026*"threaten" + 0.021*"arrive" + 0.020*"across" + '
  '0.019*"street" + 0.018*"steal" + 0.018*"determine" + 0.017*"trap" + '
  '0.014*"thought" + 0.014*"monster"'),
 (1,
  '0.045*"escape" + 0.043*"power" + 0.029*"force" + 0.027*"earth" + '
  '0.026*"human" + 0.022*"become" + 0.018*"involve" + 0.018*"santa" + '
  '0.016*"mountain" + 0.016*"center"'),
 (2,
  '0.058*"family" + 0.029*"discover" + 0.027*"find" + 0.022*"return" + '
  '0.019*"life" + 0.018*"journey" + 0.017*"house" + 0.015*"child" + '
  '0.015*"world" + 0.013*"parent"'),
 (3,
  '0.047*"bring" + 0.044*"could" + 0.037*"lead" + 0.029*"detective" + '
  '0.027*"search" + 0.025*"become" + 0.024*"enemy" + 0.021*"threat" + '
  '0.020*"life" + 0.019*"getting"'),
 (4,
  '0.051*"learn" + 0.050*"turn" + 0.037*"murder" + 0.024*"teenager" + '
  '0.022*"include" + 0.021*"mystery" + 0.021*"member" + 0.020*"star" + '
  '0.019*"greatest" + 0.017*"director"'),
 (5,
  '0.031*"friend" + 0.023*"young" + 0.021*"years" + 0

In [19]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

In [20]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


### Coherence and hyperparameters

In [21]:
# Baseline coherence score:

from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_df, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.26060625543683913


In [22]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_df, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()

## Scikit version

https://github.com/rfhussain/Topic-Modeling-with-Python-Scikit-LDA/blob/master/source/lda_test.py



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random

In [None]:
#initializing the count vectorizer
#max document frequencey means that the percentage of max frequency shuld be less than 90% of any word across documents
#min document frequencey is an integer, means that a word must occur at least 2 or more times to be counted
#stop words will be automatically tackled through sklearn 
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

In [None]:
#the fit transform method will return a sparse matrix (numberofariticles x totalwords)
dtm  = cv.fit_transform(df_new['Plot'])

In [None]:
#initialize the LDA, n_components =10 means that we are opting for 10 distinct topics
#the n_components depends upon how big is the repository and how many topics you want to discover
#keep the random state as 42
LDA = LatentDirichletAllocation(n_components=10, random_state=42)

In [None]:
#fit the model into lda
LDA.fit(dtm)

#grab the vocabulary of words
#get the random words 
random_int = random.randint(0,5477)

cv.get_feature_names()[random_int] #this function will get the words from the document

#grab the topics
single_topic = LDA.components_[0]


#this way we can get index position for high probablity topics SORTED by probablity in ASC order
top_10_words = single_topic.argsort()[-10:] #to get the last 10 highest probablity words for this topic



In [None]:
for index in top_10_words:
    print(cv.get_feature_names()[index])
    
#grab the highest probablity words per topic
for i, topic in enumerate(LDA.components_):
    print(f"The top 15 words for the topic #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-10:]]) 
    print("\n")
    print("\n")


#attach the topic number to the original topics
topic_results = LDA.transform(dtm)

topic_results[0]