# Topic Modelling


In [1]:
import gensim
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import nltk
import numpy as np

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    text = ' '.join(text.split())
    return text

In [7]:
# Load the dataset 
df = pd.read_csv('../Tom/Movie_database_BritishAmerican2000-2021.csv')

In [31]:
# Viewing dataset
df
df['Plot']

0       Kaisa is a Scot, a successful London lawyer, w...
1       Plagued by endless visions and nightmares, Jen...
2       Garland's novel centers on a young nicotine-ad...
3       In the Bronx, Joe (Sir Billy Connolly), an Iri...
4       A woman who, by a promise made years earlier, ...
                              ...                        
9264    Puss in Boots discovers that his passion for a...
9265                                                  NaN
9266    A tale of outsized ambition and outrageous exc...
9267    A joyous, emotional, heartbreaking celebration...
9268    At West Point Academy in 1830, the calm of an ...
Name: Plot, Length: 9269, dtype: object

In [10]:
#Remove NAs
df. dropna()
#EDA 
print(len(df)) 
print(df[:5])

9269
   IMDbRating                     Title  Year                      Genre  \
0         7.1                  Aberdeen  2000                      Drama   
1         4.1                The Asylum  2000    Drama, Horror, Thriller   
2         6.6                 The Beach  2000  Adventure, Drama, Romance   
3         5.6             Beautiful Joe  2000              Comedy, Drama   
4         6.3  My Best Friend's Wedding  1997     Comedy, Drama, Romance   

                                                Plot  \
0  Kaisa is a Scot, a successful London lawyer, w...   
1  Plagued by endless visions and nightmares, Jen...   
2  Garland's novel centers on a young nicotine-ad...   
3  In the Bronx, Joe (Sir Billy Connolly), an Iri...   
4  A woman who, by a promise made years earlier, ...   

                                           Actors  
0  Stellan Skarsgård, Lena Headey, Jean Johansson  
1        Steffanie Pitt, Nick Waring, Ingrid Pitt  
2   Leonardo DiCaprio, Tilda Swinton, Daniel 

In [13]:
## Stop words 
en_stop = set(nltk.corpus.stopwords.words('english'))


In [14]:
# From workshop - setting up lemmatisation and removing stop words 

import nltk
from nltk.corpus import wordnet

lmtzr = nltk.WordNetLemmatizer().lemmatize

## We lookup whether a word is and adjective, verb, noun or adverb here.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    
## This version uses word type. Needs the bigger nltp download ("popular")
def normalize_text(text):
    ## Runs on documents (vector of words)
    word_pos = nltk.pos_tag(nltk.word_tokenize(text))
    lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]

    return [x.lower() for x in lemm_words]

## This version doesn't require the "popular" download
def preprocess(text):
    ## Runs on documents (vector of words)
    lemmatizer = nltk.WordNetLemmatizer()
    return([lemmatizer.lemmatize(i) for i in text.split()])

################
## wordnet version
from nltk.corpus import wordnet as wn
def get_lemma(word):
    ## morphy does a lemma lookup and word standardization
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

## lemmatize
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

## This version is for comparison
def prepare_text_for_lda(text):
    ## Runs on documents (vector of words)
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


In [20]:
print(df.iloc[1]['Plot'])

Plagued by endless visions and nightmares, Jenny Adams suspects that, as a child, she was responsible for the brutal murder of her own mother.


In [22]:
doc_sample = df.iloc[1]['Plot']

from gensim import parsing

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(normalize_text(doc_sample))
print('\n\n simpler tokenized and lemmatized document: ')
print(preprocess(doc_sample))
print('\n\n method removing stop words: ')
print(prepare_text_for_lda(doc_sample))

# The method removing stopwords appears successful but the lemmatisation is pretty bad in all cases 

original document: 
['Plagued', 'by', 'endless', 'visions', 'and', 'nightmares,', 'Jenny', 'Adams', 'suspects', 'that,', 'as', 'a', 'child,', 'she', 'was', 'responsible', 'for', 'the', 'brutal', 'murder', 'of', 'her', 'own', 'mother.']


 tokenized and lemmatized document: 
['plagued', 'by', 'endless', 'vision', 'and', 'nightmare', ',', 'jenny', 'adams', 'suspect', 'that', ',', 'a', 'a', 'child', ',', 'she', 'be', 'responsible', 'for', 'the', 'brutal', 'murder', 'of', 'her', 'own', 'mother', '.']


 simpler tokenized and lemmatized document: 
['Plagued', 'by', 'endless', 'vision', 'and', 'nightmares,', 'Jenny', 'Adams', 'suspect', 'that,', 'a', 'a', 'child,', 'she', 'wa', 'responsible', 'for', 'the', 'brutal', 'murder', 'of', 'her', 'own', 'mother.']


 method removing stop words: 
['Plagued', 'endless', 'vision', 'nightmare', 'Jenny', 'Adams', 'suspect', 'child', 'responsible', 'brutal', 'murder', 'mother']


In [23]:
doc_sample = df.iloc[2]['Plot']

from gensim import parsing

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(normalize_text(doc_sample))
print('\n\n simpler tokenized and lemmatized document: ')
print(preprocess(doc_sample))
print('\n\n method removing stop words: ')
print(prepare_text_for_lda(doc_sample))

#Again removing stop words looks good
# first lemmatisation seems to be doing better 

original document: 
["Garland's", 'novel', 'centers', 'on', 'a', 'young', 'nicotine-addicted', 'traveler', 'named', 'Richard,', 'an', 'avid', 'pop-culture', 'buff', 'with', 'a', 'particular', 'love', 'for', 'video', 'games', 'and', 'Vietnam', 'War', 'movies.', 'While', 'at', 'a', 'hotel', 'in', 'Bangkok,', 'he', 'finds', 'a', 'map', 'left', 'by', 'his', 'strange,', 'whacked-out', 'neighbor,', 'who', 'just', 'committed', 'suicide.', 'The', 'map', 'supposedly', 'leads', 'to', 'a', 'legendary', 'island', 'paradise', 'where', 'some', 'other', 'wayward', 'souls', 'have', 'settled.']


 tokenized and lemmatized document: 
['garland', "'s", 'novel', 'center', 'on', 'a', 'young', 'nicotine-addicted', 'traveler', 'name', 'richard', ',', 'an', 'avid', 'pop-culture', 'buff', 'with', 'a', 'particular', 'love', 'for', 'video', 'game', 'and', 'vietnam', 'war', 'movie', '.', 'while', 'at', 'a', 'hotel', 'in', 'bangkok', ',', 'he', 'find', 'a', 'map', 'leave', 'by', 'his', 'strange', ',', 'whacked-out

In [34]:
df_new = df[df['Plot'].notnull()]
processed_df = df_new['Plot'].map(preprocess) # preprocess is faster than normalise_text.
processed_df[:10]


0    [Kaisa, is, a, Scot,, a, successful, London, l...
1    [Plagued, by, endless, vision, and, nightmares...
2    [Garland's, novel, center, on, a, young, nicot...
3    [In, the, Bronx,, Joe, (Sir, Billy, Connolly),...
4    [A, woman, who,, by, a, promise, made, year, e...
5    [County, Durham,, during, the, endless,, viole...
6    [The, intersecting, life, story, of, Daniel, P...
7    [Brendan, Behan,, a, sixteen, year-old, republ...
8    [Maya, is, a, quick-witted, young, woman, who,...
9    [A, beautiful, psychiatrist, befriends, an, ab...
Name: Plot, dtype: object

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k,v  in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

## https://github.com/rfhussain/Topic-Modeling-with-Python-Scikit-LDA/blob/master/source/lda_test.py



In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random

In [27]:
#initializing the count vectorizer
#max document frequencey means that the percentage of max frequency shuld be less than 90% of any word across documents
#min document frequencey is an integer, means that a word must occur at least 2 or more times to be counted
#stop words will be automatically tackled through sklearn 
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

In [35]:
#the fit transform method will return a sparse matrix (numberofariticles x totalwords)
dtm  = cv.fit_transform(df_new['Plot'])

In [36]:
#initialize the LDA, n_components =10 means that we are opting for 10 distinct topics
#the n_components depends upon how big is the repository and how many topics you want to discover
#keep the random state as 42
LDA = LatentDirichletAllocation(n_components=10, random_state=42)

In [38]:
#fit the model into lda
LDA.fit(dtm)

#grab the vocabulary of words
#get the random words 
random_int = random.randint(0,5477)

cv.get_feature_names()[random_int] #this function will get the words from the document

#grab the topics
single_topic = LDA.components_[0]


#this way we can get index position for high probablity topics SORTED by probablity in ASC order
top_10_words = single_topic.argsort()[-10:] #to get the last 10 highest probablity words for this topic



In [39]:
for index in top_10_words:
    print(cv.get_feature_names()[index])
    
#grab the highest probablity words per topic
for i, topic in enumerate(LDA.components_):
    print(f"The top 15 words for the topic #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-10:]]) 
    print("\n")
    print("\n")


#attach the topic number to the original topics
topic_results = LDA.transform(dtm)

topic_results[0]

great
gets
does
team
langford
perform
host
obsessed
talk
jerry
The top 15 words for the topic #0
['great', 'gets', 'does', 'team', 'langford', 'perform', 'host', 'obsessed', 'talk', 'jerry']




The top 15 words for the topic #1
['school', 'max', 'wife', 'high', 'money', 'son', 'los', 'angeles', 'home', 'family']




The top 15 words for the topic #2
['man', 'young', 'family', 'film', 'new', 'set', 'love', 'world', 'life', 'story']




The top 15 words for the topic #3
['house', 'story', 'salvation', 'luck', 'drug', 'wife', 'reverend', 'jr', 'grace', 'salo']




The top 15 words for the topic #4
['beachum', 'wants', 'time', 'soon', 'car', 'death', 'louis', 'bob', 'frank', 'steve']




The top 15 words for the topic #5
['future', 'town', 'report', 'help', 'world', 'man', 'ted', 'school', 'story', 'life']




The top 15 words for the topic #6
['father', 'love', 'family', 'year', 'high', 'friend', 'friends', 'school', 'new', 'life']




The top 15 words for the topic #7
['mission', 'time'

array([0.00169558, 0.00169542, 0.37159081, 0.00169507, 0.37041398,
       0.00169539, 0.00169552, 0.24612781, 0.00169541, 0.00169501])