In [1]:
import pandas as pd
import tqdm
import plotly.express as px
from operator import itemgetter
import os

# NLP stuff:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

# LDA stuff:
import pyLDAvis
import pyLDAvis.gensim_models
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
from gensim.similarities import Similarity
from gensim.corpora import Dictionary

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
# import lemmatizer data:
# nltk.download('wordnet')
# nltk.download('omw-1.4')

---
### Import Movies Data:

In [3]:
movie_desc_df = pd.read_csv('data/movie_descriptions.csv')

# impute missing values:
movie_desc_df['description'] = movie_desc_df['description'].fillna('NA')
# movie_desc_df['description'] = movie_desc_df['title'] + ' ' + movie_desc_df['description']
movie_desc_df.head()

Unnamed: 0,id,movie_id,imdb_id,title,description,genres,lda_vector,sim_list
0,1,675353,12412888.0,Sonic the Hedgehog 2,"After settling in Green Hills, Sonic is eager ...","[28, 878, 35, 10751, 12]",,
1,2,453395,9419884.0,Doctor Strange in the Multiverse of Madness,"Doctor Strange, with the help of mystical alli...","[14, 28, 12]",,
2,3,629542,8115900.0,The Bad Guys,When the infamous Bad Guys are finally caught ...,"[16, 35, 28, 10751, 80]",,
3,4,414906,1877830.0,The Batman,"In his second year of fighting crime, Batman u...","[80, 9648, 53]",,
4,5,335787,1464335.0,Uncharted,"A young street-smart, Nathan Drake and his wis...","[28, 12]",,


#### Apply some feature engineering to add genre to the descriptions

In [4]:
# here we'll use the "genres" column to encode new tokens which we'll add to the original 
def add_genres(df, target_col, genre_col):
    # df[target_col] = df['title'] + ' ' + df[target_col]
    tokenizer = RegexpTokenizer(r'\w+')
    encoded_genres = []
    new_feat = []
    for i, genres in enumerate(df[genre_col]):
        raw_genres = tokenizer.tokenize(genres)
        new_genres = ['genre_'+ str(x) for x in raw_genres]
        # print(new_genres)
        new_feat.append(' '.join([df[target_col].iloc[i]] + new_genres))
        # print(new_feat[i])
    
    return new_feat

In [5]:
# add genres to new 'description' column:
movie_desc_df['new_desc'] = add_genres(df=movie_desc_df, target_col='description', genre_col='genres')
movie_desc_df['new_desc'][5]

'Peter Parker is unmasked and no longer able to separate his normal life from the high-stakes of being a super-hero. When he asks for help from Doctor Strange the stakes become even more dangerous, forcing him to discover what it truly means to be Spider-Man. genre_28 genre_12 genre_878'

---
### Define Text Cleaning Functions:
Target the "Description" column. Applies cleaning to an entire corpus of documents.

In [6]:
def corpus_cleaner_and_tokenizer(df, target_col):
    # drop/impute any vector with missing values:
    # df = df.dropna(subset=[target_col], how='any')
    df[target_col] = df[target_col].fillna('NA')
    
    # define tokenizer (any word boundary):
    tokenizer = RegexpTokenizer(r'\w+')
    
    # import stop-words:
    stop_words = get_stop_words('en')
    
    # raw text to clean:
    raw_text = df[target_col]
    
    # define lemmatizer:
    lemma = WordNetLemmatizer()
    
    # define clean text list:
    cleaned_text = []
    
    # iterate through each description and clean it:
    for text in tqdm.tqdm(raw_text):
        # set everything to lowercase:
        lowercase_text = text.lower()
        
        # tokenize text into constituent parts:
        tokenized_text = tokenizer.tokenize(lowercase_text)
        
        # remove stop words (here, you'd stem/lemmatize too if necessary):
        # here we also remove any token that is a single character (i.e. "he's" ---> ["he", "s"], so we drop the "s")
        clean_text = [x for x in tokenized_text if x not in stop_words and len(x)>1]
        # clean_text = [x for x in tokenized_text if x not in stop_words]
        # clean_text = [lemma.lemmatize(x) for x in tokenized_text if x not in stop_words]
        
        # save to clean text list:
        cleaned_text.append(clean_text)
    
    return cleaned_text

In [7]:
# cleaned_text = corpus_cleaner_and_tokenizer(df=movie_desc_df, target_col='description')
cleaned_text = corpus_cleaner_and_tokenizer(df=movie_desc_df, target_col='new_desc')

100%|███████████████████████████████████| 10000/10000 [00:01<00:00, 8313.23it/s]


In [8]:
# cleaned_text[554]

#### Applies cleaning to a single document:

In [9]:
def document_cleaner_and_tokenizer(single_doc):
    # drop/impute any vector with missing values:
    if len(single_doc)==0:
        single_doc = 'NA'
    
    # define tokenizer (any word boundary):
    tokenizer = RegexpTokenizer(r'\w+')
    
    # import stop-words:
    stop_words = get_stop_words('en')
    
    # raw text to clean:
    raw_text = single_doc
    
    # define lemmatizer:
    lemma = WordNetLemmatizer()
    
    # set everything to lowercase:
    lowercase_text = raw_text.lower()

    # tokenize text into constituent parts:
    tokenized_text = tokenizer.tokenize(lowercase_text)

    # remove stop words (here, you'd stem/lemmatize too if necessary):
    # here we also remove any token that is a single character (i.e. "he's" ---> ["he", "s"], so we drop the "s")
    clean_text = [x for x in tokenized_text if x not in stop_words and len(x)>1]
    # clean_text = [x for x in tokenized_text if x not in stop_words]
    # clean_text = [lemma.lemmatize(x) for x in tokenized_text if x not in stop_words]
    
    return clean_text

In [10]:
# document_cleaner_and_tokenizer(movie_desc_df['description'].iloc[554])
# movie_desc_df['description'].iloc[554]

---
### Build and save LDA model:

In [11]:
def build_lda_model(clean_tokens, n_topics):
    # generate dictionary of token counts for each vector of tokens:
    dictionary = corpora.Dictionary(clean_tokens)
    
    # convert to bag of words:
    corpus = [dictionary.doc2bow(vector) for vector in clean_tokens]
    
    # lda model:
    lda_model = models.ldamodel.LdaModel(corpus=corpus, 
                                         id2word=dictionary, 
                                         num_topics=n_topics)
    
    # similarities:
    index = MatrixSimilarity(corpus, num_features=len(dictionary))
    # index = similarities.MatrixSimilarity(lda_model[corpus])
    # index = similarities.MatrixSimilarity(corpus)
    
    # save all objects:
    lda_model.save('./lda_model/lda_model.model')
    dictionary.save_as_text('./lda_model/common_dictionary.txt')
    corpora.MmCorpus.serialize('./lda_model/model_corpus', corpus)
    index.save('./lda_model/model_index')
    # pyLDAvis.save_json(pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary), './lda_model/lda_model.json')
    
    
    return lda_model, dictionary, corpus, index

In [12]:
lda_model, common_dictionary, model_corpus, index = build_lda_model(clean_tokens=cleaned_text, n_topics=10)

---
### Get a Representation of all Topics:

In [31]:
# get topics from model:
lda_model.print_topics()

[(0,
  '0.007*"genre_18" + 0.006*"genre_35" + 0.005*"genre_28" + 0.004*"alex" + 0.004*"genre_12" + 0.003*"find" + 0.003*"sight" + 0.003*"genre_99" + 0.003*"johnny" + 0.003*"bed"'),
 (1,
  '0.029*"genre_18" + 0.009*"genre_10749" + 0.006*"genre_53" + 0.006*"genre_35" + 0.006*"genre_36" + 0.006*"genre_28" + 0.005*"life" + 0.005*"one" + 0.005*"man" + 0.004*"young"'),
 (2,
  '0.010*"genre_18" + 0.008*"genre_35" + 0.008*"genre_28" + 0.005*"genre_12" + 0.005*"genre_878" + 0.005*"genre_53" + 0.005*"genre_36" + 0.004*"must" + 0.004*"will" + 0.003*"genre_14"'),
 (3,
  '0.018*"genre_18" + 0.010*"genre_28" + 0.009*"genre_12" + 0.009*"genre_35" + 0.008*"genre_36" + 0.007*"genre_878" + 0.006*"genre_53" + 0.006*"world" + 0.005*"life" + 0.004*"genre_10749"'),
 (4,
  '0.015*"genre_35" + 0.012*"genre_28" + 0.009*"one" + 0.008*"genre_12" + 0.008*"genre_18" + 0.007*"genre_16" + 0.006*"genre_10751" + 0.005*"genre_14" + 0.005*"life" + 0.004*"new"'),
 (5,
  '0.018*"genre_35" + 0.010*"genre_18" + 0.009*"genre

---
### Get Topic Assignment for New Documents:

In [14]:
def get_topic_assignment(target_vector, lda_model, dictionary, top_result=True):
    # convert target doc to bag of words using dict keys:
    target_vector = document_cleaner_and_tokenizer(target_vector)
    target_doc = common_dictionary.doc2bow(target_vector)
    
    # topic_distribution = lda_model.get_document_topics(target_doc, per_word_topics=True)
    topic_distribution = lda_model.get_document_topics(target_doc)
    
    # check to see if top_result is selected, if so, only return top assignment, 
    # else, the whole list of assignments:
    if top_result:
        top_assignment = max(topic_distribution, key=itemgetter(0))
        return top_assignment
    else:
        return topic_distribution
    

In [15]:
# get_topic_assignment(target_vector=cleaned_text[5], lda_model=lda_model, dictionary=common_dictionary, top_result=True )
get_topic_assignment(target_vector=movie_desc_df['description'].iloc[554], lda_model=lda_model, dictionary=common_dictionary, top_result=True )

(9, 0.2870765)

---
### Return the Topic Assignment for all docs in the Corpus:

In [16]:
def get_corpus_topic_assignment(lda_model, corpus):
    # get a list with all topics assignments for each doc in the corpus:
    topic_assignment = lda_model.get_document_topics(model_corpus)
    
    # selects the topic with highest probability:
    top_assignment = [max(topics, key=itemgetter(0)) for topics in topic_assignment]
    
    # splits the tuple into topic ID and its assignment probability:
    topic_id, topic_proba = map(list, zip(*top_assignment))
    
    return topic_id, topic_proba

In [17]:
# topics, topic_probas = get_corpus_topic_assignment(lda_model=lda_model, corpus=model_corpus)

In [18]:
# new_df[new_df['topic']==6][['title', 'description', 'topic', 'topic probability']].sort_values(by='topic probability', ascending=False).head(20)

In [19]:
# check spider-man movies:
# new_df[new_df['description'].str.contains('Spider-Man')].sort_values(by='topic probability', ascending=False).head(20)
# new_df.query('description.str.contains("spider")', engine='python')

In [20]:
# new_df[new_df['title'].str.contains('Thor')].sort_values(by='topic probability', ascending=False).head(50)

In [21]:
# new_df[new_df['title'].str.contains('Lord of the Rings')].sort_values(by='topic probability', ascending=False).head(50)

---
### Load model objects:

In [22]:
def load_lda_model_objects(parent_dir):
    # load all constituent model objects:
    lda_model = models.LdaModel.load(os.path.join(parent_dir, 'lda_model.model'))
    dictionary = Dictionary.load_from_text(os.path.join(parent_dir, 'common_dictionary.txt'))
    corpus = corpora.MmCorpus(os.path.join(parent_dir, 'model_corpus'))
    loaded_index = index.load(os.path.join(parent_dir, 'model_index'))
    
    return lda_model, dictionary, corpus, loaded_index

In [23]:
# load_lda_model_objects('./lda_model/')

---
### Define LDA modeler function:

In [24]:
def get_corpus_topics(df, target_col, num_topics):
    # first, tokenize everything:
    cleaned_text = corpus_cleaner_and_tokenizer(df=df, target_col=target_col)
    
    # then load lda model:
    lda_model, common_dictionary, model_corpus, index = load_lda_model_objects('./lda_model/')
    # lda_model, common_dictionary, model_corpus, index = build_lda_model(clean_tokens=cleaned_text, n_topics=num_topics)
    
    # next, return topics assignment for all docs:
    topics, topic_probas = get_corpus_topic_assignment(lda_model=lda_model, corpus=model_corpus)
    
    # last, append topics and probas to original df:
    df['main topic'] = topics
    df['topic contribution'] = topic_probas
    
    return df

In [25]:
new_df = get_corpus_topics(df=movie_desc_df, target_col='new_desc', num_topics=10)
# new_df = get_lda_topics(df=movie_desc_df, target_col='description', num_topics=10)

100%|███████████████████████████████████| 10000/10000 [00:01<00:00, 6749.96it/s]


In [26]:
new_df

Unnamed: 0,id,movie_id,imdb_id,title,description,genres,lda_vector,sim_list,new_desc,main topic,topic contribution
0,1,675353,12412888.0,Sonic the Hedgehog 2,"After settling in Green Hills, Sonic is eager ...","[28, 878, 35, 10751, 12]",,,"After settling in Green Hills, Sonic is eager ...",5,0.928874
1,2,453395,9419884.0,Doctor Strange in the Multiverse of Madness,"Doctor Strange, with the help of mystical alli...","[14, 28, 12]",,,"Doctor Strange, with the help of mystical alli...",4,0.958799
2,3,629542,8115900.0,The Bad Guys,When the infamous Bad Guys are finally caught ...,"[16, 35, 28, 10751, 80]",,,When the infamous Bad Guys are finally caught ...,9,0.674903
3,4,414906,1877830.0,The Batman,"In his second year of fighting crime, Batman u...","[80, 9648, 53]",,,"In his second year of fighting crime, Batman u...",9,0.954980
4,5,335787,1464335.0,Uncharted,"A young street-smart, Nathan Drake and his wis...","[28, 12]",,,"A young street-smart, Nathan Drake and his wis...",9,0.183776
...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,576040,9228950.0,Malibu Rescue,When a long list of shenanigans lands Tyler in...,"[28, 35, 10751]",,,When a long list of shenanigans lands Tyler in...,9,0.119576
9996,9997,12797,995039.0,Ghost Town,"A spirited romantic comedy, Ghost Town is the ...","[35, 14, 10749]",,,"A spirited romantic comedy, Ghost Town is the ...",9,0.149578
9997,9998,8198,258068.0,The Quiet American,A stylish political thriller where love and wa...,"[10749, 53, 18]",,,A stylish political thriller where love and wa...,9,0.307962
9998,9999,43643,111701.0,White Fang 2: Myth of the White Wolf,"A boy and his dog, White Fang, must try to sav...","[12, 10751]",,,"A boy and his dog, White Fang, must try to sav...",4,0.604416


---
### Find and Return Document Similarities:

In [27]:
def get_most_similar_items(target_doc):
    # load model objects:
    lda_model, common_dictionary, model_corpus, index = load_lda_model_objects('./lda_model/')
    
    
    target_vector = document_cleaner_and_tokenizer(target_doc)
    target_bow = common_dictionary.doc2bow(target_vector)
    similarities = index[target_bow]
    
    # save to dataframe:
    item_id, simi_val = map(list, zip(*enumerate(similarities)))
    sim_matrix = pd.DataFrame()
    sim_matrix['movie index'] = item_id
    sim_matrix['similarity value'] = simi_val
    sim_matrix['topic assignment'], sim_matrix['assignment proba'] = get_corpus_topic_assignment(lda_model=lda_model, corpus=model_corpus)
    sim_matrix['movie title'] = movie_desc_df['title']
    sim_matrix['movie desc'] = movie_desc_df['description']
    
    # filter top 10:
    top_ten_df = sim_matrix.sort_values(by='similarity value', ascending=False)[:10]

    return top_ten_df

#### Test

In [28]:
# get some candidates:
movie_desc_df[movie_desc_df['title'].str.contains('Spider-Man')]
# movie_desc_df[movie_desc_df['title'].str.contains('Nemo')]

Unnamed: 0,id,movie_id,imdb_id,title,description,genres,lda_vector,sim_list,new_desc,main topic,topic contribution
5,6,634649,10872600.0,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,"[28, 12, 878]",,,Peter Parker is unmasked and no longer able to...,4,0.969974
147,145,1930,948470.0,The Amazing Spider-Man,Peter Parker is an outcast high schooler aband...,"[28, 12, 14]",,,Peter Parker is an outcast high schooler aband...,5,0.273188
225,223,429617,6320628.0,Spider-Man: Far From Home,Peter Parker and his friends go on a summer tr...,"[28, 12, 878]",,,Peter Parker and his friends go on a summer tr...,8,0.213722
244,241,315635,2250912.0,Spider-Man: Homecoming,Following the events of Captain America: Civil...,"[28, 12, 878, 18]",,,Following the events of Captain America: Civil...,4,0.976899
259,256,102382,1872181.0,The Amazing Spider-Man 2,"For Peter Parker, life is busy. Between taking...","[28, 12, 14]",,,"For Peter Parker, life is busy. Between taking...",6,0.535417
316,312,557,145487.0,Spider-Man,After being bitten by a genetically altered sp...,"[14, 28]",,,After being bitten by a genetically altered sp...,4,0.960539
386,381,559,413300.0,Spider-Man 3,The seemingly invincible Spider-Man goes up ag...,"[14, 28, 12]",,,The seemingly invincible Spider-Man goes up ag...,7,0.362452
443,436,324857,4633694.0,Spider-Man: Into the Spider-Verse,Miles Morales is juggling his life between bei...,"[28, 12, 16, 878]",,,Miles Morales is juggling his life between bei...,4,0.964937
1643,1621,558,316654.0,Spider-Man 2,Peter Parker is going through a major identity...,"[28, 12, 14]",,,Peter Parker is going through a major identity...,4,0.981453
1831,1808,569094,9362722.0,Spider-Man: Across the Spider-Verse,Miles Morales returns for the next chapter of ...,"[16, 28, 12, 878]",,,Miles Morales returns for the next chapter of ...,4,0.977445


In [29]:
# test document:
movie_desc_df.iloc[[5]]

Unnamed: 0,id,movie_id,imdb_id,title,description,genres,lda_vector,sim_list,new_desc,main topic,topic contribution
5,6,634649,10872600.0,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,"[28, 12, 878]",,,Peter Parker is unmasked and no longer able to...,4,0.969974


In [30]:
get_most_similar_items(movie_desc_df['description'].iloc[5])

Unnamed: 0,movie index,similarity value,topic assignment,assignment proba,movie title,movie desc
5,5,0.950382,4,0.969974,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...
316,316,0.270031,4,0.960539,Spider-Man,After being bitten by a genetically altered sp...
147,147,0.225478,5,0.273188,The Amazing Spider-Man,Peter Parker is an outcast high schooler aband...
443,443,0.218218,4,0.964937,Spider-Man: Into the Spider-Verse,Miles Morales is juggling his life between bei...
244,244,0.209165,4,0.976899,Spider-Man: Homecoming,Following the events of Captain America: Civil...
259,259,0.201853,6,0.535417,The Amazing Spider-Man 2,"For Peter Parker, life is busy. Between taking..."
1393,1393,0.197028,8,0.508109,Kick-Ass,Dave Lizewski is an unnoticed high school stud...
5823,5823,0.197028,9,0.434729,iBoy,"After an accident, Tom wakes from a coma to di..."
7299,7299,0.188982,8,0.140107,Borsalino and Co.,Marseille. Heaps of flowers and funeral wreath...
7681,7681,0.187826,8,0.317271,Spider-Man Strikes Back,"At the New York State University, one of Peter..."


---
**Note:** as we can see above, when we submit a test document containing descriptions regarding 'Spider-Man', we get back a list of most similar items that are also related to other 'Spider-Man' films. I have tested this for films with 'Thor', 'Lord of the Rings', etc.. and the sanity test has always returned similar items such as sequels/prequels. See page 269 of the text.

---
---
### Visualize Topic Clusters:

In [34]:
lda_model, common_dictionary, model_corpus, index = load_lda_model_objects('./lda_model/')
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, model_corpus, dictionary=common_dictionary)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
