In [4]:
# Essentials
import base64
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datapane as dp
#dp.login(token='INSERT_TOKEN_HERE')
# Gensim and LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models
# NLP stuff
import contractions
import demoji
import string
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
nltk.download('wordnet')
import spacy
# Plotting tools
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
%matplotlib inline
# Miscellaneous
from sklearn.manifold import TSNE
from pprint import pprint

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/denverbaumgartner/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def data_parser(df):
    """Split a dataframe into one sub-dataframe per distinct 'movie_id'.

    Args:
        df: DataFrame that contains a 'movie_id' column.

    Returns:
        list of DataFrames, one for each value of df['movie_id'].unique()
        (in that order); each holds the rows whose 'movie_id' equals that value.
    """
    movie_ids = df['movie_id']
    return [df[movie_ids == movie_id] for movie_id in movie_ids.unique()]

In [None]:
# Split the full review frame into one dataframe per movie.
# NOTE(review): `df` must already exist when this cell runs; the cell that loads it
# is not visible here - confirm it is defined above so Restart & Run All works.
movie_dfs = data_parser(df)

In [None]:
def preprocess(text_col):
    """Apply the NLP cleaning steps to every document of a pandas Series such as df['text'].

    Steps, in order, for each document:
      1. lowercase every whitespace-separated token
      2. remove emojis (demoji)
      3. expand contractions token by token (contractions.fix)
      4. drop every character found in string.punctuation
      5. replace every run of non-letters with a space (this removes digits)
      6. drop English stopwords plus 'thing', but keep 'not'
      7. lemmatize each token with WordNetLemmatizer
      8. drop tokens shorter than 3 characters

    Args:
        text_col: pandas Series of strings.

    Returns:
        pandas Series (same index) of cleaned, single-space-joined documents.
    """
    # Build the loop-invariant helpers once instead of once per document / per token.
    # A set gives O(1) stopword lookups; the local name no longer shadows the
    # `stopwords` module imported at the top of the notebook.
    stop_set = (set(nltk.corpus.stopwords.words('english')) | {'thing'}) - {'not'}
    punctuation_chars = set(string.punctuation)
    non_letters = re.compile("[^a-zA-Z]+")
    lemmatize = WordNetLemmatizer().lemmatize

    def clean_document(text):
        # convert to lowercase (split/join also collapses repeated whitespace)
        text = ' '.join(w.lower() for w in text.split())

        # remove emojis
        text = demoji.replace(text, "")

        # expand contractions
        text = ' '.join(contractions.fix(w) for w in text.split())

        # remove punctuation
        text = ''.join(ch for ch in text if ch not in punctuation_chars)

        # remove numbers (and any other non-letter characters)
        tokens = non_letters.sub(" ", text).split()

        # remove stopwords
        tokens = [w for w in tokens if w not in stop_set]

        # lemmatization
        tokens = [lemmatize(w) for w in tokens]

        # remove short words
        return ' '.join(w for w in tokens if len(w) >= 3)

    return text_col.apply(clean_document)

# Clean every movie's review text and expose it under the 'review' column.
# New frames are built instead of mutating the filtered slices in place (avoids
# SettingWithCopyWarning), and the comprehension variable does not rebind the
# global `df` the way the former `for df in movie_dfs` loop did.
movie_dfs = [
    movie_df.assign(text=preprocess(movie_df['text'])).rename(columns={'text': 'review'})
    for movie_df in movie_dfs
]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's simple_preprocess.

    Args:
        sentences: iterable of documents; each one is converted with str() first.

    Yields:
        list of str tokens for each input sentence, in input order.
    """
    for sentence in sentences:
        # simple_preprocess accepts str directly, so no utf-8 encode round-trip is needed.
        # deacc=True strips accent marks from tokens; punctuation is discarded by the
        # tokenizer itself, not by this flag.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

In [None]:
# Tokenize each movie's reviews, then train per-movie bigram and trigram phrase models.

# The preprocessing cell renames the cleaned 'text' column to 'review', so read
# 'review' here (reading 'text' raised KeyError). The comprehension variable also
# avoids rebinding the global `df`.
data_words = [list(sent_to_words(movie_df['review'].to_list())) for movie_df in movie_dfs]

# One bigram model per movie.
bigrams = [gensim.models.Phrases(docs, min_count=5, threshold=25) for docs in data_words]

# One trigram model per movie, trained on that SAME movie's bigram-transformed
# documents (previously every movie was pushed through bigrams[0]).
trigrams = [
    gensim.models.Phrases(bigram[docs], min_count=5, threshold=25)
    for bigram, docs in zip(bigrams, data_words)
]

# Frozen versions of the trained models, index-aligned with movie_dfs.
bigrams_mod = [gensim.models.phrases.Phraser(bigram) for bigram in bigrams]
trigrams_mod = [gensim.models.phrases.Phraser(trigram) for trigram in trigrams]

In [None]:
# Define helper functions for stopword removal and for applying the bigram and trigram phrase models
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in `stop_words`.

    Args:
        texts: iterable of documents (token lists or strings); each is passed
            through str() and gensim's simple_preprocess before filtering.

    Returns:
        list of token lists, one per input document, without stopwords.
    """
    # Set membership is O(1); the module-level `stop_words` is a list that would
    # otherwise be scanned once per token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts, bigram_mod=None):
    """Apply a bigram phrase model to every document in `texts`.

    Args:
        texts: iterable of token lists.
        bigram_mod: phrase model to apply (anything supporting `model[tokens]`).
            Pass the model that belongs to the movie being processed. When
            omitted, the first entry of the module-level `bigrams_mod` list is used.

    Returns:
        list of token lists with detected bigrams merged.
    """
    if bigram_mod is None:
        # `bigrams_mod` is a list with one model per movie; indexing that list with a
        # document (as the previous code did) raised TypeError.
        bigram_mod = bigrams_mod[0]
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod=None, trigram_mod=None):
    """Apply a bigram model and then a trigram model to every document in `texts`.

    Args:
        texts: iterable of token lists.
        bigram_mod: bigram phrase model (anything supporting `model[tokens]`).
            When omitted, the first entry of the module-level `bigrams_mod` list is used.
        trigram_mod: trigram phrase model. When omitted, the first entry of the
            module-level `trigrams_mod` list is used.

    Returns:
        list of token lists with detected bigrams and trigrams merged.
    """
    # Both module-level names are lists with one model per movie; indexing the lists
    # with a document (as the previous code did) raised TypeError.
    if bigram_mod is None:
        bigram_mod = bigrams_mod[0]
    if trigram_mod is None:
        trigram_mod = trigrams_mod[0]
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [None]:
# Build the final per-movie token lists: remove stopwords, then merge bigrams and trigrams.
# Every list below is index-aligned with data_words (one entry per movie).
data_no_stopwords = [remove_stopwords(movie_docs) for movie_docs in data_words]

# Apply each movie's OWN bigram model. `bigrams_mod` is a per-movie list, so the
# model is selected by position with zip rather than inside a shared helper.
data_bigrams = [
    [bigram_mod[doc] for doc in movie_docs]
    for bigram_mod, movie_docs in zip(bigrams_mod, data_no_stopwords)
]

# Apply each movie's trigram model to the already bigram-merged documents, which is
# the same kind of input the trigram models were trained on (the bigram model is not
# applied a second time).
data_preprocessed = [
    [trigram_mod[doc] for doc in movie_docs]
    for trigram_mod, movie_docs in zip(trigrams_mod, data_bigrams)
]

In [None]:
# Create the dictionary and corpus needed for topic modelling.

# Dictionary: maps each token to an integer id (one dictionary per movie).
all_id2word = [gensim.corpora.Dictionary(movie_docs) for movie_docs in data_preprocessed]

# Prune each vocabulary: drop tokens found in fewer than 15 documents or in more
# than 40% of the documents, and keep at most 80000 tokens.
for id2word in all_id2word:
    id2word.filter_extremes(no_below=15, no_above=0.4, keep_n=80000)

# Corpus: bag-of-words (token_id, count) pairs for every document. Each movie's
# documents are encoded with that movie's own dictionary, so all_corpus[i] pairs
# with all_id2word[i]. (The previous nested loop produced N x N corpora, leaving
# all_corpus[i] as the first movie's documents encoded with dictionary i.)
all_corpus = [
    [id2word.doc2bow(text) for text in movie_docs]
    for id2word, movie_docs in zip(all_id2word, data_preprocessed)
]



In [None]:

# Train one LDA topic model per movie from that movie's corpus and dictionary.
# all_models maps movie_id -> trained LdaModel.

# Collect the ids from the per-movie frames. By this point the name `df` has been
# rebound by earlier `for df in movie_dfs` loops to the last movie's frame, so
# df['movie_id'].unique() would return a single id and only one model would be trained.
movie_titles = pd.concat([movie_df['movie_id'] for movie_df in movie_dfs]).unique()

all_models = {}
for movie_title, movie_corpus, movie_id2word in zip(movie_titles, all_corpus, all_id2word):
    all_models[movie_title] = gensim.models.ldamodel.LdaModel(corpus=movie_corpus,
                                           id2word=movie_id2word,
                                           num_topics=10,
                                           random_state=100,  # fixed seed for reproducible topics
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    
