# TLJ Topic Modeling - NMF

In [5]:
import pandas as pd
import numpy as np
import re 
from pprint import pprint

import pickle


# import string


# from sklearn.feature_extraction.text import CountVectorizer

# from textblob import TextBlob

import nltk, pprint
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.util import *

# from sklearn.metrics import accuracy_score, confusion_matrix

# from confusion import print_confusion_matrix

# import random

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Log records are formatted as "time : level : message"; only ERROR and above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
# Suppress DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from pprint import pprint


In [6]:
# Import tlj df pickle
# NOTE: unpickling can execute arbitrary code -- only load a pickle file you created or trust.
# The context manager closes the file handle as soon as the object has been loaded.
with open('tlj_v2.pickle', 'rb') as pickle_file:
    tlj_all = pickle.load(pickle_file)

In [7]:
# English stop-word list provided by the NLTK corpus reader
stop_words = stopwords.words("english")

In [9]:
# Keep only the reviews whose google_sentiment label is 'negative'
tlj = tlj_all[tlj_all['google_sentiment'].eq('negative')]

### Remove Stop Words and Lemmatization Functions

In [10]:
# Define helper functions for stopword removal and lemmatization
def remove_stopwords(texts, stop_word_list=None):
    """Drop stop words from every tokenized document.

    Args:
        texts: iterable of documents, each an iterable of token strings.
        stop_word_list: optional iterable of words to remove. When omitted,
            the notebook-level ``stop_words`` list is used.

    Returns:
        A list holding one list of kept tokens per input document, with
        document order and token order preserved.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    blocked = set(stop_word_list)
    return [[word for word in doc if word not in blocked] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp_model=None):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of documents, each an iterable of token strings.
        allowed_postags: part-of-speech tags to keep; a token is kept when
            its ``pos_`` is contained in this collection. The default list
            is only read, never mutated.
        nlp_model: optional callable that takes a string and returns an
            iterable of tokens exposing ``lemma_`` and ``pos_`` (presumably
            a spaCy pipeline -- confirm). When omitted, the notebook-level
            ``nlp`` object is used.

    Returns:
        A list with one list of lemmas per input document.

    Raises:
        NameError: if ``nlp_model`` is not given and no global ``nlp`` exists.
    """
    if nlp_model is None:
        try:
            nlp_model = nlp
        except NameError:
            # Same exception type as before, but with an actionable message.
            raise NameError(
                "lemmatization needs a language pipeline: pass nlp_model=... "
                "or define a global `nlp` before calling"
            ) from None
    texts_out = []
    for sent in texts:
        # The pipeline works on raw text, so re-join the tokens with spaces.
        doc = nlp_model(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

## Preprocess with TF-IDF

In [11]:
# Build the TF-IDF document-term matrix for the negative reviews (English stop words removed)
tfidf = TfidfVectorizer(stop_words="english")
tlj_tfidf = tfidf.fit_transform(tlj["Reviews"].tolist())

## Find Topics with NMF Model

In [16]:
# Function to display topics
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: object exposing ``components_``; each entry is one topic's
            weight vector and must support ``argsort()``.
        feature_names: indexable collection mapping a column index to its term.
        no_top_words: number of top terms to print for each topic.
        topic_names: optional labels indexed by topic position. A topic whose
            label is missing, empty, or beyond the end of ``topic_names``
            is printed with its numeric index instead.

    Returns:
        None. All output is written with ``print``.
    """
    for ix, topic in enumerate(model.components_):
        # Fewer labels than topics must not abort the listing, so a missing
        # entry falls back to the numeric header.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None
        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")
        # argsort is ascending, so slice backwards from the end to get the
        # no_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

### Topic Count: 5

In [14]:
# Start with 5 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [15]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aback,abaloth,abandon,abandoned,abandoning,abandonment,abandonned,abandons,abborent,...,zooming,zooms,zornord,zu,zucchini,zucker,état,être,últimos,über
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,...,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.001,0.0,0.0,0.013,0.017,0.003,0.003,0.001,0.003,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001
component_3,0.0,0.0,0.001,0.0,0.0,0.002,0.0,0.0,0.002,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.003,0.0,...,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.004,0.0
component_5,0.0,0.0,0.0,0.001,0.007,0.0,0.0,0.0,0.0,0.001,...,0.003,0.0,0.0,0.001,0.001,0.002,0.001,0.001,0.0,0.001


In [21]:
# display_topics prints the topics itself and returns nothing, so there is no value to store or echo.
display_topics(nmf_model, tfidf.get_feature_names(), 20)


Topic  0
movie, bad, just, like, don, really, good, people, worst, think, reviews, didn, time, say, movies, watch, make, watching, things, want

Topic  1
luke, rey, kylo, force, snoke, jedi, ren, leia, finn, like, order, just, character, rose, did, resistance, scene, space, skywalker, poe

Topic  2
film, films, like, just, really, think, don, good, going, bad, people, reviews, say, ve, feel, time, lot, seen, review, didn

Topic  3
star, wars, disney, fan, worst, jedi, fans, movies, universe, johnson, lucas, franchise, rian, watch, ruined, saga, money, original, films, like

Topic  4
story, characters, new, plot, character, trilogy, episode, good, original, holes, johnson, development, rian, make, franchise, old, movies, sw, disney, line


### Topic Count: 10

In [22]:
# Try 10 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=10, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [23]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aback,abaloth,abandon,abandoned,abandoning,abandonment,abandonned,abandons,abborent,...,zooming,zooms,zornord,zu,zucchini,zucker,état,être,últimos,über
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002,0.001,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.001,0.0,0.0,0.005,0.017,0.001,0.004,0.002,0.003,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.001,0.0,0.0,0.002,0.0,0.0,0.002,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.004,0.0
component_5,0.0,0.0,0.0,0.0,0.006,0.0,0.0,0.0,0.0,0.0,...,0.002,0.0,0.0,0.0,0.001,0.002,0.001,0.001,0.001,0.0
component_6,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0
component_7,0.0,0.0,0.001,0.018,0.007,0.003,0.0,0.0,0.001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.002
component_8,0.0,0.0,0.0,0.0,0.007,0.0,0.0,0.0,0.005,0.0,...,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0
component_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.002,...,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.0,0.001,0.0,0.001,0.0,0.002,0.0,0.0,0.001,0.003,...,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.001


In [24]:
# display_topics prints the topics itself and returns nothing, so there is no value to store or echo.
display_topics(nmf_model, tfidf.get_feature_names(), 20)


Topic  0
like, just, don, really, people, didn, think, felt, know, good, going, feel, say, make, things, did, time, want, wasn, lot

Topic  1
luke, rey, kylo, force, jedi, ren, snoke, skywalker, character, leia, yoda, training, dark, lightsaber, ben, vader, kill, finn, awakens, powerful

Topic  2
film, films, character, jedi, good, going, awakens, previous, seen, action, franchise, ve, lot, audience, characters, feel, review, watching, scene, fans

Topic  3
star, wars, fan, disney, jedi, movies, universe, fans, watch, films, original, lucas, new, disappointed, saga, series, like, force, franchise, ruined

Topic  4
characters, story, plot, new, character, holes, trilogy, development, good, original, movies, line, old, franchise, main, scenes, effects, make, great, sw

Topic  5
movie, movies, watching, boring, point, scenes, humor, great, previous, scene, jokes, wrong, watch, sad, entire, minutes, makes, good, stars, plot

Topic  6
space, ship, order, finn, ships, rose, scene, resistanc

- Interestingly, Topic 9 is the most decipherable topic so far, so adding more topics might help. Let's try 15 next.
- Topic 9 seems to be the "Franchise Stewardship" topic, where people talk about directors, producers, and studios, along with assorted negative words.

### Topic Count: 15

In [25]:
# Try 15 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=15, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [26]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aback,abaloth,abandon,abandoned,abandoning,abandonment,abandonned,abandons,abborent,...,zooming,zooms,zornord,zu,zucchini,zucker,état,être,últimos,über
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.001,0.0,0.0,0.004,0.014,0.001,0.003,0.002,0.002,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.001,0.0,0.0,0.002,0.0,0.0,0.001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.015,0.0,0.001,0.0,0.003,0.0,...,0.001,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.003,0.001
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.0,0.0,0.001,0.021,0.009,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.001
component_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,...,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.006,0.001
component_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.002,...,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.0,0.001,0.0,0.002,0.0,0.003,0.0,0.0,0.0,0.003,...,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.001,0.0,0.0


In [27]:
# display_topics prints the topics itself and returns nothing, so there is no value to store or echo.
display_topics(nmf_model, tfidf.get_feature_names(), 20)


Topic  0
like, really, didn, don, felt, think, people, feel, things, lot, good, did, know, wasn, liked, wanted, make, feels, hate, scenes

Topic  1
luke, rey, kylo, force, jedi, ren, snoke, skywalker, character, leia, training, yoda, dark, lightsaber, ben, kill, finn, vader, killed, powerful

Topic  2
film, films, character, going, previous, franchise, awakens, time, seen, audience, jedi, characters, ve, review, good, reviews, watching, action, set, left

Topic  3
star, wars, fan, universe, movies, watch, jedi, disappointed, fans, films, like, series, watched, saga, rogue, ruined, terrible, feel, loved, huge

Topic  4
new, trilogy, original, characters, old, jedi, force, movies, awakens, sw, fans, empire, hope, universe, franchise, generation, tlj, ones, far, prequels

Topic  5
movie, movies, watching, point, boring, time, reviews, previous, watch, sad, wrong, want, great, humor, stars, jokes, makes, good, left, watched

Topic  6
space, ship, order, finn, resistance, ships, rose, leia

- Interestingly, the more topics we add, the more sense they start to make. Let's go to 20!

### Topic Count: 20

In [28]:
# Try 20 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=20, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [29]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aback,abaloth,abandon,abandoned,abandoning,abandonment,abandonned,abandons,abborent,...,zooming,zooms,zornord,zu,zucchini,zucker,état,être,últimos,über
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.001,0.0,0.0,0.004,0.01,0.001,0.002,0.002,0.001,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.001,0.0,0.0,0.002,0.0,0.0,0.001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.0,0.003,0.004,0.0,0.0,0.0,0.0,0.0,...,0.001,0.0,0.0,0.001,0.001,0.002,0.0,0.0,0.003,0.001
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.0,0.0,0.001,0.024,0.012,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.001
component_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,...,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.007,0.0
component_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.0,0.001,0.0,0.002,0.0,0.003,0.0,0.0,0.0,0.003,...,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.001,0.0,0.0


In [30]:
# display_topics prints the topics itself and returns nothing, so there is no value to store or echo.
display_topics(nmf_model, tfidf.get_feature_names(), 20)


Topic  0
like, really, felt, didn, feel, scenes, lot, good, things, scene, great, feels, did, liked, bit, wasn, little, action, moments, way

Topic  1
luke, rey, kylo, ren, snoke, character, skywalker, jedi, leia, yoda, finn, lightsaber, ben, dark, training, kill, did, killed, vader, force

Topic  2
film, films, character, franchise, previous, characters, going, audience, seen, ve, action, good, lot, moments, script, direction, left, feel, set, watching

Topic  3
star, wars, fan, universe, movies, watch, disappointed, fans, films, series, jedi, watched, like, saga, rogue, original, ruined, feel, loved, huge

Topic  4
new, characters, trilogy, original, story, old, movies, good, generation, ones, fans, empire, make, universe, hope, franchise, care, far, heroes, end

Topic  5
movie, movies, watching, point, previous, sad, left, stars, wrong, makes, great, boring, entire, humor, watched, jokes, theater, things, minutes, series

Topic  6
space, ship, order, ships, resistance, finn, leia, 

- The topics are not getting much clearer now, although they are not much vaguer than with 15. We'll stop here.

### Bigrams and Trigrams
Let's see if bigrams and trigrams give us any clearer topics.

#### Bigrams

In [50]:
# Bigram TF-IDF: the vocabulary holds single words and two-word phrases
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
tlj_tfidf = tfidf.fit_transform(tlj["Reviews"].tolist())

In [38]:
# 15 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=15, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [39]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aa long,aback,aback wants,abaloth,abaloth telling,abandon,abandon altogether,abandon approach,abandon base,...,zucker films,état,état apparently,être,être luke,últimos,últimos jedi,über,über alles,über powered
component_1,0.002,0.002,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,...,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002,0.002,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.0,0.0,0.0,0.0,0.001,0.001,0.017,0.0,0.002,0.001,...,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.001,0.0,0.001
component_8,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,...,0.001,0.0,0.0,0.0,0.0,0.002,0.002,0.0,0.0,0.0
component_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# display_topics prints the topics itself and returns nothing, so there is no value to store or echo.
display_topics(nmf_model, tfidf.get_feature_names(), 20)


Topic  0
kylo, rey, ren, snoke, character, kylo ren, finn, luke, rose, force, poe, leia, scene, great, really, resistance, did, tfa, phasma, rey kylo

Topic  1
star, star wars, wars, disney, fan, wars movie, wars fan, universe, movies, wars movies, wars universe, fans, wars fans, watch, lucas, jedi, films, disappointed, wars film, like

Topic  2
film, wars film, films, good, character, characters, reviews, jedi, film just, franchise, going, star wars, star, wars, seen, say, ve, previous, audience, think

Topic  3
movie, wars movie, good, movie just, movies, reviews, great, good movie, watch, movie movie, just, boring, watching, scenes, people, make, point, movie good, previous, things

Topic  4
johnson, rian, rian johnson, abrams, director, kennedy, kathleen, disney, kathleen kennedy, hamill, mark, character, mark hamill, did, disney rian, episode, fans, trilogy, director rian, skywalker

Topic  5
worst, worst star, worst movie, movie worst, seen, wars movie, ve seen, movie, ve, movie

In [51]:
# 20 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=20, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [53]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aa long,aback,aback wants,abaloth,abaloth telling,abandon,abandon altogether,abandon approach,abandon base,...,zucker films,état,état apparently,être,être luke,últimos,últimos jedi,über,über alles,über powered
component_1,0.002,0.002,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002,0.002,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.0,0.0,0.0,0.0,0.001,0.001,0.019,0.0,0.002,0.002,...,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.001,0.0,0.001
component_8,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
# Print the 20 highest-weighted terms for each topic of the current model
display_topics(model=nmf_model,
               feature_names=tfidf.get_feature_names(),
               no_top_words=20)


Topic  0
kylo, rey, ren, snoke, kylo ren, luke, character, finn, rose, leia, poe, force, scene, really, great, did, resistance, lightsaber, just, parents

Topic  1
star, star wars, wars, disney, fan, wars fan, wars movie, universe, movies, wars universe, wars movies, fans, wars fans, lucas, jedi, watch, films, franchise, wars film, saga

Topic  2
film, wars film, films, character, good, film just, characters, franchise, jedi, previous, audience, seen, action, lot, ve, wars films, star wars, wars, moments, film star

Topic  3
movie, wars movie, good, movie just, movie movie, great, good movie, watching, boring, movies, scenes, movie good, plot, scene, previous, humor, watch movie, movie star, point, watch

Topic  4
johnson, rian, rian johnson, abrams, kennedy, kathleen, director, kathleen kennedy, disney, hamill, disney rian, mark, trilogy, mark hamill, character, did, fans, director rian, hate rian, skywalker

Topic  5
worst, worst star, worst movie, movie worst, seen, wars movie, ve 

- This is by far the clearest set of topics I've seen up until this point. 

In [55]:
# Show the interactive topic panel inline for the current model and TF-IDF matrix
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model=nmf_model, dtm=tlj_tfidf, vectorizer=tfidf, mds="tsne")
panel

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### Trigrams
Let's try trigrams and see what happens.

In [44]:
# Trigram TF-IDF: the vocabulary holds single words plus two- and three-word phrases
tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words="english")
tlj_tfidf = tfidf.fit_transform(tlj["Reviews"].tolist())

In [45]:
# We'll start with 20 Topics to see what the results are
# A fixed random_state makes the factorization (and the topics below) reproducible across runs.
nmf_model = NMF(n_components=20, random_state=42)
doc_topic = nmf_model.fit_transform(tlj_tfidf)

In [46]:
# Topic-by-term weight table; row labels are derived from the fitted model so
# they always match the number of components.
topic_words = pd.DataFrame(nmf_model.components_.round(3),
                           index=["component_{}".format(i + 1)
                                  for i in range(nmf_model.components_.shape[0])],
                           columns=tfidf.get_feature_names())
topic_words

Unnamed: 0,aa,aa long,aa long time,aback,aback wants,aback wants applaud,abaloth,abaloth telling,abaloth telling character,abandon,...,être luke,être luke train,últimos,últimos jedi,últimos jedi star,über,über alles,über alles crowd,über powered,über powered mary
component_1,0.002,0.002,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002,0.002,0.002,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002,0.002,0.002,0.001,0.0,0.0,0.0,0.0
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.0,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0
component_8,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.02,...,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.001,0.001
component_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# display_topics prints the topics itself and returns nothing, so there is no value to store or echo.
display_topics(nmf_model, tfidf.get_feature_names(), 20)


Topic  0
kylo, rey, snoke, ren, luke, character, kylo ren, finn, rose, force, scene, leia, really, poe, just, did, great, like, didn, tfa

Topic  1
star, star wars, wars, disney, universe, movies, wars movie, star wars movie, star wars universe, wars universe, star wars movies, wars movies, fans, jedi, wars fans, star wars fans, lucas, franchise, films, saga

Topic  2
film, wars film, star wars film, films, wars, star wars, star, characters, good, character, jedi, film just, franchise, action, previous, ve, lot, scenes, moments, seen

Topic  3
movie, wars movie, star wars movie, good, movie just, scenes, just, great, movie movie, boring, watching, plot, movies, good movie, character, scene, movie good, things, humor, previous

Topic  4
new, characters, trilogy, story, original, sw, old, new characters, original trilogy, movies, episode, character, good, tlj, franchise, plot, universe, tfa, effects, new trilogy

Topic  5
corporate, assembly line, crass, assembly, rip offs, cinematic, h

## Visualize

In [49]:
# Show the interactive topic panel inline for the current model and TF-IDF matrix
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model=nmf_model, dtm=tlj_tfidf, vectorizer=tfidf, mds="tsne")
panel

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
###