# Topic Modeling
### Without Preprocessing

In [1]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### Define Parameters

In [2]:
# --- Model settings ---
n_components = 50   # number of topics/components requested from every model
n_terms = 10        # top terms to list per topic (print_top_terms defaults to the same value)

# --- Corpus settings ---
n = 20              # number of transcript files to sample when loading data

# --- Vocabulary pruning, handed to both vectorizers ---
# Given as floats, scikit-learn reads these as proportions of documents:
# terms above max_df or below min_df are dropped.
min_df = 0.0001
max_df = 0.01

## Load Data

In [3]:
def create_df_multi(n=25, data_dir='data/aligned data/c=4', seed=None):
    """Build a one-column DataFrame of turn texts from randomly sampled transcripts.

    Every sampled file is parsed as JSON and iterated as a sequence of records;
    each record's ``'TURNS'`` entries contribute one row, formed by joining that
    turn's ``'UTTERANCES'`` strings with single spaces.

    Parameters
    ----------
    n : int, default 25
        Number of transcript files to sample. Files are drawn *without*
        replacement so no transcript is counted twice; if the directory holds
        fewer than ``n`` files, all of them are used.
    data_dir : str, default 'data/aligned data/c=4'
        Directory containing the transcript JSON files.
    seed : int or None, default None
        When given, the sample is drawn from a private ``random.Random(seed)``
        so the same files are selected on every run. When ``None`` the global
        ``random`` module state is used.

    Returns
    -------
    pandas.DataFrame
        One row per turn, text in column ``0``. If no turns are found the
        frame is empty and has no columns.
    """
    rng = random.Random(seed) if seed is not None else random

    # Sort the listing so a fixed seed picks the same files regardless of the
    # order the OS returns them in, and skip sub-directories (for example
    # notebook checkpoint folders), which cannot be opened as transcripts.
    files = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, name))
    )
    sampled_files = rng.sample(files, k=min(n, len(files)))

    list_of_text = []
    for filename in sampled_files:
        # Load exactly the sampled file (the previous version parsed it and then
        # replaced it with a second, unrelated random file) and close the handle.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        for record in data:
            for turn in record['TURNS']:
                list_of_text.append(' '.join(turn['UTTERANCES']))

    return pd.DataFrame(list_of_text)

In [4]:
# Sample the configured number of transcripts; each row is the text of one turn.
df = create_df_multi(n=n)
print(df.shape)
df.head(5)

(49827, 1)


Unnamed: 0,0
0,"Hello and welcome to Critical Role, the game w..."
1,A lot of classic rock!
2,"We had a new comic come up this week, didn't we?"
3,We have a new comic!
4,New comic!


## Vectorize Data



- Count Vectorizer
- Tfidf Vectorizer

Each model below is fit twice: once on the count matrix and once on the TF-IDF matrix.

In [5]:
# TF-IDF features over the turn texts: tokens are runs of ASCII letters only,
# English stop words are removed, and the document-frequency bounds come from
# the parameters cell.
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)

# Densify and flip to a terms x documents layout, labelling rows by term.
tfidf_df = pd.DataFrame(
    tfidf_sparse.toarray().T,
    index=tfidf.get_feature_names_out(),
)
tfidf_df.tail()

(49827, 5582)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49817,49818,49819,49820,49821,49822,49823,49824,49825,49826
zephra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ziggurat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zualla,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Raw term counts built with the same tokenisation, stop-word list and
# document-frequency bounds as the TF-IDF vectorizer.
cv = CountVectorizer(
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)

# Densify and flip to a terms x documents layout, labelling rows by term.
cv_df = pd.DataFrame(
    cv_sparse.toarray().T,
    index=cv.get_feature_names_out(),
)
cv_df.tail()

(49827, 5582)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49817,49818,49819,49820,49821,49822,49823,49824,49825,49826
zephra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ziggurat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zualla,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [7]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    """Print the highest-weighted terms of each topic, one line per topic.

    Parameters
    ----------
    n_components : int
        How many topic columns to report (columns ``0 .. n_components - 1``).
    topics : 2-D array
        Indexed as ``topics[:, k]``; row ``i`` holds the weights of ``terms[i]``.
    terms : sequence of str
        Vocabulary, aligned with the rows of ``topics``.
    n_terms : int, default 10
        Number of terms to show per topic.

    Notes
    -----
    Terms are listed in ascending weight order, so the strongest term of a
    topic is the last one on its line.
    """
    vocabulary = np.array(terms)
    for topic_idx in range(n_components):
        weights = topics[:, topic_idx]
        ascending = weights.argsort()
        strongest = vocabulary[ascending[-n_terms:]]
        print(f'Topic {topic_idx}:\t{strongest.tolist()}')


## PCA

In [8]:
# PCA on the term-by-document count matrix. Rows of cv_df are terms, so
# fit_transform returns one row of component scores per term.
# random_state is fixed so that, when PCA uses a randomized solver (the default
# 'auto' setting may select one), a rerun lists the same terms.
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(cv_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['party', 'storm', 'watch', 'form', 'begins', 'loss', 'city', 'open', 'begin', 'eyes']
Topic 1:	['open', 'instead', 'muted', 'thunder', 'voice', 'flash', 'ship', 'eyes', 'storm', 'loss']
Topic 2:	['new', 'held', 'members', 'chamber', 'final', 'lorenzo', 'fjord', 'jester', 'friends', 'form']
Topic 3:	['began', 'storm', 'family', 'town', 'loss', 'vox', 'machina', 'briarwoods', 'whitestone', 'party']
Topic 4:	['sound', 'flash', 'vasselheim', 'giant', 'didn', 'familiar', 'flaming', 'temple', 'city', 'watch']
Topic 5:	['tonight', 'break', 'remember', 'role', 'critical', 'week', 'great', 'plans', 'thank', 'watch']
Topic 6:	['strike', 'yenk', 'forward', 'vax', 'vorugal', 'ground', 'body', 'party', 'saving', 'throw']
Topic 7:	['pulled', 'day', 'hear', 'sort', 'left', 'guild', 'large', 'tunnel', 'party', 'small']
Topic 8:	['half', 'great', 'lionel', 'tree', 'meat', 'left', 'plans', 'man', 'saving', 'throw']
Topic 9:	['stone', 'party', 'pretty', 'times', 'past', 'remember', 'thank', 'gr

In [9]:
# PCA on the term-by-document TF-IDF matrix. Rows of tfidf_df are terms, so
# fit_transform returns one row of component scores per term.
# random_state is fixed so that, when PCA uses a randomized solver (the default
# 'auto' setting may select one), a rerun lists the same terms.
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(tfidf_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['die', 'cool', 'rolled', 'natural', 'didn', 'fuck', 'fucking', 'damn', 'thank', 'god']
Topic 1:	['awesome', 'didn', 'mean', 'better', 'keyleth', 'cool', 'fuck', 'advantage', 'great', 'thank']
Topic 2:	['hold', 'cheering', 'man', 'didn', 'fucking', 'mean', 'advantage', 'fine', 'natural', 'fuck']
Topic 3:	['disadvantage', 'perception', 'seven', 'saving', 'throw', 'fine', 'cheering', 'advantage', 'rolled', 'natural']
Topic 4:	['didn', 'keyleth', 'dexterity', 'perception', 'constitution', 'great', 'saving', 'throw', 'fine', 'advantage']
Topic 5:	['long', 'seven', 'natural', 'fucking', 'yep', 'tell', 'didn', 'mean', 'great', 'fine']
Topic 6:	['keyleth', 'didn', 'pretty', 'mean', 'constitution', 'yep', 'cool', 'saving', 'throw', 'great']
Topic 7:	['idea', 'didn', 'hold', 'fucking', 'mean', 'pretty', 'advantage', 'cool', 'perception', 'great']
Topic 8:	['didn', 'dexterity', 'vax', 'hold', 'constitution', 'keyleth', 'saving', 'throw', 'yep', 'perception']
Topic 9:	['correct', 'pretty

## LDA

In [10]:
# LDA on the term-by-document count matrix. Rows of cv_df are terms, so
# fit_transform returns one row of topic weights per term.
# LDA starts from a random initialisation; random_state makes reruns repeatable.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(cv_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['vanish', 'liam', 'careful', 'hole', 'raishan', 'reason', 'touch', 'rush', 'running', 'shot']
Topic 1:	['tv', 'walked', 'style', 'thousand', 'meal', 'charisma', 'slashing', 'control', 'poison', 'wind']
Topic 2:	['deck', 'bite', 'kerrion', 'sir', 'free', 'guard', 'soon', 'floor', 'correct', 'hey']
Topic 3:	['swings', 'closed', 'higher', 'gnome', 'hi', 'line', 'walls', 'strong', 'save', 'water']
Topic 4:	['catches', 'specifically', 'usually', 'experience', 'covered', 'sky', 'additional', 'piece', 'singing', 'able']
Topic 5:	['quietly', 'unless', 'fighting', 'prone', 'travel', 'word', 'shoot', 'gone', 'natural', 'tell']
Topic 6:	['multiple', 'dimension', 'solid', 'type', 'combat', 'knows', 'rage', 'question', 'mist', 'vex']
Topic 7:	['forgot', 'pouring', 'camp', 'radiant', 'twice', 'melee', 'snow', 'sleep', 'game', 'talk']
Topic 8:	['difficult', 'scream', 'send', 'terrible', 'manage', 'vorugal', 'better', 'new', 'air', 'wall']
Topic 9:	['heart', 'jarett', 'daggers', 'hot', 'open

In [11]:
# LDA on the term-by-document TF-IDF matrix. Rows of tfidf_df are terms, so
# fit_transform returns one row of topic weights per term.
# LDA starts from a random initialisation; random_state makes reruns repeatable.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(tfidf_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['plank', 'appreciate', 'care', 'oil', 'understand', 'fail', 'huh', 'dex', 'course', 'hey']
Topic 1:	['finally', 'run', 'trace', 'jarett', 'ask', 'war', 'checks', 'deception', 'dc', 'hi']
Topic 2:	['jump', 'gotcha', 'cards', 'smack', 'cool', 'succeed', 'goddamn', 'piercing', 'legendary', 'correct']
Topic 3:	['survival', 'unconscious', 'dodge', 'vial', 'food', 'baby', 'son', 'dice', 'sighs', 'molly']
Topic 4:	['listening', 'possible', 'normal', 'kash', 'thinking', 'beau', 'y', 'gave', 'kill', 'trinket']
Topic 5:	['owl', 'vampire', 'asking', 'surprise', 'won', 'rage', 'gonna', 'melee', 'inspiration', 'send']
Topic 6:	['sheet', 'bottle', 'figured', 'growls', 'beard', 'invisibility', 'traps', 'totally', 'charisma', 'whoa']
Topic 7:	['anybody', 'resist', 'died', 'guiding', 'orly', 'dude', 'gone', 'trammel', 'alive', 'die']
Topic 8:	['loot', 'fan', 'guns', 'backing', 'assuming', 'awake', 'ashley', 'intelligence', 'thank', 'everybody']
Topic 9:	['vanishes', 'agree', 'handle', 'skelet

## SVD

In [12]:
# Truncated SVD on the term-by-document count matrix. Rows of cv_df are terms,
# so fit_transform returns one row of component scores per term.
# The default TruncatedSVD algorithm is randomized; random_state makes reruns repeatable.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(cv_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['face', 'begins', 'watch', 'left', 'party', 'form', 'open', 'city', 'begin', 'eyes']
Topic 1:	['instead', 'muted', 'thunder', 'voice', 'open', 'flash', 'ship', 'eyes', 'storm', 'loss']
Topic 2:	['new', 'held', 'members', 'chamber', 'final', 'lorenzo', 'fjord', 'jester', 'friends', 'form']
Topic 3:	['began', 'storm', 'family', 'town', 'loss', 'vox', 'machina', 'briarwoods', 'whitestone', 'party']
Topic 4:	['vasselheim', 'sound', 'didn', 'party', 'giant', 'flaming', 'familiar', 'temple', 'city', 'watch']
Topic 5:	['flaming', 'temple', 'city', 'role', 'critical', 'great', 'week', 'plans', 'thank', 'watch']
Topic 6:	['strike', 'yenk', 'forward', 'vax', 'vorugal', 'ground', 'body', 'party', 'saving', 'throw']
Topic 7:	['pulled', 'day', 'hear', 'sort', 'left', 'guild', 'large', 'tunnel', 'party', 'small']
Topic 8:	['small', 'tree', 'remember', 'thank', 'great', 'left', 'plans', 'man', 'saving', 'throw']
Topic 9:	['stone', 'pretty', 'times', 'party', 'past', 'remember', 'thank', 'gr

In [13]:
# Truncated SVD on the term-by-document TF-IDF matrix. Rows of tfidf_df are
# terms, so fit_transform returns one row of component scores per term.
# The default TruncatedSVD algorithm is randomized; random_state makes reruns repeatable.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(tfidf_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['great', 'cool', 'rolled', 'didn', 'natural', 'fuck', 'fucking', 'damn', 'thank', 'god']
Topic 1:	['mean', 'didn', 'keyleth', 'fine', 'cool', 'natural', 'great', 'advantage', 'fuck', 'thank']
Topic 2:	['hold', 'saving', 'throw', 'didn', 'fucking', 'mean', 'advantage', 'fine', 'natural', 'fuck']
Topic 3:	['disadvantage', 'perception', 'seven', 'saving', 'throw', 'fine', 'cheering', 'advantage', 'rolled', 'natural']
Topic 4:	['dexterity', 'constitution', 'perception', 'keyleth', 'didn', 'great', 'saving', 'throw', 'fine', 'advantage']
Topic 5:	['long', 'seven', 'natural', 'fucking', 'yep', 'tell', 'didn', 'mean', 'great', 'fine']
Topic 6:	['raging', 'saves', 'persuasion', 'attacks', 'god', 'fuck', 'thank', 'perception', 'fine', 'advantage']
Topic 7:	['stealth', 'fucking', 'hold', 'didn', 'mean', 'pretty', 'cool', 'advantage', 'great', 'perception']
Topic 8:	['vex', 'constitution', 'vax', 'hold', 'didn', 'keyleth', 'saving', 'throw', 'yep', 'perception']
Topic 9:	['seven', 'didn

## NMF

In [14]:
# NMF on the term-by-document count matrix. Rows of cv_df are terms, so
# fit_transform returns one row of non-negative topic weights per term.
# random_state fixes the randomized steps of NMF so reruns are repeatable.
nmf = NMF(n_components=n_components, max_iter=500, random_state=0)
topics = nmf.fit_transform(cv_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)



Topic 0:	['beautiful', 'interior', 'chamber', 'steps', 'staircase', 'guards', 'dark', 'wood', 'closer', 'seen']
Topic 1:	['thunder', 'open', 'instead', 'voice', 'bring', 'flash', 'eyes', 'ship', 'storm', 'loss']
Topic 2:	['new', 'members', 'final', 'chamber', 'held', 'lorenzo', 'fjord', 'jester', 'form', 'friends']
Topic 3:	['took', 'family', 'briarwood', 'briarwoods', 'years', 'vox', 'machina', 'town', 'whitestone', 'city']
Topic 4:	['sound', 'gets', 'center', 'giant', 'begins', 'flaming', 'familiar', 'temple', 'city', 'watch']
Topic 5:	['went', 'talk', 'past', 'fucking', 'terribly', 'times', 'week', 'remember', 'great', 'plans']
Topic 6:	['titan', 'arm', 'armor', 'impact', 'beginning', 'smoke', 'vecna', 'energy', 'form', 'body']
Topic 7:	['contract', 'rakshasa', 'given', 'small', 'sort', 'essentially', 'guild', 'tunnel', 'city', 'party']
Topic 8:	['grass', 'dome', 'built', 'definitely', 'city', 'white', 'sitting', 'erathis', 'man', 'small']
Topic 9:	['information', 'immediately', 'de

In [15]:
# NMF on the term-by-document TF-IDF matrix. Rows of tfidf_df are terms, so
# fit_transform returns one row of non-negative topic weights per term.
# random_state fixes the randomized steps of NMF so reruns are repeatable.
# NOTE(review): max_iter is 200 here but 500 for the count-matrix NMF run;
# confirm the difference is intentional.
nmf = NMF(n_components=n_components, max_iter=200, random_state=0)
topics = nmf.fit_transform(tfidf_df)
# Pass the configured n_terms explicitly rather than relying on the function default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)



Topic 0:	['hope', 'love', 'dear', 'blessing', 'hey', 'amazing', 'war', 'die', 'damn', 'god']
Topic 1:	['matt', 'week', 'wonderful', 'watching', 'coming', 'amazing', 'love', 'awesome', 'sam', 'thank']
Topic 2:	['wow', 'getting', 'cheering', 'sharpshooter', 'gil', 'happened', 'sake', 'shut', 'holy', 'fuck']
Topic 3:	['strength', 'concentration', 'tongue', 'row', 'shot', 'minus', 'save', 'shouting', 'cheering', 'natural']
Topic 4:	['reckless', 'gets', 'athletics', 'raging', 'saves', 'persuasion', 'wisdom', 'throws', 'attacks', 'advantage']
Topic 5:	['die', 'isn', 'hour', 'wanted', 'hey', 'gone', 'leave', 'shot', 'worry', 'fine']
Topic 6:	['reckless', 'thanks', 'awesome', 'remember', 'talk', 'stuff', 'master', 'idea', 'weapon', 'great']
Topic 7:	['talking', 'welcome', 'caleb', 'old', 'wow', 'awesome', 'meat', 'love', 'hey', 'man']
Topic 8:	['beau', 'terrible', 'add', 'investigation', 'everybody', 'high', 'passive', 'checks', 'frumpkin', 'perception']
Topic 9:	['fast', 'follow', 'best', 'ni