# Topic Modeling
### Without Preprocessing

In [1]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### Define Parameters

In [2]:
# Number of components (topics) each decomposition model is asked to produce.
n_components = 50

# Number of transcript files to draw when building the corpus.
n = 20

# Document-frequency bounds handed to both vectorizers. scikit-learn treats
# float values as a proportion of documents (ints would be absolute counts).
max_df = 0.5
min_df = 0.0002

# Number of top terms to list per topic.
n_terms = 10

## Load Data

In [3]:
def create_df_multi(n=25, data_dir='../data/aligned data/c=4', seed=None):
    """Build a one-column DataFrame of speaker turns from sampled transcript files.

    Each file in ``data_dir`` is read as JSON and is expected to be a list of
    records; every record has a ``'TURNS'`` list, and every turn has an
    ``'UTTERANCES'`` list of strings. The utterances of a turn are joined with
    single spaces and become one row of the result.

    Parameters
    ----------
    n : int, default 25
        Number of transcript files to use. Files are drawn WITHOUT replacement
        so no transcript is duplicated in the corpus; if ``n`` exceeds the
        number of files available, every file is used once.
    data_dir : str, default '../data/aligned data/c=4'
        Directory containing the transcript JSON files.
    seed : int or None, default None
        When given, sampling uses a private ``random.Random(seed)`` so the
        selection is reproducible. When ``None``, the module-level ``random``
        generator is used (and therefore honours any prior ``random.seed``).

    Returns
    -------
    pandas.DataFrame
        A single column (labelled ``0``) with one joined turn per row.
    """
    rng = random if seed is None else random.Random(seed)

    # Sort so that a given seed selects the same files regardless of the
    # arbitrary order in which the OS lists the directory.
    files = sorted(os.listdir(data_dir))
    sample_size = max(0, min(n, len(files)))
    sampled_files = rng.sample(files, k=sample_size)

    list_of_text = []
    for filename in sampled_files:
        # Load exactly the sampled file; the context manager closes the handle.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        for x in data:
            for y in x['TURNS']:
                list_of_text.append(' '.join(y['UTTERANCES']))

    return pd.DataFrame(list_of_text)

In [4]:
# Build the corpus from `n` sampled transcripts; each row is one speaker turn.
df = create_df_multi(n=n)
print(df.shape)
df.head()

(50273, 1)


Unnamed: 0,0
0,"Welcome to tonight's episode of Critical Role,..."
1,"Yeah, three really quick announcements. Number..."
2,Sure you guys heard Hector yelling at you in t...
3,Thank you.
4,All I got!


## Vectorize Data



- Count Vectorizer
- Tfidf Vectorizer

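Each model below (PCA, LDA, SVD, NMF) is run twice: once on the count-vectorized data and once on the TF-IDF-vectorized data.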

In [5]:
# TF-IDF over bigrams only: alphabetic tokens, English stop words removed,
# document-frequency bounds taken from the parameter cell.
tfidf = TfidfVectorizer(
    ngram_range=(2, 2),
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)

# Densify and flip to terms x documents so each row is labelled by its bigram.
tfidf_df = pd.DataFrame(
    tfidf_sparse.toarray().T,
    index=tfidf.get_feature_names_out(),
)
tfidf_df.tail()

(50273, 1620)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50263,50264,50265,50266,50267,50268,50269,50270,50271,50272
yes ve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yes yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zenith fires,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zenith tower,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombie giant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Raw bigram counts: alphabetic tokens, English stop words removed,
# document-frequency bounds taken from the parameter cell.
cv = CountVectorizer(
    ngram_range=(2, 2),
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)

# Densify and flip to terms x documents so each row is labelled by its bigram.
cv_df = pd.DataFrame(
    cv_sparse.toarray().T,
    index=cv.get_feature_names_out(),
)
cv_df.tail()

(50273, 1620)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50263,50264,50265,50266,50267,50268,50269,50270,50271,50272
yes ve,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yes yes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zenith fires,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zenith tower,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombie giant,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [7]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    """Print the highest-scoring terms for each of the first ``n_components`` columns.

    Parameters
    ----------
    n_components : int
        How many leading columns of ``topics`` to report.
    topics : 2-D array-like
        Score matrix indexed as ``topics[term_index, component_index]``.
    terms : sequence of str
        Term labels; ``terms[i]`` names row ``i`` of ``topics``.
    n_terms : int, default 10
        Number of terms to list per component. Zero or a negative value lists
        no terms; a value larger than the vocabulary lists every term.

    One line is printed per component: the text ``Topic <k>:``, a tab, then the
    list of selected terms in ascending score order (strongest term last).
    """
    scores = np.asarray(topics)
    labels = np.asarray(terms)
    # Clamp so that 0 means "no terms": a plain [-0:] slice would select ALL of them.
    wanted = max(int(n_terms), 0)

    for topic in range(n_components):
        order = scores[:, topic].argsort()
        first_kept = len(order) - min(wanted, len(order))
        top_term_indices = order[first_kept:]
        top_terms = labels[top_term_indices]

        print(f'Topic {topic}:\t{top_terms.tolist()}')


## PCA

In [8]:
# PCA on the count matrix. cv_df has one row per bigram, so each row of `topics`
# holds that bigram's score on every component.
# random_state makes the run repeatable when PCA's 'auto' solver picks the
# randomized SVD (it may for inputs this large).
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)

Topic 0:	['didn t', 'know m', 'know don', 's going', 'know s', 't think', 't want', 'm going', 't know', 'don t']
Topic 1:	['bonus action', 'yeah m', 'let s', 'right m', 'going cast', 'going try', 'okay m', 'going use', 's going', 'm going']
Topic 2:	['right m', 'going cast', 'going try', 'information vecna', 'fortress sun', 'vex ahlia', 'vox machina', 'okay m', 'going use', 'm going']
Topic 3:	['end turn', 'going use', 'yeah s', 'know s', 'let s', 'turn s', 'guy s', 'going ahead', 's turn', 's going']
Topic 4:	['tonight s', 'vox machina', 'castle whitestone', 'episode critical', 's episode', 'right let', 'yeah let', 's just', 'critical role', 'let s']
Topic 5:	['percy s', 'couldn t', 's s', 'wasn t', 'know m', 's good', 'know s', 'doesn t', 'didn t', 't know']
Topic 6:	['make way', 's good', 't matter', 's like', 'like s', 'looks like', 've seen', 't know', 'little bit', 'doesn t']
Topic 7:	['t say', 'like s', 's like', 'make way', 'don t', 'looks like', 't want', 'doesn t', 'little b

In [9]:
# PCA on the TF-IDF matrix. tfidf_df has one row per bigram, so each row of
# `topics` holds that bigram's score on every component.
# random_state makes the run repeatable when PCA's 'auto' solver picks the
# randomized SVD (it may for inputs this large).
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)

Topic 0:	['didn t', 'know don', 'yeah don', 't worry', 'know s', 'm going', 't want', 't think', 't know', 'don t']
Topic 1:	['going shoot', 'going start', 'going cast', 'going run', 'yeah m', 'right m', 'going try', 'going use', 'okay m', 'm going']
Topic 2:	['s try', 's talk', 's s', 's good', 'okay let', 's going', 's just', 'right let', 'yeah let', 'let s']
Topic 3:	['s s', 'okay s', 'right s', 'oh s', 'going hit', 'going ahead', 'know s', 's turn', 'yeah s', 's going']
Topic 4:	['just went', 'god damn', 's cool', 's good', 't know', 'god m', 'didn t', 'doesn t', 'god s', 'oh god']
Topic 5:	['t happen', 't realize', 't roll', 'oh didn', 't work', 'know s', 't say', 'doesn t', 't know', 'didn t']
Topic 6:	['s fine', 'good idea', 's right', 'okay s', 't know', 'good s', 'doesn t', 'yeah s', 'oh s', 's good']
Topic 7:	['oh god', 's ve', 't worry', 't think', 'little bit', 't want', 'yeah ve', 'don t', 'didn t', 've got']
Topic 8:	['know don', 'know ve', 't appear', 'yeah ve', 'know m'

## LDA

In [10]:
# LDA on the count matrix. cv_df has one row per bigram, so each row of `topics`
# is that bigram's weight on every topic.
# LDA's variational inference starts from a random initialisation; fixing
# random_state makes the printed topics reproducible across runs.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)

Topic 0:	['s people', 'll leave', 't let', 't mean', 'make athletics', 'right going', 'just know', 'right left', 's thing', 'okay yeah']
Topic 1:	['door s', 'percival s', 'god s', 'going end', 'feet away', 'radiant damage', 'make investigation', 'investigation check', 'okay m', 'isn t']
Topic 2:	['going s', 'yeah ve', 'm good', 'yeah don', 'make constitution', 'mighty nein', 'making way', 'roll damage', 'saving throw', 's like']
Topic 3:	['slayer s', 's guy', 'points psychic', 've got', 'oh sorry', 't remember', 'does look', 'yes s', 'way s', 'think s']
Topic 4:	['sorry s', 's room', 't matter', 's crest', 'winter s', 'sure s', 'oh okay', 's happening', 's big', 's probably']
Topic 5:	['time ve', 'just case', 's best', 'oh really', 'm like', 'going look', 'plus points', 'j mon', 'damage s', 'going cast']
Topic 6:	['right scanlan', 'want use', 'want stay', 'm trying', 'going stay', 'mean s', 'make strength', 's natural', 's points', 'okay ll']
Topic 7:	['inside s', 'mm hmm', 's fun', 'g

In [11]:
# LDA on the TF-IDF matrix. tfidf_df has one row per bigram, so each row of
# `topics` is that bigram's weight on every topic.
# LDA's variational inference starts from a random initialisation; fixing
# random_state makes the printed topics reproducible across runs.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)

Topic 0:	['make deception', 've used', 'spell attack', 'hit point', 'stealth check', 's awesome', 's gone', 'gold pieces', 'right m', 'oh shit']
Topic 1:	['ve seen', 'hit dice', 'goes right', 'looks shoulder', 'oh really', 'plus seven', 'long time', 'right right', 'making way', 'ahead make']
Topic 2:	['make stealth', 'm getting', 'righteous brand', 'just second', 'oh fuck', 'guys s', 'just got', 's weird', 'sun tree', 'going run']
Topic 3:	['sure ll', 'start walking', 'll hit', 'm happy', 'probably going', 'good job', 't s', 's currently', 'right let', 'scanlan s']
Topic 4:	['damage plus', 'percy s', 's just', 'm like', 's beautiful', 'know just', 'oh okay', 'oh wait', 'll just', 'won t']
Topic 5:	['does appear', 'll stay', 'just case', 'good know', 'yeah think', 'real fast', 's long', 'right ll', 't really', 'd d']
Topic 6:	['ll look', 'jester s', 'look forward', 'ain t', 'okay let', 'keyleth s', 'yeah don', 'right guys', 's plus', 's turn']
Topic 7:	['arcana check', 'beau s', 't come

## SVD

In [12]:
# Truncated SVD on the count matrix. cv_df has one row per bigram, so each row
# of `topics` holds that bigram's score on every component.
# The default solver is randomized; fixing random_state makes the printed
# topics reproducible across runs.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)

Topic 0:	['didn t', 'know m', 'know don', 'know s', 's going', 't think', 't want', 'm going', 't know', 'don t']
Topic 1:	['vecna s', 'throne room', 'welcome left', 'physical form', 'hello welcome', 'machina having', 'fortress sun', 'information vecna', 'vex ahlia', 'vox machina']
Topic 2:	['bonus action', 'going run', 's going', 'yeah m', 'right m', 'going cast', 'going try', 'okay m', 'going use', 'm going']
Topic 3:	['going use', 'end turn', 'know s', 'yeah s', 'turn s', 'guy s', 'let s', 'going ahead', 's turn', 's going']
Topic 4:	['make way', 's good', 'castle whitestone', 'episode critical', 's episode', 'right let', 'yeah let', 's just', 'critical role', 'let s']
Topic 5:	['t say', 'percy s', 's s', 'wasn t', 'know m', 's good', 'know s', 'doesn t', 'didn t', 't know']
Topic 6:	['vox machina', 's just', 'didn t', 've seen', 'make way', 'like s', 's like', 'looks like', 'little bit', 'doesn t']
Topic 7:	['ve got', 's like', 'd d', 'make way', 't say', 't think', 'little bit', '

In [13]:
# Truncated SVD on the TF-IDF matrix. tfidf_df has one row per bigram, so each
# row of `topics` holds that bigram's score on every component.
# The default solver is randomized; fixing random_state makes the printed
# topics reproducible across runs.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)

Topic 0:	['know don', 'didn t', 'yeah don', 't worry', 'know s', 'm going', 't want', 't think', 't know', 'don t']
Topic 1:	['going shoot', 'going start', 'going cast', 'going run', 'yeah m', 'right m', 'going try', 'going use', 'okay m', 'm going']
Topic 2:	['s try', 's talk', 's s', 'okay let', 's good', 's going', 's just', 'right let', 'yeah let', 'let s']
Topic 3:	['oh god', 'going hit', 'okay s', 'right s', 'going ahead', 'oh s', 'know s', 's turn', 'yeah s', 's going']
Topic 4:	['yeah s', 's cool', 's like', 's good', 'god m', 'doesn t', 't know', 'god s', 'didn t', 'oh god']
Topic 5:	['t roll', 'oh didn', 's like', 's good', 't work', 'know s', 't say', 'doesn t', 't know', 'didn t']
Topic 6:	['good idea', 's like', 'okay s', 's right', 'good s', 've got', 'doesn t', 'yeah s', 'oh s', 's good']
Topic 7:	['oh ve', 's ve', 't worry', 't think', 'little bit', 't want', 'yeah ve', 'don t', 'didn t', 've got']
Topic 8:	['know don', 'know ve', 't appear', 'yeah ve', 'know m', 't mat

## NMF

In [14]:
# NMF on the count matrix. cv_df has one row per bigram, so each row of `topics`
# is that bigram's weight on every component.
# max_iter is raised above scikit-learn's default of 200 for this input.
# random_state fixes the initialisation so the printed topics are reproducible.
nmf = NMF(n_components=n_components, max_iter=500, random_state=0)
topics = nmf.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)



Topic 0:	['t look', 'oh don', 't really', 't remember', 't like', 't need', 'know don', 'yeah don', 't worry', 'don t']
Topic 1:	['raven queen', 'throne room', 'vecna s', 'welcome left', 'physical form', 'hello welcome', 'fortress sun', 'information vecna', 'vex ahlia', 'vox machina']
Topic 2:	['going m', 'going shoot', 'going run', 'yeah m', 'right m', 'going cast', 'going use', 'going try', 'okay m', 'm going']
Topic 3:	['turn s', 'going hit', 'going use', 'end turn', 'going attempt', 'going end', 'going make', 'guy s', 'going ahead', 's going']
Topic 4:	['castle whitestone', 's make', 's try', 's ahead', 's talk', 's let', 'okay let', 'right let', 'yeah let', 'let s']
Topic 5:	['know going', 'know want', 'know ll', 'know ve', 'know did', 'know just', 'know don', 'know m', 'don t', 't know']
Topic 6:	['make sure', 't say', 't make', 't look', 't work', 't appear', 't hit', 't like', 't matter', 'doesn t']
Topic 7:	['t add', 't happen', 't realize', 't just', 't roll', 't work', 'oh d

In [15]:
# NMF on the TF-IDF matrix. tfidf_df has one row per bigram, so each row of
# `topics` is that bigram's weight on every component.
# NOTE(review): max_iter=200 here (scikit-learn's default) versus 500 for the
# count-matrix run; confirm the difference is intentional.
# random_state fixes the initialisation so the printed topics are reproducible.
nmf = NMF(n_components=n_components, max_iter=200, random_state=0)
topics = nmf.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the value set in the parameter cell
)



Topic 0:	['oh don', 't look', 't just', 't say', 't remember', 't like', 't need', 'yeah don', 't worry', 'don t']
Topic 1:	['going shoot', 'going start', 'yeah m', 'going cast', 'going run', 'right m', 'going use', 'going try', 'okay m', 'm going']
Topic 2:	['critical role', 's make', 's ahead', 's let', 's try', 's talk', 'okay let', 'right let', 'yeah let', 'let s']
Topic 3:	['going make', 'guy s', 'end turn', 'going use', 'going try', 'going run', 'going end', 'going hit', 'going ahead', 's going']
Topic 4:	['new york', 'going die', 's coming', 'turn undead', 'just went', 'god damn', 's cool', 'god m', 'god s', 'oh god']
Topic 5:	['t just', 't ask', 't realize', 't happen', 't make', 'oh didn', 't roll', 't work', 't say', 'didn t']
Topic 6:	['fine s', 'laughs s', 'hey s', 'ooh s', 'good know', 'good thing', 'good question', 'good idea', 'good s', 's good']
Topic 7:	['think ve', 'got lot', 'short rest', 'okay ve', 'oh ve', 't worry', 'know ve', 's ve', 'yeah ve', 've got']
Topic 8: