# Topic Modeling
### Without Preprocessing

In [16]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

## Define Parameters

In [17]:
# Tunable settings shared by every cell below.
n_components = 25 #Number of topics (components) requested from each decomposition model
n = 20 #Number of transcript files to draw when building the corpus
# Passed as max_df to both vectorizers; scikit-learn reads a float here as a
# proportion of documents, so terms in more than half of the rows are dropped.
max_df=0.5
# Passed as min_df to both vectorizers; same float-as-proportion convention.
min_df=0.0002
# Number of top terms to list per topic (same value as print_top_terms' default).
n_terms = 10

## Load Data

In [18]:
def create_df_multi(n=25, data_dir='../data/aligned data/c=4', seed=None):
    """Build a one-column DataFrame of turn texts from randomly sampled transcripts.

    Every sampled file is parsed as JSON and is expected to contain a list of
    records, each with a ``'TURNS'`` list whose items carry an ``'UTTERANCES'``
    list of strings. The utterances of one turn are joined with single spaces
    and become one row of the result.

    Parameters
    ----------
    n : int, default 25
        Number of transcript files to load. Files are drawn *without*
        replacement so that no transcript is duplicated in the corpus; when
        ``n`` exceeds the number of files available, every file is used.
    data_dir : str, default '../data/aligned data/c=4'
        Directory holding the transcript JSON files.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` drives the sampling so
        the same files are picked on every run. When ``None`` the module-level
        ``random`` state is used.

    Returns
    -------
    pandas.DataFrame
        One row per turn; for a non-empty result the single column is
        labelled ``0``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes a seeded
    # draw select the same files on every machine.
    files = sorted(os.listdir(data_dir))
    rng = random if seed is None else random.Random(seed)
    sample_size = max(0, min(n, len(files)))
    sampled_files = rng.sample(files, sample_size)

    list_of_text = []
    for filename in sampled_files:
        # Load exactly the sampled file, and close the handle once parsed.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        for record in data:
            for turn in record['TURNS']:
                list_of_text.append(' '.join(turn['UTTERANCES']))

    return pd.DataFrame(list_of_text)

In [19]:
# Build the corpus from `n` transcript draws; each row of `df` is the joined
# utterances of one turn, held in column 0.
# The draw is random, so re-running this cell changes `df` and everything
# derived from it.
df = create_df_multi(n)
print(df.shape)
df.head()

(53756, 1)


Unnamed: 0,0
0,"Hello everybody, and welcome to tonight's epis..."
1,Now from our own studio!
2,It's so weird!
3,"I know, not a whole lot has changed at the mom..."
4,This is a crazy thing above us.


## Vectorize Data



- Count Vectorizer
- Tfidf Vectorizer

Each model below is fit twice: once on the count matrix and once on the TF-IDF matrix.

In [20]:
# Bigram TF-IDF features. The token pattern keeps only runs of ASCII letters,
# so digits are dropped and words are split at apostrophes.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    ngram_range=(2, 2),
)
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)

# Densify and flip to a terms x documents frame (one row per bigram); the
# model cells are fit on this orientation.
tfidf_df = pd.DataFrame(
    tfidf_sparse.toarray().T,
    index=tfidf.get_feature_names_out(),
)
tfidf_df.tail()

(53756, 1678)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53746,53747,53748,53749,53750,53751,53752,53753,53754,53755
yes s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yes yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
young age,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zadash decided,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zauber spire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Raw bigram counts, configured identically to the TF-IDF vectorizer so the two
# feature spaces are comparable.
cv = CountVectorizer(
    stop_words='english',
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    ngram_range=(2, 2),
)
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)

# Densify and flip to a terms x documents frame (one row per bigram); the
# model cells are fit on this orientation.
cv_df = pd.DataFrame(
    cv_sparse.toarray().T,
    index=cv.get_feature_names_out(),
)
cv_df.tail()

(53756, 1678)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53746,53747,53748,53749,53750,53751,53752,53753,53754,53755
yes s,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yes yes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
young age,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zadash decided,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zauber spire,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [22]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    """Print the highest-scoring terms of each topic, one line per topic.

    Parameters
    ----------
    n_components : int
        Number of topics to print; columns ``0 .. n_components - 1`` of
        ``topics`` are read.
    topics : numpy.ndarray
        2-D array indexed as ``topics[term, topic]``: row ``i`` holds the
        scores of ``terms[i]``.
    terms : sequence of str
        Term labels, aligned with the rows of ``topics``.
    n_terms : int, default 10
        How many terms to list per topic. Values of zero or less list none.

    Notes
    -----
    Terms within a line are ordered by ascending score, so the strongest term
    of a topic is printed last.
    """
    # Convert once instead of rebuilding the array for every topic.
    term_labels = np.asarray(terms)

    for topic in range(n_components):
        scores = topics[:, topic]
        if n_terms > 0:
            top_term_indices = scores.argsort()[-n_terms:]
        else:
            # A slice of [-0:] would select every term, so handle it explicitly.
            top_term_indices = np.array([], dtype=int)
        top_terms = term_labels[top_term_indices]

        print(f'Topic {topic}:\t{top_terms.tolist()}')


## PCA

In [23]:
# PCA on the bigram counts. cv_df has one row per term, so fit_transform
# returns a terms x components array, the layout print_top_terms indexes.
# random_state pins the randomized SVD solver so reruns on the same corpus agree.
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)

Topic 0:	['mighty nein', 'richter s', 'sutan s', 'make way', 's house', 'd like', 'look like', 'lord sutan', 'knights requital', 'high richter']
Topic 1:	['s just', 'didn t', 'high richter', 's going', 'm going', 'know s', 't think', 't want', 't know', 'don t']
Topic 2:	['bunch nerdy', 'nerdy ass', 'ass voice', 'voice actors', 'role bunch', 'critical role', 'dungeons dragons', 'oh m', 'look like', 'd d']
Topic 3:	['going start', 's going', 'going m', 'right m', 'going cast', 'going use', 'going try', 'd d', 'okay m', 'm going']
Topic 4:	['guy s', 'critical role', 'little bit', 'going end', 'going ahead', 's just', 'know s', 's turn', 'let s', 's going']
Topic 5:	['s good', 'tonight s', 'didn t', 's let', 's ahead', 'right let', 'yeah let', 'critical role', 's just', 'let s']
Topic 6:	['know don', 'looks like', 'just didn', 's just', 'wasn t', 't say', 'doesn t', 'know s', 'didn t', 't know']
Topic 7:	['wasn t', 'vex ahlia', 'looks like', 'make way', 't say', 'little bit', 't think', '

In [24]:
# PCA on the TF-IDF weights. tfidf_df has one row per term, so fit_transform
# returns a terms x components array, the layout print_top_terms indexes.
# random_state pins the randomized SVD solver so reruns on the same corpus agree.
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)

Topic 0:	['yeah don', 't like', 't worry', 't need', 'didn t', 'know s', 't think', 't want', 't know', 'don t']
Topic 1:	['going m', 'going run', 'yeah m', 'right m', 'going start', 'going cast', 'going use', 'going try', 'okay m', 'm going']
Topic 2:	['s s', 's say', 's ahead', 've got', 'okay let', 's going', 'right let', 'yeah let', 's just', 'let s']
Topic 3:	['going end', 've got', 's turn', 'doesn t', 'okay s', 'yeah s', 'didn t', 'know s', 't know', 's going']
Topic 4:	['wasn t', 't work', 'oh didn', 'know s', 't say', 'doesn t', 'oh god', 've got', 't know', 'didn t']
Topic 5:	['s right', 'got s', 'got right', 'right ve', 'okay ve', 's ve', 'oh ve', 'don t', 'yeah ve', 've got']
Topic 6:	['going try', 've got', 'points damage', 'haven t', 'oh yeah', 't think', 't want', 'don t', 'god s', 'oh god']
Topic 7:	['know ll', 'know ve', 'know don', 's just', 'know m', 've got', 'know s', 'oh god', 'doesn t', 't know']
Topic 8:	['okay s', 'points damage', 'little bit', 's just', 's lik

## LDA

In [25]:
# LDA on the bigram counts (terms as rows). LDA's variational inference is
# randomly initialised, so random_state is fixed for reproducible topics.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)

Topic 0:	['did roll', 'sutan s', 'scroll case', 's thing', 'yeah know', 't just', 's house', 'knights requital', 'going say', 'high richter']
Topic 1:	['ll try', 'second attack', 's beautiful', 'lead box', 'yasha s', 'just want', 'oh man', 'going use', 'wouldn t', 's really']
Topic 2:	['s doing', 'wait wait', 'oh right', 'm trying', 'points slashing', 's looking', 'yeah ll', 'slashing damage', 'aren t', 'oh s']
Topic 3:	['investigation check', 'just like', 'going cast', 't really', 's hard', 'yeah m', 'isn t', 'like s', 'know s', 'looks like']
Topic 4:	['oh don', 'persuasion check', 'sounds like', 'flying carpet', 'make persuasion', 'm good', 'oh m', 's coming', 'okay right', 's s']
Topic 5:	['okay just', 'm saying', 's amazing', 's head', 'episode critical', 's episode', 'going ahead', 's little', 'critical role', 't think']
Topic 6:	['okay yeah', 't mind', 'going run', 's natural', 's better', 'm doing', 'roll attack', 'oh wait', 's true', 'oh god']
Topic 7:	['know right', 't quite',

In [26]:
# LDA on the TF-IDF weights (terms as rows). random_state is fixed because LDA
# is randomly initialised.
# NOTE(review): LDA is formulated for count data; this run is kept because the
# notebook compares both vectorizers with every model.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)

Topic 0:	['think need', 'people s', 'like m', 't help', 's different', 'half damage', 'right ends', 'insight check', 'oh shit', 'haven t']
Topic 1:	['okay roll', 'attack opportunity', 'plus s', 'maybe s', 'm looking', 'weren t', 'roll attack', 'vox machina', 'aren t', 'hit points']
Topic 2:	['just got', 'll fine', 's kind', 'yeah just', 'good s', 'really good', 'd d', 'going try', 's turn', 'looks like']
Topic 3:	['s awesome', 'shouldn t', 't like', 'wait s', 's actually', 'long time', 'scanlan s', 'sneak attack', 've heard', 'right ll']
Topic 4:	['just case', 's points', 'okay good', 's coming', 'saving throws', 'oh right', 's okay', 's s', 's good', 's right']
Topic 5:	['right ahead', 'guy s', 'just little', 'd say', 's way', 'okay ll', 's little', 'oh okay', 'll just', 'wasn t']
Topic 6:	['level spell', 'bad news', 'feet away', 'just saying', 'trinket s', 's thing', 't say', 'leaky tap', 's great', 's really']
Topic 7:	['s mark', 's hit', 's fair', 'going ahead', 'm good', 'right le

## SVD

In [27]:
# Truncated SVD on the bigram counts (terms as rows). The default solver is
# randomized, so random_state is fixed for reproducible components.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)

Topic 0:	['couldn t', 's house', 'left mighty', 'mighty nein', 'lord sutan', 'd like', 'look like', 'make way', 'knights requital', 'high richter']
Topic 1:	['s just', 'd d', 'didn t', 's going', 'know s', 'm going', 't think', 't want', 't know', 'don t']
Topic 2:	['nerdy ass', 'ass voice', 'voice actors', 'role bunch', 'dungeons dragons', 'critical role', 'oh m', 'look like', 'm going', 'd d']
Topic 3:	['going say', 'going start', 's going', 'going m', 'right m', 'going cast', 'going use', 'going try', 'okay m', 'm going']
Topic 4:	['right let', 'saving throw', 'critical role', 'going ahead', 'little bit', 'know s', 's just', 's turn', 'let s', 's going']
Topic 5:	['s good', 'tonight s', 's let', 'didn t', 's ahead', 'right let', 'yeah let', 'critical role', 's just', 'let s']
Topic 6:	['know don', 'just didn', 'looks like', 's just', 'wasn t', 't say', 'doesn t', 'know s', 'didn t', 't know']
Topic 7:	['s good', 't say', 's just', 'looks like', 'make way', 't think', 'don t', 't wan

In [28]:
# Truncated SVD on the TF-IDF weights (terms as rows). The default solver is
# randomized, so random_state is fixed for reproducible components.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)

Topic 0:	['t like', 't worry', 't need', 'm going', 'didn t', 'know s', 't think', 't want', 't know', 'don t']
Topic 1:	['let s', 's going', 'yeah m', 'right m', 'going start', 'going cast', 'going use', 'going try', 'okay m', 'm going']
Topic 2:	['s say', 's s', 's ahead', 've got', 'okay let', 's going', 'right let', 'yeah let', 's just', 'let s']
Topic 3:	['oh s', 's turn', 'okay s', 'doesn t', 'yeah s', 'know s', 've got', 't know', 'didn t', 's going']
Topic 4:	['wasn t', 't work', 'oh didn', 'know s', 't say', 'doesn t', 'oh god', 've got', 't know', 'didn t']
Topic 5:	['got s', 's right', 'got right', 'right ve', 'okay ve', 's ve', 'oh ve', 'don t', 'yeah ve', 've got']
Topic 6:	['s different', 'going try', 'haven t', 'oh yeah', 'points damage', 't think', 't want', 'god s', 'don t', 'oh god']
Topic 7:	['know ll', 'know ve', 'know don', 's just', 'know m', 've got', 'know s', 'oh god', 'doesn t', 't know']
Topic 8:	['okay s', 'points damage', 's just', 's like', 'little bit', '

## NMF

In [29]:
# NMF on the bigram counts (terms as rows). random_state fixes the
# initialisation so the factorisation is reproducible; max_iter is raised to
# 500 to give the solver more room to converge.
nmf = NMF(n_components=n_components, max_iter=500, random_state=0)
topics = nmf.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)



Topic 0:	['sutan s', 'left mighty', 'mighty nein', 'couldn t', 's house', 'lord sutan', 'd like', 'look like', 'knights requital', 'high richter']
Topic 1:	['t trust', 't don', 'just don', 't need', 'think s', 't like', 't worry', 't think', 't want', 'don t']
Topic 2:	['m doing', 'nerdy ass', 'ass voice', 'okay right', 'bunch nerdy', 'guys know', 'dungeons dragons', 'oh m', 'look like', 'd d']
Topic 3:	['yeah m', 'going say', 'going start', 'right m', 'going m', 'going use', 'going cast', 'going try', 'okay m', 'm going']
Topic 4:	['going make', 'going attempt', 'going hit', 'guy s', 'going use', 'going s', 'going end', 'going ahead', 'know s', 's going']
Topic 5:	['war camp', 's sleep', 's say', 's s', 'okay let', 's let', 's ahead', 'right let', 'yeah let', 'let s']
Topic 6:	['know means', 've heard', 'know ll', 'know going', 'know ve', 'know m', 'know don', 'know s', 'don t', 't know']
Topic 7:	['m sorry', 'feel like', 'vax ildan', 't just', 'just didn', 't think', 'wasn t', 't wan

In [30]:
# NMF on the TF-IDF weights (terms as rows). random_state fixes the
# initialisation so the factorisation is reproducible.
# NOTE(review): max_iter is 200 here but 500 for the count-based run; confirm
# the difference is intentional.
nmf = NMF(n_components=n_components, max_iter=200, random_state=0)
topics = nmf.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the configured value rather than the default
)



Topic 0:	['oh don', 'think s', 'yeah don', 't mind', 't like', 't worry', 't need', 't want', 't think', 'don t']
Topic 1:	['going m', 'going run', 'yeah m', 'right m', 'going start', 'going cast', 'going use', 'going try', 'okay m', 'm going']
Topic 2:	['s try', 's s', 's let', 's sleep', 's say', 's ahead', 'okay let', 'right let', 'yeah let', 'let s']
Topic 3:	['going hit', 'going ahead', 'going use', 'going attempt', 'going try', 'going end', 'okay s', 's turn', 'know s', 's going']
Topic 4:	['just didn', 'wasn t', 't really', 't roll', 't work', 'oh didn', 't think', 't say', 't want', 'didn t']
Topic 5:	['wouldn t', 's people', 'got s', 'right ve', 'got right', 'okay ve', 's ve', 'oh ve', 'yeah ve', 've got']
Topic 6:	['going cast', 'going die', 'city s', 's coming', 'grog grog', 'say s', 's different', 'going try', 'god s', 'oh god']
Topic 7:	['know say', 'know just', 'know going', 'know ll', 'know ve', 'know don', 'know m', 'know s', 'don t', 't know']
Topic 8:	['okay yeah', 's