# Topic Modeling
### Without Preprocessing

In [1]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

## Define Parameters

In [61]:
# Tunable settings for the whole notebook.
n_components = 10   # number of topics each model extracts
n = 20              # number of transcript files to sample
max_df = 0.5        # float => proportion: ignore bigrams found in more than 50% of documents
min_df = 0.0002     # float => proportion: ignore bigrams found in fewer than 0.02% of documents
n_terms = 10        # number of top terms to report per topic

## Load Data

In [18]:
def create_df_multi(n=25, data_dir='../data/aligned data/c=4'):
    """Build a one-column DataFrame of turn texts from randomly sampled transcripts.

    Every sampled JSON file is read as a list of records. Each record has a
    'TURNS' list, and each turn has an 'UTTERANCES' list of strings; the
    utterances of one turn are joined with single spaces into one row.

    Parameters
    ----------
    n : int, default 25
        Number of transcript files to sample. Files are drawn without
        replacement, so at most the number of files in ``data_dir`` is used.
    data_dir : str, default '../data/aligned data/c=4'
        Directory holding the transcript JSON files.

    Returns
    -------
    pandas.DataFrame
        A single column labelled ``0`` with one row per turn.
    """
    # Sort the listing so that, under a fixed `random` seed, the same files are
    # picked regardless of the order the filesystem returns them in.
    files = sorted(os.listdir(data_dir))

    # Sample without replacement: drawing the same transcript twice would
    # duplicate all of its turns and skew the term statistics.
    sampled_files = random.sample(files, k=min(max(n, 0), len(files)))

    list_of_text = []
    for filename in sampled_files:
        # Read each sampled file exactly once and close the handle promptly.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        for record in data:
            for turn in record['TURNS']:
                list_of_text.append(' '.join(turn['UTTERANCES']))

    # Naming the column explicitly keeps `df[0]` valid even when no text was found.
    return pd.DataFrame({0: list_of_text})

In [19]:
# Seed Python's RNG before sampling so the same transcripts are drawn on every
# Restart & Run All; otherwise the corpus (and every result below) changes per run.
random.seed(42)
df = create_df_multi(n)
print(df.shape)
df.head()

(45474, 1)


Unnamed: 0,0
0,[cheering]
1,"Hello, New York! [cheering] Wow! I would ask h..."
2,"Well, if that isn't a rush, I don't know what is."
3,Oh yeah. Mainline that shit.
4,"Oh man. Hello everyone, and welcome-- [cheering]"


## Vectorize Data



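- Count vectorizer (raw bigram counts)
- TF-IDF vectorizer (bigram counts reweighted by inverse document frequency)

Each model below is fit once on each of the two representations.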

In [62]:
# TF-IDF weights over bigrams of purely alphabetic tokens, English stop words removed.
# NOTE(review): the token pattern admits one-letter tokens, so contraction
# fragments survive as features (e.g. 'yes s', 'yes ve' in the output below).
tfidf = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    ngram_range=(2, 2),
    max_df=max_df,
    min_df=min_df,
)
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)

# Densify and transpose: rows become bigrams, columns become documents.
tfidf_df = pd.DataFrame(
    tfidf_sparse.toarray().T,
    index=tfidf.get_feature_names_out(),
)
tfidf_df.tail()

(45474, 1505)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45464,45465,45466,45467,45468,45469,45470,45471,45472,45473
yes s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yes thank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yes ve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yes yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
young age,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Raw counts over bigrams of purely alphabetic tokens, English stop words removed.
# Uses the same settings as the TF-IDF vectorizer so the two are comparable.
cv = CountVectorizer(
    stop_words='english',
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    ngram_range=(2, 2),
    max_df=max_df,
    min_df=min_df,
)
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)

# Densify and transpose: rows become bigrams, columns become documents.
cv_df = pd.DataFrame(
    cv_sparse.toarray().T,
    index=cv.get_feature_names_out(),
)
cv_df.tail()

(45474, 1505)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45464,45465,45466,45467,45468,45469,45470,45471,45472,45473
yes s,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yes thank,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yes ve,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yes yes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
young age,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [73]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    """Print the highest-scoring terms of each topic, one line per topic.

    Parameters
    ----------
    n_components : int
        Number of topics to report; columns ``0 .. n_components - 1`` of
        ``topics`` are read.
    topics : array-like, shape (len(terms), >= n_components)
        Score of every term (rows) for every topic (columns).
    terms : sequence of str
        Term labels aligned with the rows of ``topics``.
    n_terms : int, default 10
        How many terms to print per topic. Values <= 0 print an empty list;
        values larger than ``len(terms)`` print every term.

    Notes
    -----
    Terms are listed in ascending score order, so the strongest term of a
    topic is the last item on its line.
    """
    topics = np.asarray(topics)
    term_labels = np.asarray(terms)
    keep = max(n_terms, 0)

    for x in range(n_components):
        ranked = topics[:, x].argsort()
        # Compute the slice start explicitly: `ranked[-keep:]` with keep == 0
        # is `ranked[-0:]`, i.e. the whole array rather than nothing.
        start = max(len(ranked) - keep, 0)
        top_terms = term_labels[ranked[start:]]

        print(f'Topic {x}:\t{top_terms.tolist()}')


## PCA

In [74]:
# cv_df has one row per bigram and one column per document, so fit_transform
# returns one score per (bigram, component); print_top_terms ranks bigrams by it.
# random_state pins the solver's randomness so the topics are reproducible.
pca = PCA(n_components=n_components, random_state=42)
topics = pca.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=cv.get_feature_names_out(), n_terms=n_terms)

Topic 0:	['know don', 'didn t', 't worry', 'know s', 's going', 't think', 'm going', 't want', 't know', 'don t']
Topic 1:	['s going', 'yeah m', 'going m', 'little bit', 'going try', 'right m', 'going use', 'going cast', 'okay m', 'm going']
Topic 2:	['vox machina', 'going ahead', 'know s', 'going make', 'critical role', 's turn', 'going hit', 'yeah s', 'let s', 's going']
Topic 3:	['episode critical', 'tonight s', 'right let', 's episode', 's ahead', 'd d', 'yeah let', 'critical role', 's just', 'let s']
Topic 4:	['d amazing', 's fun', 'people s', 'thank d', 'd game', 't know', 'like oh', 'critical role', 've got', 'd d']
Topic 5:	['comes clutching', 'swats hand', 'hand away', 'hole hear', 'eventually comes', 'goes m', 'going need', 'dark brown', 'looks like', 'little bit']
Topic 6:	['know don', 's just', 'wasn t', 'know ve', 'let s', 'know m', 'doesn t', 'know s', 'didn t', 't know']
Topic 7:	['look like', 'make way', 's just', 's like', 't want', 'haven t', 'like s', 'looks like', 

In [75]:
# Same PCA as above, fit on the TF-IDF matrix (rows = bigrams, columns = documents).
# random_state pins the solver's randomness so the topics are reproducible.
pca = PCA(n_components=n_components, random_state=42)
topics = pca.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=tfidf.get_feature_names_out(), n_terms=n_terms)

Topic 0:	['oh don', 'didn t', 'm going', 't like', 'know s', 't worry', 't think', 't want', 't know', 'don t']
Topic 1:	['going make', 'going run', 'let s', 'right m', 'yeah m', 'going try', 'going use', 'going cast', 'okay m', 'm going']
Topic 2:	['didn t', 's try', 'okay let', 's ahead', 's look', 's let', 'right let', 'yeah let', 's just', 'let s']
Topic 3:	['going try', 'doesn t', 'know s', 'going hit', 'oh s', 'yeah s', 'didn t', 'oh god', 't know', 's going']
Topic 4:	['wouldn t', 'little bit', 't work', 'oh didn', 't say', 'know s', 'doesn t', 'oh god', 't know', 'didn t']
Topic 5:	['individual currently', 'oh plus', 'come grog', 'everybody s', 'm sorry', 't think', 't want', 'don t', 'god s', 'oh god']
Topic 6:	['know ve', 's just', 'oh god', 'got little', 's like', 'yeah ve', 'doesn t', 't know', 'little bit', 've got']
Topic 7:	['know don', 'haven t', 'know just', 's fine', 'know m', 'wouldn t', 'oh god', 'doesn t', 'know s', 't know']
Topic 8:	['points damage', 's little', 

## LDA

In [76]:
# cv_df has one row per bigram and one column per document, so fit_transform
# returns one weight per (bigram, topic); print_top_terms ranks bigrams by it.
# LDA starts from a random initialisation: fix random_state for reproducibility.
lda = LatentDirichletAllocation(n_components=n_components, random_state=42)
topics = lda.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=cv.get_feature_names_out(), n_terms=n_terms)

Topic 0:	['far away', 'just like', 'going make', 'yeah m', 'don t', 'wouldn t', 'know s', 't want', 's just', 't know']
Topic 1:	['look s', 'didn t', 'oh wait', 'going hit', 'going ahead', 'right ll', 'percy s', 'trinket s', 'look like', 's like']
Topic 2:	['s big', 'okay right', 'long time', 'going try', 'just going', 'make sure', 'think s', 'm sorry', 'make way', 'oh s']
Topic 3:	['s glorious', 'investigation check', 'does hit', 't say', 's probably', 'll say', 'going use', 'ends turn', 'gilmore s', 's true']
Topic 4:	['ll just', 'aren t', 't really', 'won t', 'perception check', 'okay m', 'ahead make', 'm gonna', 'vox machina', 's turn']
Topic 5:	['hunter s', 've heard', 'right m', 'right guys', 'okay s', 'like s', 'right s', 's okay', 'oh god', 'yeah s']
Topic 6:	['stealth check', 's happening', 'hasn t', 'guys make', 'yeah ll', 'grog s', 'lady kima', 't think', 's really', 's got']
Topic 7:	['m glad', 'sneak attack', 'roll damage', 'gold pieces', 's s', 'd like', 'm just', 's fine

In [77]:
# Same LDA as above, fit on the TF-IDF matrix (rows = bigrams, columns = documents).
# NOTE(review): LDA is a model of count data; TF-IDF weights are real-valued --
# confirm this variant is intended.
# LDA starts from a random initialisation: fix random_state for reproducibility.
lda = LatentDirichletAllocation(n_components=n_components, random_state=42)
topics = lda.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=tfidf.get_feature_names_out(), n_terms=n_terms)

Topic 0:	['t say', 'yeah yeah', 'going try', 'going cast', 'll say', 's pretty', 'bonus action', 'oh yeah', 'make way', 's okay']
Topic 1:	['s looking', 's probably', 'make perception', 's great', 'right guys', 'ends turn', 's s', 'okay m', 'd like', 's right']
Topic 2:	['s kind', 'sounds like', 's plus', 'm glad', 'trinket s', 's way', 'doesn t', 've seen', 'look like', 'oh god']
Topic 3:	['yes s', 'did say', 's hard', 'oh okay', 'm sure', 'won t', 'wasn t', 'm gonna', 'know s', 's just']
Topic 4:	['making way', 'll just', 'hunter s', 'bag holding', 'ahead make', 'make sure', 'okay s', 'oh shit', 't know', 's got']
Topic 5:	['good know', 'thank guys', 's better', 'end turn', 't like', 's amazing', 'pretty good', 'sneak attack', 'just like', 'yeah m']
Topic 6:	['does hit', 's lot', 'ahead roll', 'gold pieces', 'yeah s', 's fine', 'saving throw', 't want', 'points damage', 's like']
Topic 7:	['right ll', 's feet', 've heard', 'aren t', 'yeah ll', 't really', 'like s', 'wouldn t', 'right

## SVD

In [78]:
# cv_df has one row per bigram and one column per document, so fit_transform
# returns one score per (bigram, component); print_top_terms ranks bigrams by it.
# TruncatedSVD's default solver is randomized: fix random_state for reproducibility.
svd = TruncatedSVD(n_components=n_components, random_state=42)
topics = svd.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=cv.get_feature_names_out(), n_terms=n_terms)

Topic 0:	['know don', 'didn t', 't worry', 'know s', 's going', 't think', 'm going', 't want', 't know', 'don t']
Topic 1:	['yeah m', 'going m', 's going', 'little bit', 'going try', 'right m', 'going use', 'going cast', 'okay m', 'm going']
Topic 2:	['going make', 'know s', 'doesn t', 'vox machina', 'going hit', 's turn', 'critical role', 'yeah s', 'let s', 's going']
Topic 3:	['right let', 'tonight s', 'little bit', 's episode', 's ahead', 'yeah let', 'd d', 'critical role', 's just', 'let s']
Topic 4:	['doesn t', 'thank d', 'd game', 'make way', 'like oh', 'critical role', 'looks like', 've got', 'little bit', 'd d']
Topic 5:	['grumbling eventually', 'hole hear', 'hand away', 'eventually comes', 'goes m', 'going need', 'dark brown', 'make way', 'looks like', 'little bit']
Topic 6:	['know don', 'wasn t', 's just', 'know ve', 'let s', 'know m', 'doesn t', 'know s', 'didn t', 't know']
Topic 7:	['s just', 'make way', 't want', 's good', 's like', 'like s', 'haven t', 'looks like', 'di

In [79]:
# Same truncated SVD as above, fit on the TF-IDF matrix (rows = bigrams, columns = documents).
# TruncatedSVD's default solver is randomized: fix random_state for reproducibility.
svd = TruncatedSVD(n_components=n_components, random_state=42)
topics = svd.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=tfidf.get_feature_names_out(), n_terms=n_terms)

Topic 0:	['oh don', 'didn t', 't like', 'm going', 'know s', 't worry', 't think', 't want', 't know', 'don t']
Topic 1:	['going make', 'going run', 'right m', 'yeah m', 'going try', 'let s', 'going use', 'going cast', 'okay m', 'm going']
Topic 2:	['didn t', 's try', 'okay let', 's ahead', 's look', 's let', 'right let', 'yeah let', 's just', 'let s']
Topic 3:	['going hit', 'know s', 'little bit', 'doesn t', 'oh s', 'yeah s', 'oh god', 't know', 'didn t', 's going']
Topic 4:	['s like', 've got', 'little bit', 'oh didn', 't say', 'know s', 'doesn t', 'oh god', 't know', 'didn t']
Topic 5:	['oh plus', 'individual currently', 'everybody s', 'come grog', 'pretty good', 't think', 't want', 'god s', 'don t', 'oh god']
Topic 6:	['got little', 'points damage', 'yeah ve', 's just', 'looks like', 'doesn t', 's good', 's like', 'little bit', 've got']
Topic 7:	['know just', 'know ve', 'let s', 'know m', 'wouldn t', 'oh god', 'doesn t', 'know s', 've got', 't know']
Topic 8:	['yeah s', 's little

## NMF

In [80]:
# cv_df has one row per bigram and one column per document, so fit_transform
# returns one weight per (bigram, component); print_top_terms ranks bigrams by it.
# random_state fixes NMF's initialisation so the topics are reproducible.
nmf = NMF(n_components=n_components, max_iter=500, random_state=42)
topics = nmf.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=cv.get_feature_names_out(), n_terms=n_terms)



Topic 0:	['t need', 'feel like', 't really', 't don', 't like', 'think s', 't worry', 't think', 't want', 'don t']
Topic 1:	['bonus action', 'going make', 'yeah m', 'going m', 'going try', 'right m', 'going use', 'going cast', 'okay m', 'm going']
Topic 2:	['going attempt', 'going end', 'critical role', 'going ahead', 'know s', 'going make', 's turn', 'going hit', 'yeah s', 's going']
Topic 3:	['tonight s', 'episode critical', 's episode', 's let', 'right let', 's ahead', 'yeah let', 'critical role', 's just', 'let s']
Topic 4:	['ll week', 'd amazing', 's fun', 'people s', 'thank d', 'd game', 'like oh', 'critical role', 've got', 'd d']
Topic 5:	['brown glass', 'need ugh', 'falls shatters', 'hole hear', 'hand away', 'eventually comes', 'goes m', 'going need', 'dark brown', 'little bit']
Topic 6:	['know ll', 'know just', 've got', 'wasn t', 'know ve', 'know don', 'know m', 'know s', 'don t', 't know']
Topic 7:	['t really', 'points damage', 't want', 't look', 'feel like', 'keeper yenn

In [81]:
# Same NMF as above, fit on the TF-IDF matrix (rows = bigrams, columns = documents).
# NOTE(review): max_iter is 200 here but 500 for the count-based fit -- confirm
# the difference is deliberate.
# random_state fixes NMF's initialisation so the topics are reproducible.
nmf = NMF(n_components=n_components, max_iter=200, random_state=42)
topics = nmf.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics,
                terms=tfidf.get_feature_names_out(), n_terms=n_terms)



Topic 0:	['yeah don', 't mind', 't really', 'oh don', 't need', 't like', 't worry', 't think', 't want', 'don t']
Topic 1:	['going m', 'going make', 'going run', 'right m', 'yeah m', 'going try', 'going use', 'going cast', 'okay m', 'm going']
Topic 2:	['s way', 's try', 'okay let', 's ahead', 's look', 's let', 'right let', 'yeah let', 's just', 'let s']
Topic 3:	['going make', 's turn', 'going end', 'going attempt', 'going s', 'know s', 'going try', 'going hit', 'yeah s', 's going']
Topic 4:	['know ve', 'know maybe', 'know just', 'know don', 'wouldn t', 'know m', 'doesn t', 'know s', 'don t', 't know']
Topic 5:	['oh plus', 'come grog', 'pretty good', 'like oh', 'everybody s', 's got', 'haven t', 'm sorry', 'god s', 'oh god']
Topic 6:	['oh yeah', 'got good', 'okay s', 'got lot', 'doesn t', 'm gonna', 'know ve', 'got little', 'yeah ve', 've got']
Topic 7:	['ve seen', 't roll', 't tell', 't work', 'm sorry', 't want', 't think', 'oh didn', 't say', 'didn t']
Topic 8:	['got little', 'ma