# Topic Modeling
### Without Preprocessing

In [1]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### Define Parameters

In [2]:
# Tunable settings read by the cells below.
n_components = 10  # topics to generate
n = 20  # transcripts to use
max_df = 0.01  # forwarded to both vectorizers as max_df
min_df = 0.0001  # forwarded to both vectorizers as min_df
n_terms = 10  # intended number of top terms to list per topic

## Load Data

In [3]:
def create_df_multi(n=25, data_dir='data/aligned data/c=4'):
    """Build a one-column DataFrame of turn texts from randomly drawn transcript files.

    Parameters
    ----------
    n : int, default 25
        Number of files to draw from ``data_dir``. The draw uses
        ``random.choices``, i.e. sampling *with replacement*, so the same
        file may be loaded more than once.
    data_dir : str, default 'data/aligned data/c=4'
        Directory containing the JSON transcript files.

    Returns
    -------
    pandas.DataFrame
        One row per turn; the single column (labelled ``0``) holds the
        turn's ``UTTERANCES`` joined with single spaces.

    Notes
    -----
    Each JSON file is expected to be a list of objects with a ``TURNS`` list,
    whose items each carry an ``UTTERANCES`` list of strings. Errors from
    ``os.listdir``, ``open`` and ``json.load`` are propagated unchanged.
    """
    # Sorted so that, for a given RNG state, the draw does not depend on the
    # arbitrary order in which the OS lists the directory.
    files = sorted(os.listdir(data_dir))
    sampled_files = random.choices(files, k=n)

    list_of_text = []
    for filename in sampled_files:
        # Load exactly the sampled file (previously it was parsed and then
        # replaced by a second, unrelated random pick), and close the handle.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        for record in data:
            for turn in record['TURNS']:
                list_of_text.append(' '.join(turn['UTTERANCES']))

    return pd.DataFrame(list_of_text)

In [4]:
# Load the sampled transcripts; column 0 of df holds one turn's text per row.
df = create_df_multi(n)
print(df.shape)
# Last expression of the cell, so the notebook renders the first rows.
df.head()

(51561, 1)


Unnamed: 0,0
0,"Hello everyone, and welcome to tonight's episo..."
1,Put the stank down! Loot Crate!
2,Once again--
3,(airhorn sounds) (laughter)
4,"That's wonderful. So yeah, so those of you guy..."


## Vectorize Data



- Count Vectorizer
- TF-IDF Vectorizer

Each model below (PCA, LDA, SVD, NMF) is fit on both representations.

In [5]:
# TF-IDF weighting of the turn texts in df[0]. English stop words are removed
# and a token is a run of ASCII letters only (see token_pattern).
# An earlier variant of this cell used max_df=.7 and min_df=2.

tfidf = TfidfVectorizer(stop_words='english', 
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b'
    )
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)
# Densify and transpose: rows are vocabulary terms, columns are documents.
# NOTE(review): .toarray() materialises the full dense matrix in memory.
tfidf_df = pd.DataFrame(tfidf_sparse.toarray().transpose(),
                   index=tfidf.get_feature_names_out())
tfidf_df.tail()

(51561, 5404)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51551,51552,51553,51554,51555,51556,51557,51558,51559,51560
zephra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ziggurat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Raw term counts of the turn texts in df[0], using the same stop-word list,
# document-frequency limits and token pattern as the TF-IDF vectorizer cell.
cv = CountVectorizer(stop_words='english', 
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b'
    )
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)
# Densify and transpose: rows are vocabulary terms, columns are documents.
# NOTE(review): .toarray() materialises the full dense matrix in memory.
cv_df = pd.DataFrame(cv_sparse.toarray().transpose(),
                   index=cv.get_feature_names_out())
cv_df.tail()

(51561, 5404)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51551,51552,51553,51554,51555,51556,51557,51558,51559,51560
zephra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ziggurat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombies,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [7]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    """Print the highest-scoring terms of each topic, one line per topic.

    Parameters
    ----------
    n_components : int
        Number of topics to print; columns ``0 .. n_components - 1`` of
        ``topics`` are read.
    topics : numpy.ndarray
        2-D array indexed as ``topics[:, k]``; row ``i`` is the score of
        ``terms[i]`` for topic ``k``.
    terms : sequence of str
        Vocabulary, aligned with the rows of ``topics``.
    n_terms : int, default 10
        How many terms to list per topic. Values larger than the vocabulary
        list every term; values <= 0 list none.

    Within a line the terms are in ascending score order, so the strongest
    term for the topic comes last.
    """
    terms = np.asarray(terms)
    for topic in range(n_components):
        order = topics[:, topic].argsort()
        # Slice by n_terms (not n_components); a computed start index avoids
        # the `[-0:]` pitfall, which would return the whole array.
        top_term_indices = order[max(len(order) - n_terms, 0):]
        top_terms = terms[top_term_indices]

        print(f'Topic {topic}:\t{top_terms.tolist()}')


## PCA

In [8]:
# PCA on the term-by-document count matrix: each row of `topics` holds one
# term's coordinates on the principal components.
# random_state is pinned so re-runs give the same components if PCA selects
# a randomized solver.
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)

Topic 0:	['k', 'left', 'discovered', 'vax', 'elder', 'tunnels', 'brain', 'duergar', 'city', 'took']
Topic 1:	['various', 'mansion', 'vox', 'machina', 'vecna', 'plane', 'dragon', 'left', 'vax', 'city']
Topic 2:	['undying', 'dispelled', 'titan', 'dragon', 'vecna', 'risen', 'return', 'mansion', 'plane', 'vax']
Topic 3:	['amazing', 'story', 'tell', 'year', 'play', 'world', 'tabletop', 'game', 'games', 'love']
Topic 4:	['returns', 'resurrection', 'aware', 'bringing', 'ship', 'evening', 'leader', 'pike', 'vax', 'ashari']
Topic 5:	['went', 'allura', 'duergar', 'creature', 'getting', 'evil', 'mines', 'city', 'kraghammer', 'party']
Topic 6:	['battle', 'friend', 'form', 'members', 'held', 'lorenzo', 'fjord', 'jester', 'friends', 'final']
Topic 7:	['uriel', 'percy', 'whitestone', 'council', 'vox', 'machina', 'emon', 'town', 'wildemount', 'briarwoods']
Topic 8:	['dangerous', 'ground', 'westruun', 'black', 'vox', 'machina', 'town', 'herd', 'group', 'dragon']
Topic 9:	['wall', 'begin', 'day', 'small

In [9]:
# PCA on the term-by-document TF-IDF matrix: each row of `topics` holds one
# term's coordinates on the principal components.
# random_state is pinned so re-runs give the same components if PCA selects
# a randomized solver.
pca = PCA(n_components=n_components, random_state=0)
topics = pca.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)

Topic 0:	['barely', 'rolled', 'god', 'laugh', 'modifier', 'sorry', 'definitely', 'seven', 'plus', 'hits']
Topic 1:	['stealth', 'bonus', 'strength', 'rolled', 'laugh', 'modifier', 'god', 'seven', 'sorry', 'plus']
Topic 2:	['damn', 'pike', 'laughs', 'great', 'didn', 'fuck', 'fine', 'laugh', 'god', 'sorry']
Topic 3:	['magic', 'awesome', 'dead', 'yep', 'plus', 'amazing', 'kill', 'damn', 'laughs', 'god']
Topic 4:	['stealth', 'book', 'hey', 'fucking', 'yep', 'cool', 'plus', 'fine', 'great', 'laugh']
Topic 5:	['tell', 'totally', 'man', 'percy', 'fucking', 'hold', 'maybe', 'fuck', 'great', 'fine']
Topic 6:	['fucking', 'true', 'wow', 'away', 'hold', 'percy', 'didn', 'maybe', 'great', 'laughs']
Topic 7:	['idea', 'rolled', 'pike', 'nice', 'ooh', 'maybe', 'fuck', 'pretty', 'cool', 'great']
Topic 8:	['help', 'stealth', 'throw', 'bad', 'hold', 'laughs', 'man', 'maybe', 'didn', 'fuck']
Topic 9:	['advantage', 'man', 'stealth', 'saving', 'cheering', 'maybe', 'didn', 'throw', 'rolled', 'natural']


## LDA

In [10]:
# LDA on the term-by-document count matrix: each row of `topics` is one
# term's weight across the n_components topics.
# LDA starts from a random initialisation; random_state makes re-runs repeatable.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)

Topic 0:	['elemental', 'lightning', 'sleep', 'uriel', 'bear', 'actual', 'stop', 'believe', 'arrow', 'sorry']
Topic 1:	['won', 'playing', 'wow', 'critical', 'true', 'high', 'pick', 'advantage', 'kill', 'week']
Topic 2:	['lava', 'cold', 'understand', 'fjord', 'turns', 'yasha', 'leave', 'outside', 'god', 'big']
Topic 3:	['wanna', 'shot', 'fun', 'everybody', 'amazing', 'course', 'ask', 'thought', 'cool', 'throw']
Topic 4:	['holy', 'kitchen', 'care', 'waiting', 'double', 'hour', 'yep', 'hard', 'bad', 'spell']
Topic 5:	['wrong', 'spells', 'dc', 'straight', 'deal', 'matt', 'pieces', 'magic', 'natural', 'gold']
Topic 6:	['damn', 'fly', 'guess', 'save', 'able', 'bag', 'remember', 'stealth', 'better', 'plus']
Topic 7:	['camp', 'wants', 'correct', 'using', 'ah', 'add', 'ooh', 'jester', 'rolled', 'talk']
Topic 8:	['mark', 'fair', 'traps', 'happened', 'happy', 'read', 'caleb', 'hey', 'awesome', 'book']
Topic 9:	['strike', 'minutes', 'running', 'sounds', 'number', 'slowly', 'dead', 'close', 'action'

In [11]:
# LDA on the term-by-document TF-IDF matrix: each row of `topics` is one
# term's weight across the n_components topics.
# NOTE(review): LDA is formulated for count data; TF-IDF weights are accepted
# but the results should be read with that in mind.
# LDA starts from a random initialisation; random_state makes re-runs repeatable.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
topics = lda.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)

Topic 0:	['stop', 'follow', 'total', 'fjord', 'aw', 'somebody', 'heal', 'wanna', 'die', 'guess']
Topic 1:	['vasselheim', 'yasha', 'plan', 'walking', 'went', 'gone', 'dice', 'disadvantage', 'dead', 'ooh']
Topic 2:	['happen', 'killed', 'happened', 'cause', 'save', 'caleb', 'perception', 'idea', 'game', 'magic']
Topic 3:	['home', 'attacks', 'wanted', 'sneak', 'weird', 'investigation', 'read', 'night', 'work', 'sorry']
Topic 4:	['matt', 'times', 'town', 'fly', 'group', 'break', 'strength', 'saw', 'question', 'leave']
Topic 5:	['taking', 'dc', 'sounds', 'jester', 'insight', 'wow', 'thought', 'fucking', 'laughs', 'god']
Topic 6:	['sort', 'shot', 'talking', 'high', 'ah', 'getting', 'money', 'amazing', 'percy', 'tell']
Topic 7:	['creature', 'giant', 'believe', 'ends', 'makes', 'pick', 'hands', 'sword', 'tiberius', 'mind']
Topic 8:	['righty', 'hair', 'fun', 'damn', 'everybody', 'hey', 'yep', 'guy', 'rolled', 'bad']
Topic 9:	['beau', 'gasps', 'old', 'happy', 'real', 'correct', 'bring', 'course',

## SVD

In [12]:
# Truncated SVD on the term-by-document count matrix: each row of `topics`
# holds one term's coordinates on the n_components singular directions.
# The default solver is randomized; random_state makes re-runs repeatable.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)

Topic 0:	['tunnels', 'discovered', 'dragon', 'brain', 'party', 'duergar', 'left', 'vax', 'took', 'city']
Topic 1:	['clarota', 'great', 'discovered', 'varn', 'k', 'duergar', 'elder', 'tunnels', 'brain', 'took']
Topic 2:	['titan', 'left', 'risen', 'mansion', 'vecna', 'dragon', 'city', 'return', 'plane', 'vax']
Topic 3:	['amazing', 'story', 'tell', 'year', 'play', 'world', 'tabletop', 'game', 'games', 'love']
Topic 4:	['returns', 'resurrection', 'aware', 'bringing', 'ship', 'evening', 'leader', 'pike', 'ashari', 'vax']
Topic 5:	['went', 'allura', 'duergar', 'getting', 'creature', 'evil', 'mines', 'kraghammer', 'city', 'party']
Topic 6:	['chamber', 'friend', 'members', 'form', 'held', 'lorenzo', 'fjord', 'jester', 'final', 'friends']
Topic 7:	['uriel', 'percy', 'whitestone', 'council', 'vox', 'emon', 'machina', 'town', 'wildemount', 'briarwoods']
Topic 8:	['dangerous', 'tribe', 'westruun', 'black', 'vox', 'machina', 'town', 'herd', 'group', 'dragon']
Topic 9:	['throw', 'moment', 'begins', 

In [13]:
# Truncated SVD on the term-by-document TF-IDF matrix: each row of `topics`
# holds one term's coordinates on the n_components singular directions.
# The default solver is randomized; random_state makes re-runs repeatable.
svd = TruncatedSVD(n_components=n_components, random_state=0)
topics = svd.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)

Topic 0:	['modifier', 'rolled', 'god', 'nice', 'sorry', 'laugh', 'definitely', 'seven', 'plus', 'hits']
Topic 1:	['bonus', 'didn', 'stealth', 'rolled', 'modifier', 'laugh', 'seven', 'god', 'sorry', 'plus']
Topic 2:	['pretty', 'pike', 'great', 'cool', 'fuck', 'didn', 'fine', 'laugh', 'god', 'sorry']
Topic 3:	['yep', 'awesome', 'dead', 'hits', 'amazing', 'fine', 'pretty', 'laughs', 'damn', 'god']
Topic 4:	['natural', 'bad', 'god', 'book', 'stealth', 'fuck', 'cool', 'great', 'fine', 'laugh']
Topic 5:	['kill', 'pretty', 'ooh', 'lot', 'man', 'help', 'nice', 'maybe', 'great', 'fine']
Topic 6:	['pike', 'tell', 'maybe', 'pretty', 'natural', 'didn', 'cool', 'fuck', 'laughs', 'great']
Topic 7:	['ooh', 'kill', 'yep', 'pretty', 'advantage', 'maybe', 'tell', 'fuck', 'fine', 'laughs']
Topic 8:	['kill', 'fucking', 'true', 'pike', 'stealth', 'nice', 'maybe', 'man', 'didn', 'fuck']
Topic 9:	['bad', 'throw', 'pike', 'rolled', 'man', 'tell', 'cool', 'maybe', 'didn', 'natural']


## NMF

In [14]:
# NMF on the term-by-document count matrix: each row of `topics` is one
# term's non-negative weight across the n_components factors.
# random_state pins any randomness in initialisation/solver so re-runs match.
nmf = NMF(n_components=n_components, max_iter=500, random_state=0)
topics = nmf.fit_transform(cv_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)



Topic 0:	['currently', 'open', 'inside', 'dark', 'wall', 'chamber', 'stone', 'small', 'begin', 'large']
Topic 1:	['clarota', 'great', 'varn', 'discovered', 'k', 'duergar', 'elder', 'tunnels', 'brain', 'took']
Topic 2:	['various', 'dragon', 'risen', 'mansion', 'vecna', 'return', 'left', 'plane', 'vax', 'city']
Topic 3:	['wheaton', 'amazing', 'year', 'tell', 'play', 'world', 'game', 'tabletop', 'games', 'love']
Topic 4:	['massive', 'aware', 'final', 'ship', 'leader', 'bringing', 'evening', 'pike', 'ashari', 'vax']
Topic 5:	['allura', 'went', 'evil', 'mines', 'creature', 'duergar', 'getting', 'kraghammer', 'party', 'city']
Topic 6:	['new', 'lorenzo', 'held', 'final', 'members', 'fjord', 'jester', 'chamber', 'form', 'friends']
Topic 7:	['uriel', 'lot', 'came', 'council', 'town', 'percy', 'whitestone', 'emon', 'wildemount', 'briarwoods']
Topic 8:	['powerful', 'city', 'black', 'westruun', 'vax', 'group', 'herd', 'vox', 'machina', 'dragon']
Topic 9:	['face', 'hear', 'eyes', 'forward', 'ground

In [15]:
# NMF on the term-by-document TF-IDF matrix: each row of `topics` is one
# term's non-negative weight across the n_components factors.
# NOTE(review): max_iter is 200 here but 500 in the count-matrix NMF cell;
# confirm the difference is intentional.
# random_state pins any randomness in initialisation/solver so re-runs match.
nmf = NMF(n_components=n_components, max_iter=200, random_state=0)
topics = nmf.fit_transform(tfidf_df)
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,  # honour the setting from the parameters cell
)



Topic 0:	['nice', 'dagger', 'cheering', 'wall', 'attacks', 'ground', 'sneak', 'barely', 'definitely', 'hits']
Topic 1:	['dex', 'proficiency', 'times', 'stealth', 'bonus', 'rolled', 'strength', 'modifier', 'seven', 'plus']
Topic 2:	['mind', 'question', 'better', 'hold', 'whispers', 'saying', 'bad', 'pike', 'thought', 'sorry']
Topic 3:	['new', 'bad', 'rolled', 'awesome', 'fucking', 'dead', 'amazing', 'kill', 'damn', 'god']
Topic 4:	['mouth', 'matt', 'guy', 'stealth', 'cast', 'book', 'fucking', 'hey', 'yep', 'laugh']
Topic 5:	['plan', 'bonus', 'walk', 'hair', 'water', 'fucking', 'takes', 'kaylie', 'totally', 'fine']
Topic 6:	['job', 'awesome', 'ooh', 'game', 'stealth', 'idea', 'sounds', 'fucking', 'pretty', 'great']
Topic 7:	['acrobatics', 'walk', 'disadvantage', 'seven', 'jesus', 'nervously', 'vex', 'true', 'wow', 'laughs']
Topic 8:	['work', 'stay', 'end', 'idea', 'gil', 'advantage', 'guy', 'ah', 'man', 'fuck']
Topic 9:	['away', 'said', 'pretty', 'door', 'pike', 'tell', 'natural', 'cool'