# Topic Modeling
### Without Preprocessing

In [12]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### Define Parameters

In [13]:
# Tunable parameters for the whole notebook.
n_components = 25  # number of topics each model should produce
n = 20             # number of transcript files to sample
max_df = 0.01      # vectorizer cut-off: ignore terms found in more than this fraction of documents
min_df = 0.0001    # vectorizer cut-off: ignore terms found in fewer than this fraction of documents
n_terms = 10       # number of top terms to show per topic

# Seed Python's RNG so the random transcript sample is the same on every
# Restart & Run All instead of changing from run to run.
seed = 42
random.seed(seed)

## Load Data

In [14]:
def create_df_multi(n=25, data_dir='data/aligned data/c=4'):
    """Build a one-column DataFrame of turn texts from randomly sampled transcripts.

    Parameters
    ----------
    n : int, default 25
        Number of transcript files to draw. Files are drawn with replacement,
        so the same transcript can be selected more than once.
    data_dir : str, default 'data/aligned data/c=4'
        Directory containing the transcript JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per turn; column 0 holds the turn's utterances joined by
        single spaces.
    """
    # Sorted so that a seeded RNG selects the same files regardless of the
    # order in which the filesystem happens to list them.
    filenames = sorted(os.listdir(data_dir))
    sampled_files = random.choices(filenames, k=n)

    list_of_text = []
    for filename in sampled_files:
        # Load each sampled file exactly once; the context manager closes the
        # handle as soon as the JSON has been parsed.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        # Each top-level entry carries a 'TURNS' list, and each turn carries
        # an 'UTTERANCES' list of strings.
        for entry in data:
            for turn in entry['TURNS']:
                list_of_text.append(' '.join(turn['UTTERANCES']))

    return pd.DataFrame(list_of_text)

In [15]:
# Draw `n` transcripts and flatten them to one row of text per turn.
df = create_df_multi(n=n)
print(df.shape)
df.head()

(51137, 1)


Unnamed: 0,0
0,"What's up, GenCon?! (cheering)"
1,That's a lot of people. How you guys doing? (c...
2,Oh my god. Send me the bill for your plane tic...
3,"You know what's up, brother. Thank you guys fo..."
4,What is happening?! We are alive at the best m...


## Vectorize Data



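- Count Vectorizer (raw term counts)
- TF-IDF Vectorizer (term counts reweighted by inverse document frequency)

Each model below is fitted on both representations.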

In [16]:
# TF-IDF features over purely alphabetic tokens, with English stop words
# removed and the vocabulary pruned by the min_df / max_df cut-offs.
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)

# Transposed on purpose: rows are vocabulary terms, columns are documents.
tfidf_df = pd.DataFrame(
    data=tfidf_sparse.toarray().T,
    index=tfidf.get_feature_names_out(),
)
tfidf_df.tail()

(51137, 5113)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51127,51128,51129,51130,51131,51132,51133,51134,51135,51136
zenwick,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ziggurat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Raw term counts over purely alphabetic tokens, with English stop words
# removed and the vocabulary pruned by the min_df / max_df cut-offs.
cv = CountVectorizer(
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=min_df,
    max_df=max_df,
)
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)

# Transposed on purpose: rows are vocabulary terms, columns are documents.
cv_df = pd.DataFrame(
    data=cv_sparse.toarray().T,
    index=cv.get_feature_names_out(),
)
cv_df.tail()

(51137, 5113)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51127,51128,51129,51130,51131,51132,51133,51134,51135,51136
zenwick,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ziggurat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [18]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    """Print the highest-weighted terms of each topic, one line per topic.

    Parameters
    ----------
    n_components : int
        Number of topics to print; columns 0 .. n_components - 1 of `topics`
        are read.
    topics : 2-D array-like
        Weight matrix indexed as topics[term_row, topic_column]; row i is the
        weight of terms[i].
    terms : sequence of str
        Vocabulary, aligned with the rows of `topics`.
    n_terms : int, default 10
        How many terms to show per topic. Zero or a negative value prints an
        empty list; a value larger than the vocabulary prints every term.

    Notes
    -----
    Terms are listed in ascending order of weight, so the strongest term of a
    topic is the last item on its line.
    """
    weights = np.asarray(topics)
    vocabulary = np.array(terms)  # converted once instead of once per topic
    keep = max(n_terms, 0)

    for topic in range(n_components):
        ranked = weights[:, topic].argsort()
        # Slice from an explicit start index: ranked[-keep:] would return
        # every term when keep == 0, because -0 == 0.
        top_term_indices = ranked[max(len(ranked) - keep, 0):]
        top_terms = vocabulary[top_term_indices]

        print(f'Topic {topic}:\t{top_terms.tolist()}')


## PCA

In [19]:
# cv_df has one row per term, so fit_transform returns a (terms x components)
# score matrix, which is the layout print_top_terms indexes into.
pca = PCA(n_components=n_components)
topics = pca.fit_transform(cv_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['eyes', 'long', 'center', 'large', 'city', 'past', 'brought', 'dark', 'hear', 'begin']
Topic 1:	['week', 'emon', 'family', 'town', 'vox', 'dragon', 'machina', 'party', 'percy', 'city']
Topic 2:	['comes', 'eyes', 'light', 'flash', 'dark', 'darkness', 'feel', 'ground', 'vision', 'thunder']
Topic 3:	['twitch', 'creatures', 'players', 'role', 'tonight', 'critical', 'game', 'cool', 'episode', 'week']
Topic 4:	['powerful', 'creatures', 'encounter', 'seen', 'players', 'family', 'cool', 'battle', 'pike', 'percy']
Topic 5:	['years', 'scanlan', 'episode', 'left', 'arm', 'vox', 'family', 'machina', 'pike', 'percy']
Topic 6:	['house', 'gnome', 'swamp', 'herd', 'wilhand', 'scanlan', 'great', 'life', 'left', 'pike']
Topic 7:	['comes', 'soon', 'strength', 'dragon', 'coming', 'immediately', 'muscle', 'face', 'swing', 'arm']
Topic 8:	['mighty', 'northward', 'past', 'cut', 'safe', 'muscle', 'swing', 'percy', 'swamp', 'arm']
Topic 9:	['best', 'chamber', 'fantastic', 'family', 'life', 'episode',

In [20]:
# tfidf_df has one row per term, so fit_transform returns a (terms x components)
# score matrix, which is the layout print_top_terms indexes into.
pca = PCA(n_components=n_components)
topics = pca.fit_transform(tfidf_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['cheering', 'pike', 'doesn', 'armor', 'seven', 'natural', 'definitely', 'second', 'plus', 'hits']
Topic 1:	['didn', 'percy', 'fucking', 'plus', 'fuck', 'pike', 'holy', 'sorry', 'god', 'shit']
Topic 2:	['dice', 'plus', 'fine', 'awesome', 'tell', 'love', 'die', 'laughs', 'damn', 'god']
Topic 3:	['pike', 'rolled', 'second', 'percy', 'great', 'strength', 'seven', 'natural', 'sorry', 'plus']
Topic 4:	['fjord', 'vax', 'fine', 'scanlan', 'keyleth', 'percy', 'didn', 'mean', 'pike', 'sorry']
Topic 5:	['cool', 'didn', 'scanlan', 'probably', 'natural', 'great', 'laughs', 'percy', 'pike', 'fine']
Topic 6:	['mean', 'throw', 'vex', 'keyleth', 'vax', 'cool', 'laughs', 'scanlan', 'percy', 'pike']
Topic 7:	['gold', 'ooh', 'advantage', 'great', 'fucking', 'second', 'rolled', 'cheering', 'laughs', 'natural']
Topic 8:	['pretty', 'nice', 'idea', 'man', 'sorry', 'fuck', 'cool', 'didn', 'great', 'laughs']
Topic 9:	['ends', 'gold', 'keyleth', 'mean', 'run', 'scanlan', 'cool', 'vex', 'vax', 'percy']


## LDA

In [21]:
# cv_df has one row per term, so fit_transform returns a (terms x topics)
# matrix, which is the layout print_top_terms indexes into.
lda = LatentDirichletAllocation(n_components=n_components)
topics = lda.fit_transform(cv_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['worth', 'checks', 'add', 'troll', 'stealth', 'die', 'strike', 'kill', 'talk', 'second']
Topic 1:	['level', 'care', 'using', 'dexterity', 'wow', 'wouldn', 'definitely', 'white', 'movement', 'probably']
Topic 2:	['read', 'slow', 'ago', 'anybody', 'watching', 'later', 'sitting', 'live', 'amazing', 'closer']
Topic 3:	['glad', 'rolling', 'cover', 'j', 'mon', 'piece', 'happening', 'heading', 'manage', 'kima']
Topic 4:	['smoke', 'dangerous', 'cassandra', 'shoot', 'power', 'constitution', 'happy', 'happened', 'table', 'tiberius']
Topic 5:	['main', 'investigation', 'heal', 'hide', 'hour', 'correct', 'yep', 'laughs', 'cool', 'plus']
Topic 6:	['platform', 'invisible', 'shift', 'effect', 'somebody', 'frumpkin', 'happens', 'question', 'rolled', 'fjord']
Topic 7:	['mouth', 'understand', 'singing', 'jump', 'stairs', 'attention', 'chains', 'home', 'metal', 'getting']
Topic 8:	['rough', 'meet', 'deal', 'weapon', 'drink', 'bunch', 'ice', 'nope', 'play', 'range']
Topic 9:	['sand', 'finish', 'r

In [22]:
# tfidf_df has one row per term, so fit_transform returns a (terms x topics)
# matrix, which is the layout print_top_terms indexes into.
# NOTE(review): LDA is normally fitted on term counts; feeding it TF-IDF
# weights is unusual - confirm this comparison is intended.
lda = LatentDirichletAllocation(n_components=n_components)
topics = lda.fit_transform(tfidf_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['hmm', 'whisper', 'punch', 'chirping', 'cage', 'shirts', 'stupid', 'checks', 'whoa', 'yep']
Topic 1:	['seven', 'zero', 'cat', 'map', 'intelligence', 'draconia', 'technically', 'dc', 'word', 'fair']
Topic 2:	['write', 'works', 'hell', 'heal', 'uh', 'order', 'excited', 'potions', 'ask', 'beau']
Topic 3:	['hate', 'enchantment', 'opportunity', 'certainly', 'double', 'hi', 'singing', 'wasn', 'insight', 'wow']
Topic 4:	['forgot', 'true', 'travis', 'prone', 'surprise', 'mansion', 'kiri', 'counting', 'constitution', 'gonna']
Topic 5:	['news', 'hits', 'horse', 'trouble', 'tail', 'bludgeoning', 'man', 'bird', 'straight', 'ugh']
Topic 6:	['shouldn', 'missed', 'playing', 'rod', 'fun', 'lockheed', 'perfect', 'acid', 'caleb', 'said']
Topic 7:	['quietly', 'broom', 'checking', 'common', 'thinking', 'hunter', 'buy', 'care', 'persuasion', 'trinket']
Topic 8:	['ashari', 'dwarf', 'groans', 'climb', 'natural', 'gasps', 'misses', 'matt', 'fly', 'advantage']
Topic 9:	['goddamn', 'sing', 'obviously'

## SVD

In [23]:
# cv_df has one row per term, so fit_transform returns a (terms x components)
# matrix, which is the layout print_top_terms indexes into.
svd = TruncatedSVD(n_components=n_components)
topics = svd.fit_transform(cv_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['eyes', 'center', 'brought', 'past', 'long', 'large', 'hear', 'dark', 'city', 'begin']
Topic 1:	['family', 'town', 'episode', 'dragon', 'week', 'vox', 'party', 'machina', 'percy', 'city']
Topic 2:	['comes', 'eyes', 'light', 'flash', 'darkness', 'dark', 'feel', 'ground', 'vision', 'thunder']
Topic 3:	['twitch', 'creatures', 'players', 'role', 'tonight', 'critical', 'game', 'cool', 'episode', 'week']
Topic 4:	['powerful', 'great', 'encounter', 'players', 'seen', 'cool', 'family', 'battle', 'pike', 'percy']
Topic 5:	['years', 'prime', 'episode', 'left', 'arm', 'vox', 'family', 'pike', 'machina', 'percy']
Topic 6:	['house', 'gnome', 'swamp', 'herd', 'wilhand', 'scanlan', 'great', 'life', 'left', 'pike']
Topic 7:	['attacks', 'dragon', 'soon', 'strength', 'coming', 'immediately', 'muscle', 'face', 'swing', 'arm']
Topic 8:	['mighty', 'past', 'northward', 'cut', 'safe', 'muscle', 'swing', 'percy', 'arm', 'swamp']
Topic 9:	['seen', 'second', 'moment', 'chamber', 'far', 'episode', 'wee

In [24]:
# tfidf_df has one row per term, so fit_transform returns a (terms x components)
# matrix, which is the layout print_top_terms indexes into.
svd = TruncatedSVD(n_components=n_components)
topics = svd.fit_transform(tfidf_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)

Topic 0:	['percy', 'pike', 'armor', 'doesn', 'seven', 'natural', 'definitely', 'second', 'plus', 'hits']
Topic 1:	['mean', 'fucking', 'fuck', 'percy', 'pike', 'holy', 'plus', 'sorry', 'god', 'shit']
Topic 2:	['dice', 'awesome', 'love', 'die', 'tell', 'fine', 'plus', 'laughs', 'damn', 'god']
Topic 3:	['fine', 'second', 'strength', 'great', 'pike', 'percy', 'seven', 'natural', 'sorry', 'plus']
Topic 4:	['vex', 'vax', 'scanlan', 'keyleth', 'fine', 'percy', 'didn', 'mean', 'pike', 'sorry']
Topic 5:	['door', 'throw', 'probably', 'scanlan', 'natural', 'great', 'laughs', 'percy', 'pike', 'fine']
Topic 6:	['totally', 'fjord', 'feel', 'hits', 'probably', 'didn', 'plus', 'shit', 'sorry', 'fine']
Topic 7:	['mean', 'didn', 'sorry', 'far', 'fuck', 'fjord', 'cool', 'plus', 'great', 'laughs']
Topic 8:	['fucking', 'keyleth', 'advantage', 'rolled', 'gold', 'second', 'great', 'cheering', 'laughs', 'natural']
Topic 9:	['pretty', 'far', 'perception', 'door', 'saving', 'cool', 'great', 'vax', 'throw', 'per

## NMF

In [25]:
# cv_df has one row per term, so fit_transform returns a (terms x components)
# matrix, which is the layout print_top_terms indexes into.
nmf = NMF(n_components=n_components, max_iter=500)
topics = nmf.fit_transform(cv_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=cv.get_feature_names_out(),
    n_terms=n_terms,
)



Topic 0:	['devils', 'metal', 'various', 'gives', 'hear', 'begin', 'occasionally', 'chains', 'past', 'brought']
Topic 1:	['fantastic', 'gathered', 'central', 'information', 'best', 'beneath', 'decided', 'stack', 'far', 'city']
Topic 2:	['sound', 'fades', 'light', 'figure', 'flash', 'ground', 'dark', 'darkness', 'vision', 'thunder']
Topic 3:	['master', 'interesting', 'fight', 'powerful', 'game', 'battle', 'encounter', 'players', 'creatures', 'cool']
Topic 4:	['shirt', 'fun', 'welcome', 'tonight', 'thursday', 'game', 'episode', 'role', 'critical', 'week']
Topic 5:	['lady', 'town', 'castle', 'years', 'took', 'briarwoods', 'whitestone', 'briarwood', 'family', 'percy']
Topic 6:	['killed', 'heal', 'best', 'felt', 'great', 'wilhand', 'left', 'life', 'family', 'pike']
Topic 7:	['attacks', 'soon', 'strength', 'past', 'comes', 'face', 'coming', 'muscle', 'swing', 'arm']
Topic 8:	['merrow', 'northward', 'nein', 'mighty', 'journey', 'labenda', 'heading', 'house', 'safe', 'swamp']
Topic 9:	['dragon'

In [26]:
# tfidf_df has one row per term, so fit_transform returns a (terms x components)
# matrix, which is the layout print_top_terms indexes into.
# NOTE(review): this run uses max_iter=200 while the count-based NMF run uses
# max_iter=500 - confirm the difference is deliberate.
nmf = NMF(n_components=n_components, max_iter=200)
topics = nmf.fit_transform(tfidf_df)
# Pass n_terms explicitly so the value set under "Define Parameters" is used
# instead of print_top_terms' own default.
print_top_terms(
    n_components=n_components,
    topics=topics,
    terms=tfidf.get_feature_names_out(),
    n_terms=n_terms,
)



Topic 0:	['doesn', 'seven', 'course', 'cheering', 'sneak', 'barely', 'armor', 'definitely', 'second', 'hits']
Topic 1:	['sighs', 'caleb', 'laugh', 'balls', 'went', 'ah', 'piece', 'fucking', 'holy', 'shit']
Topic 2:	['bad', 'kill', 'thought', 'amazing', 'dice', 'awesome', 'love', 'die', 'damn', 'god']
Topic 3:	['psychic', 'bonus', 'sneak', 'proficiency', 'total', 'level', 'modifier', 'strength', 'seven', 'plus']
Topic 4:	['tried', 'matt', 'caleb', 'laugh', 'fjord', 'buddy', 'thought', 'course', 'love', 'sorry']
Topic 5:	['stay', 'strength', 'temple', 'hand', 'vex', 'help', 'hi', 'ends', 'says', 'pike']
Topic 6:	['making', 'yep', 'shot', 'worry', 'dagger', 'taking', 'totally', 'guess', 'work', 'fine']
Topic 7:	['save', 'blood', 'fucking', 'row', 'history', 'ooh', 'groaning', 'second', 'cheering', 'natural']
Topic 8:	['awesome', 'start', 'stay', 'bear', 'true', 'quietly', 'currently', 'smart', 'worst', 'laughs']
Topic 9:	['trinket', 'seen', 'skull', 'bear', 'tiberius', 'wanted', 'love', '