# Topic Modeling
### Without Preprocessing

In [1]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

#### Define Parameters

In [2]:
n_components = 5 #Topics To generate
n = 20 #Transcripts to use
max_df=0.01
min_df=0.0001
n_terms = 10

## Load Data

In [3]:
def create_df_multi(n=25):
    list_of_text = []
    dir = 'data/aligned data/c=4'

    files = [filename for filename in os.listdir(dir)]
    sampled_files = random.choices(files,k=n)

    for filename in sampled_files:
        # choice = random.choice(os.listdir(dir))
        # print(choice)
        f = open(dir+'/'+filename)
        data = json.load(f)
        choice = random.choice(os.listdir(dir))
        # print(choice)
        f = open(dir+'/'+choice)
        data = json.load(f)

        for x in data:
            for y in x['TURNS']:
                text = ' '.join(y['UTTERANCES'])
                list_of_text.append(text)
    df = pd.DataFrame(list_of_text)
    return df

In [4]:
df = create_df_multi(n)
print(df.shape)
df.head()

(53998, 1)


Unnamed: 0,0
0,"Hello everyone, and welcome to tonight's episo..."
1,Go Yankees!
2,"Go Razi! Anyway, as a reminder, Tome of Foes i..."
3,"Yes, our second sponsor for the evening is Bac..."
4,If you don't know Backblaze-- I'm sorry. If yo...


## Vectorize Data



- Count Vectorizer
- Tfidf Vectorizer

Use both with each model

In [5]:
# tfidf=TfidfVectorizer(stop_words='english',max_df=.7,min_df=2,token_pattern=r'(?u)\b[A-Za-z]+\b')

tfidf = TfidfVectorizer(stop_words='english', 
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b'
    )
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)
tfidf_df = pd.DataFrame(tfidf_sparse.toarray().transpose(),
                   index=tfidf.get_feature_names_out())
tfidf_df.tail()

(53998, 5188)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53988,53989,53990,53991,53992,53993,53994,53995,53996,53997
zauber,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
cv = CountVectorizer(stop_words='english', 
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b'
    )
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)
cv_df = pd.DataFrame(cv_sparse.toarray().transpose(),
                   index=cv.get_feature_names_out())
cv_df.tail()

(53998, 5188)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53988,53989,53990,53991,53992,53993,53994,53995,53996,53997
zauber,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombies,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [7]:
def print_top_terms(n_components, topics, terms, n_terms = 10):
    for x in range(n_components):
        topic = x
        components = topics[:,topic]
        top_term_indices = components.argsort()[-n_components:]
        top_terms = np.array(terms)[top_term_indices]
        
        print(f'Topic {x}:\t{top_terms.tolist()}')


## PCA

In [8]:
pca = PCA(n_components=n_components)
topics = pca.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())

Topic 0:	['begin', 'currently', 'left', 'rest', 'city']
Topic 1:	['emon', 'number', 'currently', 'gilmore', 'city']
Topic 2:	['city', 'stone', 'currently', 'body', 'gilmore']
Topic 3:	['large', 'form', 'center', 'begin', 'feet']
Topic 4:	['great', 'week', 'pike', 'scanlan', 'life']


In [9]:
pca = PCA(n_components=n_components)
topics = pca.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())

Topic 0:	['seven', 'definitely', 'second', 'plus', 'hits']
Topic 1:	['modifier', 'natural', 'wait', 'seven', 'plus']
Topic 2:	['wait', 'laughter', 'fuck', 'god', 'shit']
Topic 3:	['natural', 'laughs', 'laughter', 'damn', 'god']
Topic 4:	['laughter', 'feet', 'doing', 'mean', 'wait']


## LDA

In [10]:
lda = LatentDirichletAllocation(n_components=n_components)
topics = lda.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())

Topic 0:	['fucking', 'fuck', 'god', 'throw', 'action']
Topic 1:	['direction', 'sort', 'hold', 'scanlan', 'shit']
Topic 2:	['vax', 'nice', 'bad', 'doing', 'wait']
Topic 3:	['gold', 'percy', 'fine', 'great', 'lot']
Topic 4:	['door', 'didn', 'hits', 'feel', 'feet']


In [11]:
lda = LatentDirichletAllocation(n_components=n_components)
topics = lda.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())

Topic 0:	['fuck', 'doing', 'great', 'shit', 'plus']
Topic 1:	['love', 'trinket', 'nott', 'caleb', 'wait']
Topic 2:	['nice', 'water', 'natural', 'fine', 'laughter']
Topic 3:	['help', 'advantage', 'second', 'pretty', 'god']
Topic 4:	['keyleth', 'perception', 'laughs', 'mean', 'hits']


## SVD

In [12]:
svd = TruncatedSVD(n_components=n_components)
topics = svd.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())

Topic 0:	['currently', 'begin', 'left', 'rest', 'city']
Topic 1:	['town', 'number', 'currently', 'gilmore', 'city']
Topic 2:	['currently', 'city', 'stone', 'body', 'gilmore']
Topic 3:	['form', 'temple', 'center', 'begin', 'feet']
Topic 4:	['bryce', 'begin', 'town', 'rest', 'city']


In [13]:
svd = TruncatedSVD(n_components=n_components)
topics = svd.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())

Topic 0:	['shit', 'definitely', 'second', 'plus', 'hits']
Topic 1:	['modifier', 'shit', 'wait', 'seven', 'plus']
Topic 2:	['laughter', 'holy', 'wait', 'god', 'shit']
Topic 3:	['yep', 'laughter', 'doing', 'damn', 'god']
Topic 4:	['didn', 'feet', 'advantage', 'mean', 'wait']


## NMF

In [14]:
nmf = NMF(n_components=n_components, max_iter=500)
topics = nmf.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())



Topic 0:	['center', 'watch', 'large', 'feet', 'begin']
Topic 1:	['pool', 'hand', 'eyes', 'stone', 'body']
Topic 2:	['tal', 'emon', 'currently', 'gilmore', 'city']
Topic 3:	['evening', 'left', 'rest', 'town', 'city']
Topic 4:	['years', 'family', 'pike', 'scanlan', 'life']


In [15]:
nmf = NMF(n_components=n_components, max_iter=200)
topics = nmf.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())



Topic 0:	['ground', 'barely', 'second', 'definitely', 'hits']
Topic 1:	['throw', 'modifier', 'natural', 'seven', 'plus']
Topic 2:	['fucking', 'laughter', 'holy', 'fuck', 'shit']
Topic 3:	['fucking', 'doing', 'laughter', 'damn', 'god']
Topic 4:	['didn', 'doing', 'mean', 'feet', 'wait']
