# DND Topic Modeling MVP

****

### Background
- The client wants to use DnD session transcripts to inform their model/product design and improve the customer experience. We use topic modeling to better understand the trends that emerge during a DnD session.

****

### Takeaways
- With NMF:
  - CV (CountVectorizer) surfaces topics in the in-game content (settings, actions, characters, ...)
  - TFIDF surfaces more macro-level topics (session introductions, rolling, attacking, metagaming)
- Current topics have overlap. Expanding topic numbers may be needed.
- Further text preprocessing may be helpful.
- Expanding the number of transcripts used decreases session-specific bias.

### Imports

In [18]:
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

# Shared settings read by the vectorizer and model cells below.
n_components = 10 #Topics To generate
n = 10 #Transcripts to use
# Float document-frequency bounds are proportions of documents (scikit-learn
# vectorizer semantics): terms appearing in more than 1% of turns, or in fewer
# than 0.01% of turns, are excluded from the vocabulary.
max_df=0.01
min_df=0.0001
# NOTE(review): none of the print_top_terms calls in this notebook pass
# n_terms, so this value is currently unused — confirm whether it should be.
n_terms = 10

## Load Data

In [6]:
def create_df_multi(n=25, data_dir='data/aligned data/c=4'):
    """Build a one-column DataFrame of turn texts from sampled transcripts.

    Picks up to ``n`` distinct transcript files from ``data_dir``, parses each
    one as JSON, and joins every turn's ``UTTERANCES`` list into a single
    space-separated string. Each turn becomes one row.

    Args:
        n: Number of transcript files to sample. Files are drawn without
            replacement, so this is capped at the number of files available.
        data_dir: Directory containing the transcript JSON files.

    Returns:
        A pandas DataFrame whose single column (labelled ``0``) holds one
        joined turn text per row, grouped by file in sampling order.
    """
    # os.listdir order is arbitrary; sorting makes a seeded `random` pick the
    # same files on every platform.
    files = sorted(os.listdir(data_dir))

    # Sample WITHOUT replacement so no transcript is counted twice, and clamp
    # k so asking for more files than exist does not raise.
    sampled_files = random.sample(files, k=min(max(n, 0), len(files)))

    list_of_text = []
    for filename in sampled_files:
        # Load exactly the sampled file (and close it afterwards).
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            data = json.load(f)

        for entry in data:
            for turn in entry['TURNS']:
                list_of_text.append(' '.join(turn['UTTERANCES']))

    return pd.DataFrame(list_of_text)

In [7]:
# Build the corpus from `n` sampled transcripts: one row per turn, with the
# turn's utterances joined into a single string in column 0.
df = create_df_multi(n)
print(df.shape)
df.head()

(27330, 1)


Unnamed: 0,0
0,"Welcome to tonight's episode of Critical Role,..."
1,"Before we get into tonight's story, welcome. W..."
2,So good.
3,"It's out for PS4, Xbox One, and PC. The latest..."
4,You're cultists?


## Vectorize Data



- Count Vectorizer
- Tfidf Vectorizer

Use both with each model

In [19]:
# Earlier configuration, kept for reference:
# tfidf=TfidfVectorizer(stop_words='english',max_df=.7,min_df=2,token_pattern=r'(?u)\b[A-Za-z]+\b')

# TF-IDF features over the turn texts. The token pattern keeps runs of ASCII
# letters only, so digits are dropped and contractions are split at the
# apostrophe (which is why tokens such as 'didn' appear in the topic output).
tfidf = TfidfVectorizer(stop_words='english', 
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b'
    )
# Sparse matrix with one row per turn and one column per vocabulary term.
tfidf_sparse = tfidf.fit_transform(df[0])
print(tfidf_sparse.shape)
# Densify and transpose so rows are terms (labelled by the vocabulary) and
# columns are turns. The model cells fit on this term-by-turn layout.
# NOTE(review): .toarray() materialises the full matrix; at the shape printed
# below that is roughly 1.2 GB of float64 — confirm this is acceptable.
tfidf_df = pd.DataFrame(tfidf_sparse.toarray().transpose(),
                   index=tfidf.get_feature_names_out())
tfidf_df.tail()

(27330, 5636)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27320,27321,27322,27323,27324,27325,27326,27327,27328,27329
zolezzo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zooming,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zsundie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Raw term counts over the turn texts, using the same stop-word list,
# document-frequency bounds and letters-only token pattern as the TF-IDF cell.
cv = CountVectorizer(stop_words='english', 
    max_df=max_df,
    min_df=min_df,
    token_pattern=r'(?u)\b[A-Za-z]+\b'
    )
# Sparse matrix with one row per turn and one column per vocabulary term.
cv_sparse = cv.fit_transform(df[0])
print(cv_sparse.shape)
# Densify and transpose so rows are terms (labelled by the vocabulary) and
# columns are turns. The model cells fit on this term-by-turn layout.
# NOTE(review): .toarray() materialises the full matrix in memory — confirm
# this is acceptable before raising the number of transcripts.
cv_df = pd.DataFrame(cv_sparse.toarray().transpose(),
                   index=cv.get_feature_names_out())
cv_df.tail()

(27330, 5636)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27320,27321,27322,27323,27324,27325,27326,27327,27328,27329
zolezzo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zombies,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zooming,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zsundie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Define Output Functions

In [21]:
def print_top_terms(n_components, topics, terms, n_terms=10):
    """Print the highest-weighted terms for each topic.

    Args:
        n_components: Number of topics (leading columns of ``topics``) to
            report.
        topics: 2-D array-like with one row per term and one column per
            topic; entry ``[i, j]`` is term ``i``'s weight in topic ``j``.
        terms: Sequence of term strings aligned with the rows of ``topics``.
        n_terms: How many terms to list per topic. Values of zero or less
            print an empty list.

    Prints one line per topic. Terms are listed in ascending order of weight,
    so the strongest term comes last.
    """
    weights = np.asarray(topics)
    vocabulary = np.asarray(terms)

    for topic in range(n_components):
        ranked = weights[:, topic].argsort()
        # Slice from an explicit start index: a plain [-n_terms:] would return
        # every term when n_terms == 0.
        start = max(len(ranked) - n_terms, 0)
        top_terms = vocabulary[ranked[start:]]

        print(f'Topic {topic}:\t{top_terms.tolist()}')


## PCA

In [22]:
# PCA on the raw-count matrix. cv_df has one row per vocabulary term, so
# fit_transform returns one row per term and one column per component — the
# layout print_top_terms indexes (column = topic, row = term).
pca = PCA(n_components=n_components)
topics = pca.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())

Topic 0:	['familiar', 'begin', 'fjord', 'hand', 'eyes', 'city', 'left', 'water', 'stone', 'body']
Topic 1:	['vanishes', 'glance', 'forward', 'flesh', 'familiar', 'skin', 'pool', 'eyes', 'stone', 'body']
Topic 2:	['half', 'center', 'arms', 'girl', 'table', 'small', 'room', 'man', 'begins', 'begin']
Topic 3:	['second', 'body', 'orb', 'cultists', 'deeper', 'thar', 'amphala', 'gathered', 'tower', 'city']
Topic 4:	['members', 'elite', 'familiar', 'family', 'taken', 'trying', 'left', 'keg', 'shepherds', 'iron']
Topic 5:	['century', 'produce', 'worship', 'local', 'crown', 'trostenwald', 'town', 'near', 'empire', 'small']
Topic 6:	['walking', 'wall', 'number', 'making', 'table', 'door', 'harbor', 'light', 'beautiful', 'lighthouse']
Topic 7:	['awesome', 'tuesday', 'week', 'new', 'tonight', 'episode', 'thank', 'twitch', 'role', 'critical']
Topic 8:	['decided', 'merrow', 'brief', 'action', 'seen', 'cali', 'safe', 'tunnel', 'left', 'house']
Topic 9:	['black', 'hear', 'darkness', 'shadows', 'dark',

In [23]:
# PCA on the TF-IDF matrix. tfidf_df has one row per vocabulary term, so
# fit_transform returns one row per term and one column per component — the
# layout print_top_terms indexes (column = topic, row = term).
pca = PCA(n_components=n_components)
topics = pca.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())

Topic 0:	['yep', 'armor', 'strike', 'sneak', 'seven', 'definitely', 'natural', 'second', 'plus', 'hits']
Topic 1:	['seven', 'wait', 'man', 'fucking', 'natural', 'damn', 'thank', 'plus', 'god', 'fuck']
Topic 2:	['amazing', 'didn', 'rolled', 'seven', 'wait', 'fucking', 'plus', 'damn', 'thank', 'god']
Topic 3:	['level', 'action', 'second', 'bonus', 'rolled', 'sorry', 'wait', 'seven', 'natural', 'plus']
Topic 4:	['day', 'help', 'night', 'coming', 'great', 'love', 'man', 'nice', 'sam', 'thank']
Topic 5:	['fucking', 'man', 'second', 'thought', 'hold', 'action', 'didn', 'natural', 'sorry', 'wait']
Topic 6:	['sorry', 'second', 'luck', 'great', 'god', 'break', 'nott', 'rolled', 'cheering', 'natural']
Topic 7:	['big', 'bonus', 'didn', 'perception', 'action', 'nice', 'nott', 'great', 'fine', 'sorry']
Topic 8:	['fucking', 'action', 'gonna', 'nott', 'didn', 'perception', 'great', 'ooh', 'nice', 'fine']
Topic 9:	['bonus', 'cool', 'nott', 'great', 'action', 'ooh', 'fucking', 'advantage', 'perception'

## LDA

In [24]:
# LDA on the raw-count matrix (rows of cv_df are terms, columns are turns).
# NOTE(review): no random_state is passed, so topics may differ between runs —
# confirm whether reproducibility is needed.
lda = LatentDirichletAllocation(n_components=n_components)
topics = lda.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())

Topic 0:	['heard', 'gold', 'terrible', 'ask', 'dead', 'gets', 'grog', 'guy', 'tower', 'nott']
Topic 1:	['pulls', 'everybody', 'brings', 'idea', 'rope', 'ship', 'ooh', 'holding', 'stay', 'god']
Topic 2:	['correct', 'sutan', 'richter', 'horses', 'isn', 'giant', 'able', 'catch', 'moving', 'quite']
Topic 3:	['healing', 'damn', 'ulog', 'righty', 'step', 'cool', 'said', 'keg', 'far', 'door']
Topic 4:	['round', 'role', 'dice', 'bring', 'distance', 'critical', 'molly', 'advantage', 'direction', 'trying']
Topic 5:	['trinket', 'doesn', 'remember', 'boat', 'seeing', 'sounds', 'walking', 'person', 'percy', 'fine']
Topic 6:	['sense', 'saying', 'real', 'manage', 'hour', 'kill', 'used', 'yasha', 'bad', 'fucking']
Topic 7:	['cat', 'singing', 'standing', 'range', 'says', 'horse', 'wanted', 'definitely', 'great', 'thank']
Topic 8:	['happens', 'sondur', 'constitution', 'plan', 'shot', 'disadvantage', 'keyleth', 'wait', 'hits', 'sorry']
Topic 9:	['question', 'send', 'evening', 'wow', 'yep', 'ready', 'hey'

In [25]:
# LDA on the TF-IDF matrix (rows of tfidf_df are terms, columns are turns).
# NOTE(review): LDA is usually described as a model over word counts, while
# TF-IDF weights are fractional — confirm this pairing is intended.
# NOTE(review): no random_state is passed, so topics may differ between runs.
lda = LatentDirichletAllocation(n_components=n_components)
topics = lda.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())

Topic 0:	['fast', 'body', 'saying', 'moving', 'correct', 'person', 'definitely', 'ends', 'place', 'advantage']
Topic 1:	['haven', 'rope', 'total', 'athletics', 'gone', 'gets', 'tower', 'molly', 'yasha', 'perception']
Topic 2:	['die', 'best', 'inside', 'house', 'walk', 'stealth', 'scanlan', 'wow', 'hey', 'rolled']
Topic 3:	['idea', 'fuck', 'high', 'throw', 'bad', 'trying', 'spell', 'fine', 'sorry', 'god']
Topic 4:	['singing', 'round', 'healing', 'used', 'attacks', 'yep', 'ooh', 'gonna', 'seven', 'big']
Topic 5:	['cart', 'corner', 'wrong', 'counting', 'ulog', 'straight', 'mind', 'nope', 'getting', 'better']
Topic 6:	['ship', 'talking', 'day', 'investigation', 'work', 'close', 'thought', 'doesn', 'lot', 'man']
Topic 7:	['armor', 'sounds', 'percy', 'dice', 'laughs', 'damn', 'magic', 'righty', 'keg', 'water']
Topic 8:	['making', 'plan', 'ah', 'dead', 'leave', 'said', 'cool', 'great', 'nice', 'hits']
Topic 9:	['ask', 'save', 'course', 'seen', 'frumpkin', 'disadvantage', 'true', 'kill', 'end'

## SVD

In [26]:
# Truncated SVD on the raw-count matrix. Rows of cv_df are terms, so the
# result has one row per term and one column per component.
# NOTE(review): no random_state is passed; presumably results can vary between
# runs with the default solver — confirm.
svd = TruncatedSVD(n_components=n_components)
topics = svd.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())

Topic 0:	['begins', 'fjord', 'hand', 'begin', 'eyes', 'water', 'body', 'stone', 'left', 'city']
Topic 1:	['vanishes', 'glance', 'forward', 'flesh', 'skin', 'familiar', 'pool', 'eyes', 'stone', 'body']
Topic 2:	['information', 'elemental', 'zsundie', 'jester', 'body', 'stone', 'individual', 'algar', 'water', 'fjord']
Topic 3:	['second', 'body', 'orb', 'cultists', 'deeper', 'gathered', 'thar', 'amphala', 'tower', 'city']
Topic 4:	['members', 'elite', 'familiar', 'family', 'taken', 'trying', 'left', 'keg', 'shepherds', 'iron']
Topic 5:	['eyes', 'face', 'orc', 'arms', 'pole', 'half', 'voice', 'toad', 'devil', 'girl']
Topic 6:	['coming', 'wall', 'number', 'making', 'harbor', 'table', 'door', 'light', 'beautiful', 'lighthouse']
Topic 7:	['awesome', 'tuesday', 'week', 'new', 'episode', 'tonight', 'thank', 'twitch', 'role', 'critical']
Topic 8:	['decided', 'strange', 'merrow', 'brief', 'seen', 'cali', 'safe', 'tunnel', 'left', 'house']
Topic 9:	['black', 'darkness', 'hear', 'shadows', 'dark', 

In [27]:
# Truncated SVD on the TF-IDF matrix. Rows of tfidf_df are terms, so the
# result has one row per term and one column per component.
# NOTE(review): no random_state is passed; presumably results can vary between
# runs with the default solver — confirm.
svd = TruncatedSVD(n_components=n_components)
topics = svd.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())

Topic 0:	['yep', 'armor', 'strike', 'sneak', 'seven', 'definitely', 'natural', 'second', 'plus', 'hits']
Topic 1:	['nice', 'wait', 'man', 'fucking', 'natural', 'damn', 'thank', 'plus', 'god', 'fuck']
Topic 2:	['trying', 'amazing', 'rolled', 'wait', 'seven', 'fucking', 'plus', 'damn', 'thank', 'god']
Topic 3:	['rolled', 'thank', 'second', 'bonus', 'action', 'sorry', 'wait', 'seven', 'natural', 'plus']
Topic 4:	['sorry', 'fine', 'great', 'natural', 'nott', 'sam', 'man', 'wait', 'nice', 'thank']
Topic 5:	['big', 'action', 'nott', 'man', 'natural', 'fine', 'nice', 'didn', 'sorry', 'wait']
Topic 6:	['sorry', 'fucking', 'nice', 'ooh', 'great', 'man', 'rolled', 'beau', 'cheering', 'natural']
Topic 7:	['bonus', 'great', 'cool', 'big', 'action', 'nice', 'nott', 'fine', 'didn', 'sorry']
Topic 8:	['doesn', 'nott', 'perception', 'didn', 'advantage', 'bonus', 'great', 'nice', 'action', 'fine']
Topic 9:	['beau', 'far', 'ooh', 'great', 'advantage', 'bonus', 'didn', 'perception', 'action', 'nice']


## NMF

In [28]:
# NMF on the raw-count matrix. Rows of cv_df are terms, so the result has one
# row per term and one column per component.
# max_iter=500 sets the solver's iteration cap explicitly for this cell.
nmf = NMF(n_components=n_components, max_iter=500)
topics = nmf.fit_transform(cv_df)
print_top_terms(n_components=n_components, topics=topics, terms=cv.get_feature_names_out())



Topic 0:	['hand', 'zsundie', 'information', 'met', 'elemental', 'jester', 'individual', 'algar', 'water', 'fjord']
Topic 1:	['forward', 'gone', 'night', 'hand', 'water', 'pool', 'familiar', 'eyes', 'stone', 'body']
Topic 2:	['orc', 'arms', 'voice', 'eyes', 'man', 'devil', 'toad', 'half', 'girl', 'begins']
Topic 3:	['second', 'discovered', 'orb', 'cultists', 'deeper', 'thar', 'amphala', 'gathered', 'tower', 'city']
Topic 4:	['taken', 'managed', 'family', 'trying', 'run', 'left', 'keg', 'city', 'shepherds', 'iron']
Topic 5:	['worship', 'local', 'trostenwald', 'crown', 'near', 'town', 'slowly', 'small', 'empire', 'begin']
Topic 6:	['wall', 'walking', 'buildings', 'ocean', 'docks', 'beautiful', 'city', 'tower', 'harbor', 'lighthouse']
Topic 7:	['awesome', 'tuesday', 'week', 'new', 'episode', 'tonight', 'thank', 'twitch', 'role', 'critical']
Topic 8:	['brief', 'strange', 'number', 'seen', 'taken', 'cali', 'tunnel', 'safe', 'left', 'house']
Topic 9:	['hand', 'light', 'lights', 'dark', 'door'

In [29]:
# NMF on the TF-IDF matrix. Rows of tfidf_df are terms, so the result has one
# row per term and one column per component.
# NOTE(review): max_iter is 200 here but 500 in the count-based NMF cell —
# confirm the difference is intentional.
nmf = NMF(n_components=n_components, max_iter=200)
topics = nmf.fit_transform(tfidf_df)
print_top_terms(n_components=n_components, topics=topics, terms=tfidf.get_feature_names_out())



Topic 0:	['class', 'misses', 'barely', 'yep', 'armor', 'strike', 'sneak', 'definitely', 'second', 'hits']
Topic 1:	['ooh', 'sake', 'gonna', 'ah', 'cheering', 'fucking', 'jester', 'gil', 'man', 'fuck']
Topic 2:	['cool', 'terrible', 'better', 'didn', 'rolled', 'amazing', 'trying', 'fucking', 'damn', 'god']
Topic 3:	['dex', 'action', 'rolled', 'modifier', 'second', 'bonus', 'ooh', 'level', 'seven', 'plus']
Topic 4:	['day', 'better', 'help', 'night', 'great', 'coming', 'love', 'man', 'sam', 'thank']
Topic 5:	['action', 'big', 'doesn', 'wrong', 'man', 'hold', 'minutes', 'thought', 'didn', 'wait']
Topic 6:	['high', 'saving', 'body', 'throw', 'luck', 'break', 'second', 'rolled', 'cheering', 'natural']
Topic 7:	['saving', 'throw', 'bonus', 'beau', 'door', 'didn', 'action', 'great', 'nott', 'sorry']
Topic 8:	['sounds', 'didn', 'half', 'leave', 'water', 'gonna', 'door', 'nott', 'great', 'fine']
Topic 9:	['work', 'cool', 'man', 'advantage', 'fucking', 'ooh', 'bonus', 'action', 'perception', 'nice