In [46]:
import numpy as np
import os
import spacy
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from spacy.lang.en import English
import random
import pandas as pd
from tqdm import tqdm 
from collections import Counter

In [47]:
df = pd.read_excel('BetaData.xlsx')

In [48]:
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 20
sp = spacy.load('en_core_web_sm')

np.random.seed(42)
random.seed(42)

In [49]:
print(sp.Defaults.stop_words)

{'full', 'one', '’m', 'nowhere', 'whom', 'was', 'although', 'whence', 'herein', 'made', 'are', 'among', 'such', 'whose', 'five', 'still', 'at', 'your', 'except', 'again', 'doing', 'everyone', 'say', 'keep', 'most', 'four', 'not', 'give', 'what', 'cannot', 'across', 'ourselves', 'much', '‘s', 'anywhere', 'himself', 'anyhow', 'bottom', 'already', 'their', 'he', 'being', 'ever', 'never', 'fifty', "'re", 'this', 'it', 'by', 'meanwhile', 'anything', 'has', '’ve', 'just', 'side', 'n‘t', 'against', 'themselves', 'yourself', 'n’t', 'became', 'none', 'her', "'s", 'everything', 'two', 'did', 'done', 'put', 'between', '’re', 'than', 'eight', 'few', 'off', 'beforehand', 'upon', 'becomes', 'of', 'moreover', 'make', 'down', 'yours', 'six', 'show', 'up', 'nor', 'yourselves', 'about', 'whether', 'seems', 'becoming', 'had', 'must', 'seem', 'she', 'do', 'they', 'around', 'further', 'too', 'after', 'the', 'more', "'d", "'m", 'while', 'am', 'itself', 'an', 'should', 'third', 'nevertheless', 're', 'well', 

In [50]:
def generate_frequencies(data, max_docs = 10000):
    freqs = Counter()
    all_stopwords = sp.Defaults.stop_words
    all_stopwords.add("enron")
    nr_tokens = 0

    for doc in data[:max_docs]:
        if isinstance(doc, str):
            tokens = sp.tokenizer(doc)
            for token in tokens:
                token_text = token.text.lower()
                if token_text not in all_stopwords and token.is_alpha:
                    nr_tokens += 1
                    freqs[token_text] += 1
        else:
            continue

    return freqs

def get_vocab(freqs, freq_threshold=3):
    vocab = {}
    vocab_idx_str = {}
    vocab_idx = 0

    for word in freqs:
        if freqs[word] >= freq_threshold:
            vocab[word]= vocab_idx
            vocab_idx_str[vocab_idx] = word
            vocab_idx += 1
    return vocab, vocab_idx_str
        


def tokenize_dataset(data, vocab, max_docs= 10000):
    nr_tokens = 0
    nr_docs = 0
    docs = []

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(doc)

        if len(tokens) > 1:
            doc = []
            for token in tokens:
                token_text = token.text.lower()
                if token_text in vocab:
                    doc.append(token_text)
                    nr_tokens += 1
            nr_docs += 1
            docs.append(doc)

    print(f"Number of text messages: {nr_docs}")
    print(f"Number of tokens: {nr_tokens}")

    corpus = []
    for doc in docs:
        corpus_d = []

        for token in doc:
            corpus_d.append(vocab[token])

        corpus.append(np.asarray(corpus_d))

    return docs, corpus


In [51]:
data = df['Column1'].astype(str).sample(frac=1).values
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Number of text messages: 1425
Number of tokens: 1791
Vocab size: 246


In [52]:
def LDA_Collapsed_Gibbs(corpus, num_iter=200):

    Z = []
    num_docs = len(corpus)

    for _, doc in enumerate(corpus):
        Zd = np.random.randint(low=0, high = NUM_TOPICS, size=(len(doc)))
        Z.append(Zd)

    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d]==k)

    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic, word] += 1

    nk = np.sum(nkw, axis=1)
    topic_list = [i for i in range(NUM_TOPICS)]

    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                p_z = (ndk[doc_idx, :] + ALPHA * (nkw[:, word] + BETA)/ (nk[:] + BETA*vocab_size))
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

Z, ndk, nkw, nk = LDA_Collapsed_Gibbs(corpus)


100%|██████████| 200/200 [00:02<00:00, 88.62it/s]


In [53]:
phi = nkw / nk.reshape(NUM_TOPICS, 1)

num_words = 15
for k in range(NUM_TOPICS):
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most common words: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print("\n")

Topic 0 most common words: 
null
hey
vn
girl
pay
na
gon
leaving
tomorrow
yep
listen
soon
free
tell
yoh


Topic 1 most common words: 
okay
know
let
day
ready
coming
null
yoh
sushi
bed
taking
working
week
start
look


Topic 2 most common words: 
know
let
today
come
lecture
ready
want
finished
night
uber
kay
tell
kea
break
book


Topic 3 most common words: 
null
sure
yeah
morning
like
moment
asking
check
come
quick
voice
missed
hear
hi
paid


Topic 4 most common words: 
come
try
morning
going
null
sweet
group
pick
man
hours
days
know
cool
outside
wait


Topic 5 most common words: 
know
let
sorry
working
better
girl
awe
buy
place
thought
tenant
arrive
like
today
hey


Topic 6 most common words: 
said
missed
voice
like
think
yup
sorry
yes
year
time
asked
friend
thank
office
campus


Topic 7 most common words: 
yes
good
home
talk
yeah
okidoki
time
find
tlholo
friend
let
need
know
hana
ask


Topic 8 most common words: 
oh
sure
yeah
work
month
happening
going
hatfield
said
right
way
lecture
he