In [3]:
import numpy as np
import os
import spacy
import pandas as pd
from tqdm import tqdm 
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from spacy.lang.en import English
import random
from collections import Counter

In [4]:
df = pd.read_excel('BetaData.xlsx')

In [5]:
ALPHA = 0.5
BETA = 0.5
NUM_TOPICS = 40
sp = spacy.load('en_core_web_sm')

np.random.seed(100)
random.seed(100)

In [8]:
def generate_frequencies(data, max_docs = 10000):
    freqs = Counter()
    all_stopwords = sp.Defaults.stop_words
    all_stopwords.add('okidoki')
    all_stopwords.add('hey')
    all_stopwords.add('yep')
    all_stopwords.add('girl')
    all_stopwords.add('na')
    all_stopwords.add('nuh')
    all_stopwords.add('null')
    all_stopwords.add('yoh')
    all_stopwords.add('okay')
    all_stopwords.add('yup')
    all_stopwords.add('eish')
    all_stopwords.add('ai')
    all_stopwords.add('sure')
    all_stopwords.add('oh')
    all_stopwords.add('hi')
    all_stopwords.add('nope')
    all_stopwords.add('awe')
    all_stopwords.add('https')
    all_stopwords.add('ah')
    all_stopwords.add('heyo')
    all_stopwords.add('whoop')
    all_stopwords.add('yeah')
    all_stopwords.add('gon')
    all_stopwords.add('said')
    all_stopwords.add('yes')
    all_stopwords.add('know')
    all_stopwords.add("enron")
    nr_tokens = 0

    for doc in data[:max_docs]:
        if isinstance(doc, str):
            tokens = sp.tokenizer(doc)
            for token in tokens:
                token_text = token.text.lower()
                if token_text not in all_stopwords and token.is_alpha:
                    nr_tokens += 1
                    freqs[token_text] += 1
        else:
            continue

    return freqs

def get_vocab(freqs, freq_threshold=3):
    vocab = {}
    vocab_idx_str = {}
    vocab_idx = 0

    for word in freqs:
        if freqs[word] >= freq_threshold:
            vocab[word]= vocab_idx
            vocab_idx_str[vocab_idx] = word
            vocab_idx += 1
    return vocab, vocab_idx_str
        


def tokenize_dataset(data, vocab, max_docs= 10000):
    nr_tokens = 0
    nr_docs = 0
    docs = []

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(doc)

        if len(tokens) > 1:
            doc = []
            for token in tokens:
                token_text = token.text.lower()
                if token_text in vocab:
                    doc.append(token_text)
                    nr_tokens += 1
            nr_docs += 1
            docs.append(doc)

    print(f"Number of text messages: {nr_docs}")
    print(f"Number of tokens: {nr_tokens}")

    corpus = []
    for doc in docs:
        corpus_d = []

        for token in doc:
            corpus_d.append(vocab[token])

        corpus.append(np.asarray(corpus_d))

    return docs, corpus


In [9]:
data = df['Column1'].astype(str).sample(frac=1).values
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Number of text messages: 1425
Number of tokens: 1335
Vocab size: 221


In [10]:
def LDA_Collapsed_Gibbs(corpus, num_iter=200):

    Z = []
    num_docs = len(corpus)

    for _, doc in enumerate(corpus):
        Zd = np.random.randint(low=0, high = NUM_TOPICS, size=(len(doc)))
        Z.append(Zd)

    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d]==k)

    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic, word] += 1

    nk = np.sum(nkw, axis=1)
    topic_list = [i for i in range(NUM_TOPICS)]

    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                p_z = (ndk[doc_idx, :] + ALPHA * (nkw[:, word] + BETA)/ (nk[:] + BETA*vocab_size))
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

Z, ndk, nkw, nk = LDA_Collapsed_Gibbs(corpus)


100%|██████████| 200/200 [00:01<00:00, 103.42it/s]


In [11]:
phi = nkw / nk.reshape(NUM_TOPICS, 1)

num_words = 15
for k in range(NUM_TOPICS):
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most common words: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print("\n")

Topic 0 most common words: 
way
dumb
coming
time
better
wants
nah
delivery
sending
mind
thinking
trust
study
getting
paid


Topic 1 most common words: 
morning
today
time
send
happened
better
t
paid
worry
wanted
email
tomorrow
soon
tight
sleep


Topic 2 most common words: 
meet
page
laundry
chat
vn
listen
class
story
week
paid
mind
moment
year
crime
missed


Topic 3 most common words: 
thought
kill
send
year
tomorrow
tell
soon
tonight
weekend
moving
skipping
pick
told
excited
date


Topic 4 most common words: 
year
happy
let
going
new
hope
friend
delivery
pick
t
try
wrong
gate
happened
desk


Topic 5 most common words: 
outside
saying
wait
right
asked
talk
pay
meeting
leaving
hope
mind
google
started
soon
apartment


Topic 6 most common words: 
sorry
want
work
like
coming
told
think
forward
thing
change
text
outside
talking
got
hope


Topic 7 most common words: 
saw
ready
leave
time
let
update
things
friend
peanut
soon
sent
lost
got
actually
best


Topic 8 most common words: 
home
brin