In [60]:
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import spacy
from spacy.lang.en import English
import random
import pandas as pd
from tqdm import tqdm 
from collections import Counter

In [61]:
# Load the raw spreadsheet into a DataFrame. The path is relative, so the
# notebook must be run from the directory that contains BetaData.xlsx.
df = pd.read_excel("BetaData.xlsx")

In [62]:
# Model hyperparameters.
# NOTE(review): ALPHA / BETA look like the symmetric Dirichlet priors of a
# topic model (document-topic and topic-word respectively) -- confirm against
# the cell that consumes them.
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 20
# Blank English pipeline; the functions below use only its tokenizer and its
# default stop-word list.
sp = English()

# Seed both global RNGs (NumPy and the stdlib `random` module) so a fresh
# Restart & Run All is reproducible.
np.random.seed(42)
random.seed(42)

In [63]:
def generate_frequencies(data, max_docs = 10000):
    """Count word frequencies over the first ``max_docs`` documents.

    A token is counted when it is purely alphabetic (``token.is_alpha``) and
    its lower-cased text is neither a default English stop word nor the
    extra corpus-specific stop word ``"enron"``.

    Parameters
    ----------
    data : sequence
        Raw documents. Entries that are not ``str`` are skipped.
    max_docs : int, optional
        Only ``data[:max_docs]`` is inspected.

    Returns
    -------
    collections.Counter
        Maps each lower-cased token text to its number of occurrences.
    """
    # Build a private copy of the stop-word set. `sp.Defaults.stop_words` is
    # shared by the language class, so adding to it in place would silently
    # change the stop-word list for everything else in the kernel.
    all_stopwords = set(sp.Defaults.stop_words) | {"enron"}
    freqs = Counter()

    for doc in data[:max_docs]:
        if not isinstance(doc, str):
            # Non-text cells carry no tokens to count.
            continue
        for token in sp.tokenizer(doc):
            token_text = token.text.lower()
            if token.is_alpha and token_text not in all_stopwords:
                freqs[token_text] += 1

    return freqs

def get_vocab(freqs, freq_threshold=3):
    """Build word<->index lookup tables for sufficiently frequent words.

    Parameters
    ----------
    freqs : mapping of str -> int
        Word counts, e.g. the Counter returned by ``generate_frequencies``.
    freq_threshold : int, optional
        Minimum count (inclusive) a word needs to enter the vocabulary.

    Returns
    -------
    (dict, dict)
        ``vocab`` maps word -> index and ``vocab_idx_str`` maps index -> word.
        Indices are consecutive from 0, assigned in the iteration order of
        ``freqs``.
    """
    # Keep only the words that clear the threshold, preserving their order.
    kept_words = [word for word, count in freqs.items() if count >= freq_threshold]

    vocab = {word: index for index, word in enumerate(kept_words)}
    vocab_idx_str = dict(enumerate(kept_words))
    return vocab, vocab_idx_str
        


def tokenize_dataset(data, vocab, max_docs= 10000):
    """Tokenize documents and encode them as arrays of vocabulary indices.

    Only the first ``max_docs`` entries of ``data`` are processed. Entries
    that are not ``str`` are skipped (mirroring ``generate_frequencies``),
    as are documents that tokenize to one token or fewer. Within a kept
    document only lower-cased tokens present in ``vocab`` are retained, so a
    kept document may end up empty.

    Parameters
    ----------
    data : sequence
        Raw documents.
    vocab : mapping of str -> int
        Word -> index table, e.g. from ``get_vocab``.
    max_docs : int, optional
        Upper bound on how many entries of ``data`` are looked at.

    Returns
    -------
    (list[list[str]], list[numpy.ndarray])
        ``docs`` holds the retained token strings per document; ``corpus``
        holds the matching integer index arrays, aligned with ``docs``.

    Prints the number of kept documents and the total number of kept tokens.
    """
    nr_tokens = 0
    docs = []

    for raw_doc in data[:max_docs]:
        if not isinstance(raw_doc, str):
            # The tokenizer only accepts text; skip anything else.
            continue

        tokens = sp.tokenizer(raw_doc)
        if len(tokens) <= 1:
            continue

        lowered = (token.text.lower() for token in tokens)
        kept_tokens = [text for text in lowered if text in vocab]
        nr_tokens += len(kept_tokens)
        # NOTE(review): documents with no in-vocabulary tokens are still kept
        # (as empty lists), exactly as before -- confirm downstream code wants
        # them rather than having them filtered out.
        docs.append(kept_tokens)

    print(f"Number of text messages: {len(docs)}")
    print(f"Number of tokens: {nr_tokens}")

    # Force an integer dtype: np.asarray([]) would otherwise default to
    # float64 for empty documents, giving arrays that are inconsistent with
    # the rest of the corpus and unusable as index arrays.
    corpus = [
        np.asarray([vocab[token] for token in doc_tokens], dtype=int)
        for doc_tokens in docs
    ]

    return docs, corpus



In [64]:
# Drop missing cells *before* casting to str: astype(str) would otherwise turn
# NaN into the literal text "nan", which then gets counted as a real word.
# random_state pins the shuffle so re-running this cell on its own gives the
# same document order instead of depending on the global NumPy RNG state.
data = df['Column1'].dropna().astype(str).sample(frac=1, random_state=42).values
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Number of text messages: 1618
Number of tokens: 2177
Vocab size: 248
