In [58]:
import numpy as np
import spacy
import random
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [59]:
# Load the report spreadsheet into a DataFrame. The path is relative, so the
# file must be in the kernel's current working directory.
df = pd.read_excel('Pranav_RGA_Reports.xlsx')



In [60]:
# Hyperparameters read as globals by LDA_Collapsed_Gibbs further down.
# ALPHA is added to the document-topic counts and BETA to the topic-word
# counts when the sampling weights are computed; NUM_TOPICS is the number of
# topics each token can be assigned to.
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 4
# spaCy pipeline; the cells below use only its tokenizer and default stop words.
sp = spacy.load("en_core_web_sm")

# Seed both generators: the sampler draws initial assignments from np.random
# and resamples topics with random.choices.
np.random.seed(42)
random.seed(42)

In [61]:
def generate_frequencies(data, max_docs=80, extra_stopwords=None):
    """Count how often each content word occurs in the first ``max_docs`` documents.

    A token is counted when it is purely alphabetic and its lower-cased text
    is not a stop word. Counting is done on the lower-cased text.

    Args:
        data: Sliceable sequence of raw document strings. Entries that are not
            strings (e.g. NaN from empty spreadsheet cells) are skipped.
        max_docs: Only ``data[:max_docs]`` is scanned.
        extra_stopwords: Optional iterable of additional words to exclude, on
            top of spaCy's default English stop words. Matched case-insensitively.

    Returns:
        collections.Counter mapping lower-cased token text to its count.
    """
    stopwords = sp.Defaults.stop_words
    if extra_stopwords:
        # Union into a new set so spaCy's shared default stop-word set is
        # never mutated by this function.
        stopwords = stopwords | {word.lower() for word in extra_stopwords}
    # Domain words previously tried as extra stop words (kept for reference):
    # application, inspection, review, customer, fail, engineering, returned,
    # lasted, bearings, clutch, machine, reported, clutches, set, units, seizure.

    freqs = Counter()
    for doc in data[:max_docs]:
        if not isinstance(doc, str):
            # The tokenizer only accepts text; ignore missing/non-text cells.
            continue
        for token in sp.tokenizer(doc):
            token_text = token.text.lower()
            if token.is_alpha and token_text not in stopwords:
                freqs[token_text] += 1

    return freqs

def get_vocab(freqs, freq_threshold=3):
    """Build word<->index lookup tables for sufficiently frequent words.

    Args:
        freqs: Mapping from word to its count.
        freq_threshold: Minimum count (inclusive) for a word to be kept.

    Returns:
        Tuple ``(vocab, vocab_idx_str)``: ``vocab`` maps each kept word to a
        consecutive integer id starting at 0 (in the iteration order of
        ``freqs``), and ``vocab_idx_str`` is the inverse id -> word mapping.
    """
    kept_words = [word for word in freqs if freqs[word] >= freq_threshold]

    word_to_id = {}
    id_to_word = {}
    for idx, word in enumerate(kept_words):
        word_to_id[word] = idx
        id_to_word[idx] = word

    return word_to_id, id_to_word

def tokenize_dataset(data, vocab, max_docs=80):
    """Tokenize documents and encode them as arrays of vocabulary ids.

    Only the first ``max_docs`` entries of ``data`` are considered. A document
    is kept when the tokenizer yields more than one token; its tokens are
    lower-cased and those not present in ``vocab`` are dropped. A kept
    document may therefore end up empty.

    Args:
        data: Sliceable sequence of raw document strings. Entries that are not
            strings (e.g. NaN from empty spreadsheet cells) are skipped.
        vocab: Mapping from lower-cased word to integer id.
        max_docs: Only ``data[:max_docs]`` is processed.

    Returns:
        Tuple ``(docs, corpus)``: ``docs`` is a list of token-string lists and
        ``corpus`` is the parallel list of integer numpy arrays of word ids.

    Side effects:
        Prints the number of kept documents and the total number of kept tokens.
    """
    docs = []

    for raw_doc in data[:max_docs]:
        if not isinstance(raw_doc, str):
            # The tokenizer only accepts text; ignore missing/non-text cells.
            continue

        tokens = sp.tokenizer(raw_doc)
        if len(tokens) > 1:
            lowered = (token.text.lower() for token in tokens)
            docs.append([text for text in lowered if text in vocab])

    nr_docs = len(docs)
    nr_tokens = sum(len(doc_tokens) for doc_tokens in docs)

    print(f"Number of emails: {nr_docs}")
    print(f"Number of tokens: {nr_tokens}")

    # Force an integer dtype: np.asarray([]) would otherwise be float64, which
    # cannot be used to index the count matrices downstream.
    corpus = [
        np.asarray([vocab[text] for text in doc_tokens], dtype=int)
        for doc_tokens in docs
    ]

    return docs, corpus



In [62]:
# Draw a reproducible 10% sample of the 'Description' column as a numpy array.
# NOTE(review): assumes every sampled cell is a string — confirm there are no
# empty cells in the spreadsheet column.
data = df['Description'].sample(frac=0.1, random_state=10).values
# Prints the full text of every sampled description.
print(data)
# Both helpers below look only at the first 80 entries of `data` (their
# default max_docs), and words seen fewer than 3 times are left out of the
# vocabulary (get_vocab's default freq_threshold).
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
# vocab_size is read as a global by the Gibbs sampler cell further down.
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")
print(vocab_idx_str)


['EFP 308: Evaluation \n\nThe returned item was inspected and per the attached Application Engineering review, no \nmanufacturing anomalies were detected and cause of customer problem is believed to be \napplication related. Therefore credit cannot be issued for this unit.'
 'Inspection only  \n\n8 units were returned for inspection to Florence, KY RGA lab.  It was reported two of \nthese units were removed from in-service machine until seizure occurred, but it wasn’t \nclear how long the other 6 lasted.  The inspection was to determine any observed \ndamages that could help identify possible issues relating to why recent returns haven’t \nlasted to its expected bearing life. Typically, bearing life was reached at minimum of 2 \nyears.  Lately, some returns have lasted less than 1 year.'
 'Chatter, begin to slip or stall and then go \n\nThese units have been used many years on a printer ink roller machine application that \nhasn’t changed its operating condition.  Typically, these clut

In [63]:
def LDA_Collapsed_Gibbs(corpus, num_itr=200, num_topics=None, n_vocab=None,
                        alpha=None, beta=None):
    """Run collapsed Gibbs sampling for LDA over an id-encoded corpus.

    Every token starts with a uniformly random topic. Each sweep then visits
    every token, removes it from the count tables, draws a new topic with
    weight ``(ndk + alpha) * (nkw + beta) / (nk + beta * n_vocab)`` and adds
    it back under the new topic.

    Args:
        corpus: List of 1-D integer arrays, one per document, holding word ids
            in ``[0, n_vocab)``.
        num_itr: Number of full sweeps over the corpus.
        num_topics: Number of topics. Defaults to the module-level ``NUM_TOPICS``.
        n_vocab: Vocabulary size. Defaults to the module-level ``vocab_size``.
        alpha: Value added to document-topic counts. Defaults to module-level ``ALPHA``.
        beta: Value added to topic-word counts. Defaults to module-level ``BETA``.

    Returns:
        Tuple ``(Z, ndk, nkw, nk)``:
            Z   -- list of integer arrays, the topic assigned to each token;
            ndk -- (num_docs, num_topics) document-topic counts;
            nkw -- (num_topics, n_vocab) topic-word counts;
            nk  -- (num_topics,) total tokens per topic.

    Notes:
        Randomness comes from ``np.random`` (initial assignments) and
        ``random`` (resampling); seed both for reproducible results.
    """
    # Fall back to the notebook-level settings only when the caller did not
    # pass explicit values, so existing calls behave as before.
    if num_topics is None:
        num_topics = NUM_TOPICS
    if n_vocab is None:
        n_vocab = vocab_size
    if alpha is None:
        alpha = ALPHA
    if beta is None:
        beta = BETA

    num_docs = len(corpus)

    # Random initial topic for every token, one array per document.
    Z = [np.random.randint(low=0, high=num_topics, size=(len(doc))) for doc in corpus]

    # Document-topic counts from the initial assignment.
    ndk = np.zeros((num_docs, num_topics))
    for d in range(num_docs):
        ndk[d] = np.bincount(Z[d], minlength=num_topics)

    # Topic-word counts from the initial assignment.
    nkw = np.zeros((num_topics, n_vocab))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            nkw[Z[doc_idx][i], word] += 1

    nk = np.sum(nkw, axis=1)
    topic_list = list(range(num_topics))

    for _ in tqdm(range(num_itr)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                # Take the token out of all counts so the weights below are
                # conditioned on every other assignment but not this one.
                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                # Unnormalised weights; random.choices normalises them.
                p_z = (ndk[doc_idx, :] + alpha) * (nkw[:, word] + beta) / (nk[:] + beta * n_vocab)
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                # Record the new assignment and put the token back.
                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

# Run the sampler on the encoded corpus with its default number of sweeps and
# keep the topic assignments plus the three count tables for the next cell.
Z, ndk, nkw, nk = LDA_Collapsed_Gibbs(corpus)

100%|██████████| 200/200 [00:00<00:00, 199871.53it/s]


In [64]:
# Topic-word distributions, one row per topic. The BETA smoothing matches the
# weights used inside the sampler and keeps every row finite: dividing the raw
# counts by nk would produce a NaN row for any topic that ended up with no
# tokens. Word ranking within a topic is the same as with raw counts.
phi = (nkw + BETA) / (nk[:, np.newaxis] + BETA * nkw.shape[1])

# Number of top-ranked words to list for each topic.
num_words = 1
for k in range(NUM_TOPICS):
    # argsort is ascending, so reverse it and keep the leading entries.
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most common words: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print('\n')

Topic 0 most common words: 


Topic 1 most common words: 


Topic 2 most common words: 


Topic 3 most common words: 


