In [20]:
import spacy
import gensim
import gensim.corpora as corpora
from gensim import models
from pprint import pprint

In [21]:
nlp = spacy.load("en_core_web_lg")

In [23]:
from os import listdir

# Name of the folder containing the files
folder_path = "/Users/sunnie/Desktop/School/UIUC/CS410/CourseProject/ami-transcripts"

# Get a list of filenames
filenames = listdir(folder_path)

ECallDocuments = [] # List to store all documents in the training corpus as a 'list of lists'

# For each file
for filename in filenames:
    # Create the filepath
    file_path = f"{folder_path}/{filename}"

    # Open the file (using "with" for file opening will autoclose the file at the end. It's a good practice)
    with open(file_path, "r") as f:
        # Get the file content
        ECallTxt = f.read()
        
        # Clean text
        ECallTxt = ECallTxt.strip()  # Remove white space at the beginning and end
        ECallTxt = ECallTxt.replace('\n', ' ') # Replace the \n (new line) character with space
        ECallTxt = ECallTxt.replace('\r', '') # Replace the \r (carriage returns -if you're on windows) with null
        ECallTxt = ECallTxt.replace(' ', ' ') # Replace " " (a special character for space in HTML) with space. 
        ECallTxt = ECallTxt.replace(' ', ' ') # Replace " " (a special character for space in HTML) with space.
        while '  ' in ECallTxt:
            ECallTxt = ECallTxt.replace('  ', ' ') # Remove extra spaces
        
        # Parse document with SpaCy
        ECall = nlp(ECallTxt)
        
        ECallDoc = [] # Temporary list to store individual document
    
        # Further cleaning and selection of text characteristics
        for token in ECall:
            if token.is_stop == False and token.is_punct == False and (token.pos_ == "NOUN" or token.pos_ == "ADJ" or token.pos_ =="VERB"): # Retain words that are not a stop word nor punctuation, and only if a Noun, Adjective or Verb
                ECallDoc.append(token.lemma_.lower()) # Convert to lower case and retain the lemmatized version of the word (this is a string object)
            
       
        # Append the content to the list
        ECallDocuments.append(ECallDoc) # Build the training corpus 'list of lists'

### NUMERIC REPRESENTATION OF TRAINING CORPUS USING BAG OF WORDS AND TF-IDF ###

# Form dictionary by mapping word IDs to words
ID2word = corpora.Dictionary(ECallDocuments)

# Set up Bag of Words and TFIDF
corpus = [ID2word.doc2bow(doc) for doc in ECallDocuments] # Apply Bag of Words to all documents in training corpus
TFIDF = models.TfidfModel(corpus) # Fit TF-IDF model
trans_TFIDF = TFIDF[corpus] # Apply TF-IDF model

### SET UP & TRAIN LDA MODEL ###

SEED = 75 # Set random seed
NUM_topics = 3 # Set number of topics
ALPHA = 0.9 # Set alpha
ETA = 0.35 # Set eta

# Train LDA model on the training corpus
lda_model = gensim.models.LdaMulticore(corpus=trans_TFIDF, num_topics=NUM_topics, id2word=ID2word, random_state=SEED, alpha=ALPHA, eta=ETA, passes=100)

In [30]:
#generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_topics, id2word = ID2word, passes = 200)
pprint(ldamodel.print_topics(num_topics=3, num_words=20))

[(0,
  '0.031*"mm" + 0.029*"think" + 0.017*"know" + 0.012*"remote" + 0.011*"thing" '
  '+ 0.011*"design" + 0.010*"good" + 0.010*"control" + 0.010*"movie" + '
  '0.009*"line" + 0.008*"want" + 0.007*"work" + 0.007*"going" + 0.007*"go" + '
  '0.007*"project" + 0.007*"mean" + 0.007*"look" + 0.006*"like" + 0.006*"draw" '
  '+ 0.006*"meeting"'),
 (1,
  '0.029*"mm" + 0.026*"know" + 0.025*"think" + 0.019*"mean" + 0.016*"thing" + '
  '0.011*"time" + 0.009*"work" + 0.009*"look" + 0.008*"want" + 0.008*"people" '
  '+ 0.007*"need" + 0.007*"good" + 0.007*"get" + 0.007*"word" + '
  '0.007*"meeting" + 0.007*"going" + 0.006*"way" + 0.006*"use" + 0.006*"stuff" '
  '+ 0.006*"right"'),
 (2,
  '0.033*"think" + 0.023*"mm" + 0.022*"button" + 0.019*"remote" + 0.017*"know" '
  '+ 0.014*"control" + 0.013*"thing" + 0.012*"use" + 0.011*"want" + '
  '0.011*"look" + 0.009*"mean" + 0.008*"need" + 0.008*"people" + '
  '0.008*"design" + 0.008*"good" + 0.008*"going" + 0.007*"get" + '
  '0.007*"colour" + 0.006*"go" + 0

In [24]:
# Print topics generated from the training corpus
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.000*"galaxy" + 0.000*"quasar" + 0.000*"continuum" + 0.000*"absorption" + '
  '0.000*"emission" + 0.000*"nominate" + 0.000*"flux" + 0.000*"posterior" + '
  '0.000*"damp" + 0.000*"gamma"'),
 (1,
  '0.000*"galaxy" + 0.000*"quasar" + 0.000*"continuum" + 0.000*"absorption" + '
  '0.000*"emission" + 0.000*"nominate" + 0.000*"flux" + 0.000*"posterior" + '
  '0.000*"gamma" + 0.000*"damp"'),
 (2,
  '0.002*"remote" + 0.002*"button" + 0.001*"scroll" + 0.001*"rubber" + '
  '0.001*"control" + 0.001*"wheel" + 0.001*"animal" + 0.001*"fruit" + '
  '0.001*"voice" + 0.001*"chip"')]


In [25]:
### GET TOPIC ALLOCATIONS FOR TRAINING CORPUS DOCUMENTS ###

doc_no = 0 # Set document counter
for doc in ECallDocuments:
    TFIDF_doc = TFIDF[corpus[doc_no]] # Apply TFIDF model to individual documents
    print(lda_model.get_document_topics(TFIDF_doc)) # Get and print document topic allocations
    doc_no += 1

print('-'*50)

[(0, 0.06986645), (1, 0.069869354), (2, 0.86026424)]
[(0, 0.09112012), (1, 0.09111749), (2, 0.81776243)]
[(0, 0.07858098), (1, 0.07858476), (2, 0.84283423)]
[(0, 0.10928683), (1, 0.10928966), (2, 0.7814235)]
[(0, 0.0613722), (1, 0.06137418), (2, 0.8772536)]
[(0, 0.08306101), (1, 0.08306363), (2, 0.83387536)]
[(0, 0.05633648), (1, 0.056338154), (2, 0.88732535)]
[(0, 0.10169335), (1, 0.10169672), (2, 0.79660994)]
[(0, 0.1021157), (1, 0.10212858), (2, 0.79575574)]
[(0, 0.06001427), (1, 0.06001498), (2, 0.8799707)]
[(0, 0.063970186), (1, 0.06397001), (2, 0.8720598)]
[(0, 0.05820639), (1, 0.05820844), (2, 0.8835852)]
[(0, 0.075100504), (1, 0.07510316), (2, 0.8497963)]
[(0, 0.058024798), (1, 0.058024384), (2, 0.8839508)]
[(0, 0.07274521), (1, 0.07273722), (2, 0.8545175)]
[(0, 0.07352267), (1, 0.07352241), (2, 0.8529549)]
[(0, 0.06612195), (1, 0.06612139), (2, 0.86775666)]
[(0, 0.07387409), (1, 0.07387418), (2, 0.8522517)]
[(0, 0.077706285), (1, 0.07770469), (2, 0.844589)]
[(0, 0.056113243), 

In [31]:
doc_no = 0 # Set document counter\n",
for doc in ECallDocuments:
    bof_doc = ldamodel[corpus[doc_no]] # Apply TFIDF model to individual documents
    print(ldamodel.get_document_topics(bof_doc)) # Get and print document topic allocations
    doc_no += 1
    
print('-'*50)

[(0, 0.18261838), (1, 0.21567237), (2, 0.60170925)]
[(0, 0.17462824), (1, 0.61472625), (2, 0.21064547)]
[(0, 0.18672107), (1, 0.18003319), (2, 0.63324577)]
[(0, 0.17396608), (1, 0.62107074), (2, 0.20496318)]
[(0, 0.18251841), (1, 0.21817459), (2, 0.599307)]
[(0, 0.17428634), (1, 0.616382), (2, 0.20933169)]
[(0, 0.18253054), (1, 0.21842319), (2, 0.5990463)]
[(0, 0.18621932), (1, 0.17952922), (2, 0.6342514)]
[(0, 0.17264675), (1, 0.6241066), (2, 0.2032467)]
[(0, 0.18672204), (1, 0.17996952), (2, 0.6333084)]
[(0, 0.17566268), (1, 0.6081623), (2, 0.21617503)]
[(0, 0.18262956), (1, 0.21833283), (2, 0.5990376)]
[(0, 0.1795998), (1, 0.5814766), (2, 0.23892355)]
[(0, 0.18333139), (1, 0.21935913), (2, 0.59730947)]
[(0, 0.18658824), (1, 0.17992242), (2, 0.6334894)]
[(0, 0.18253544), (1, 0.21821342), (2, 0.59925115)]
[(0, 0.1825351), (1, 0.21814042), (2, 0.5993245)]
[(0, 0.18226431), (1, 0.22034365), (2, 0.5973921)]
[(0, 0.17410655), (1, 0.61725706), (2, 0.20863634)]
[(0, 0.18118407), (1, 0.23015