In [11]:
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


"""
Description
This is a Natural Language Processing(NLP) Based App useful for basic NLP concepts such as follows;
+ Tokenization & Lemmatization using Spacy
+ Named Entity Recognition(NER) using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/Sumy
This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.

Purpose
To perform basic and useful NLP task with Streamlit,Spacy,Textblob and Gensim/Sumy

"""
# Core Pkgs
import streamlit as st
import os
from PIL import Image 

#Visualization
import matplotlib.pyplot as plt 
import matplotlib
matplotlib.use("Agg")
import seaborn as sns

#Open Ai GPT-3
import openai
openai.api_key = "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"

# NLP Pkgs
from textblob import TextBlob
import spacy
from gensim.summarization.summarizer import summarize
import nltk
nltk.download('punkt')

# Sumy Summary Pkg
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
#from dataset_milestone1 import df
#from cancer_dataset import cancer as df

#DATA_URL = df
st.markdown("# PationCom™")
st.markdown("By Reda Mastouri & Kalyani Pavuluri")
original_title = '<p style="color:Orange; font-size: 30px;">Examination of Digital Community Conversations Within Specific Disease States Via Reddit</p>'
st.markdown(original_title, unsafe_allow_html=True)

img=Image.open('img/logo.png')
st.image(img,width=200)
st.markdown('''
- **Vision**: Development of a repeatable process for the analysis of Reddit conversations
within specific condition and/or disease state with applicable threads and subreddit
threads (subreddits) to potentially inform strategy and content development. Create a
simplified and repeatable process that does not require the users to be fluent in Reddit.
- **Issue**: While Reddit offers robust, open, and community-minded discussions surrounding
conditions and disease states, Reddit also provides volumes of unstructured and
unclassified data. The development of a repeatable process – that continues to monitor
evolving conversations over time – currently requires multiple tools (ex. – tools to scrape
threads, tools to analyze keyword content, tools to analyze sentiment, etc.).
- **Method**: After identifying priority conditions and/or disease states with active Reddit
communities (ex. – prostate cancer, breast cancer, HIV, etc.), build relational taxonomy
(ex. – medicine, treatment, and adherence all have specific topics but have relational
discussions) of topical themes addressed within.
- **Potential Output**: Provide use case for healthcare companies on the importance of
Reddit as an early source of social indicator of trends and conversational “lexicon” to be
used for patient communications and programs.
''')
st.markdown("The data presented is of 5 different diseases - **Cancer, ProstateCancer, HIV, heart disease and cerebrovascular disease,** collected from PRAW API **https://praw.readthedocs.io/**")

if st.button("Learn more about Reda Mastouri and Kalyani Pavuluri"):
    reda=Image.open('img/mastouri.png')
    kalyani=Image.open('img/kalyani.png')
    st.markdown('''**Reda Mastouri ** Reda Mastouri is Security Data Scientist with a passion for teaching and coaching. | Data Analytics | Machine Learning | Predictive Modeling | Data Visualization | NLP | Network Analytics | Network Security | Ethical Hacking |
He is knowledgeable and technically certified engineer with 7 years of continued hands-on experience in the implementation, administration and troubleshooting..''')
    st.image(reda,width=200, caption="Reda Mastouri 🤵‍")
    
    st.markdown('''<br>**Reda Mastouri ** Reda Mastouri is Security Data Scientist with a passion for teaching and coaching. | Data Analytics | Machine Learning | Predictive Modeling | Data Visualization | NLP | Network Analytics | Network Security | Ethical Hacking |
He is knowledgeable and technically certified engineer with 7 years of continued hands-on experience in the implementation, administration and troubleshooting..''')
    st.image(kalyani,width=200, caption="Kalyani Pavuluri 👩‍💼‍")
    
    st.markdown("The data was collected and made available by **[Reda Mastouri](https://www.linkedin.com/in/reda-mastouri/**.")
    st.markdown("and **[Kalyani Pavuluri](https://www.linkedin.com/in/kalyani-pavuluri-30416519**.")
    images=Image.open('img/presentation.png')
    st.image(images,width=700)
    #Ballons
    st.balloons()



#GPT-3 Text summarizer
def gptSummarizer(text):
    import os
    import openai

    openai.api_key = "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"

    response = openai.Completion.create(
      engine="davinci-instruct-beta",
      prompt=text,
      temperature=1,
      max_tokens=100,
      top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=0.0
    )
    #A = response.get('choices')[0]
    #answer = A.get('text')
    return response


# Function for Sumy Summarization
def sumy_summarizer(docx):
	parser = PlaintextParser.from_string(docx,Tokenizer("english"))
	lex_summarizer = LexRankSummarizer()
	summary = lex_summarizer(parser.document,3)
	summary_list = [str(sentence) for sentence in summary]
	result = ' '.join(summary_list)
	return result

# Function to Analyse Tokens and Lemma
@st.cache
def text_analyzer(my_text):
	nlp = spacy.load('en_core_web_sm')
	docx = nlp(my_text)
	# tokens = [ token.text for token in docx]
	allData = [('"Token":{},\n"Lemma":{}'.format(token.text,token.lemma_))for token in docx ]
	return allData

# Function For Extracting Entities
@st.cache
def entity_analyzer(my_text):
	nlp = spacy.load('en_core_web_sm')
	docx = nlp(my_text)
	tokens = [ token.text for token in docx]
	entities = [(entity.text,entity.label_)for entity in docx.ents]
	allData = ['"Token":{},\n"Entities":{}'.format(tokens,entities)]
	return allData

placeholder = '''
In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills. Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services. As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses. The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and the jobs of tomorrow will require a different skillset. This will require more collaborations and training and working with AI. That’s why it has become more critical than ever for educational institutions to integrate new cloud and AI technologies. The program is an attempt to ramp up the institutional set-up and build capabilities among the educators to educate the workforce of tomorrow." The program aims to build up the cognitive skills and in-depth understanding of developing intelligent cloud connected solutions for applications across industry. Earlier in April this year, the company announced Microsoft Professional Program In AI as a learning track open to the public. The program was developed to provide job ready skills to programmers who wanted to hone their skills in AI and data science with a series of online courses which featured hands-on labs and expert instructors as well. This program also included developer-focused AI school that provided a bunch of assets to help build AI skills.

'''
def main():
	""" NLP Based App with Streamlit """

	# Title
	st.title("Let's get started ..")
	st.subheader("Description")
	st.markdown('''
    	+ Because Reddit is regarded as one of the most effective social network sources for tracking the prevalence of public interests in infectious diseases (e.g., Coronavirus, HIV, and cancer) and controversial health-related issues (e.g., electronic cigarettes and marijuana) over time, reporting on findings derived from social media data nowadays becomes critical for understanding public reactions to infectious diseases. 

        + As a result, we require a faster, more intelligent, and more accurate sentiment analyzer and web scrapper-based engine capable of tracking the latest trends on novel diseases, as well as any conversational "lexicon."
        
        + This will serve as a social indicator, providing a collection of use cases for healthcare companies to sensitize consumers through various mediums, communications, and programs to learn about either polemics or significant takeaways from what is happening in social media.
        
        Click any of the checkboxes to get started.
    	''')

	# Summarization
	if st.checkbox("Get the summary of your text"):
		st.subheader("Summarize Your Text")

		message = st.text_area("Enter Text",placeholder)
		summary_options = st.selectbox("Choose Summarizer",['GPT-3','sumy','gensim'])
		if st.button("Summarize"):
			if summary_options == 'sumy':
				st.text(placeholder)
				summary_result = sumy_summarizer(message)
			elif summary_options == 'GPT-3':
				st.text(placeholder)
				summary_result = gptSummarizer(message)
			elif summary_options == 'gensim':
				st.text(placeholder)
				summary_result = summarize(message)
			else:
				st.warning("Using Default Summarizer")
				st.text("Using Gensim Summarizer ..")
				summary_result = summarize(message)
			st.success(summary_result)

	# Sentiment Analysis
	if st.checkbox("Get the Sentiment Score of your text"):
		st.subheader("Identify Sentiment in your Text")

		message = st.text_area("Enter Text","Type Here...")
		if st.button("Analyze"):
			blob = TextBlob(message)
			result_sentiment = blob.sentiment
			st.success(result_sentiment)

	#Sentiment Analysis
	if option == 'Sentiment Analysis':
		#Creating graph for sentiment across each sentence in the text inputted
		sents = sent_tokenize(text) #tokenizing the text data into a list of sentences
		entireText = TextBlob(text) #storing the entire text in one string
		sentScores = [] #storing sentences in a list to plot
		for sent in sents:
			text = TextBlob(sent) #sentiment for each sentence
			score = text.sentiment[0] #extracting polarity of each sentence
			sentScores.append(score) 

		#Plotting sentiment scores per sentencein line graph
		st.line_chart(sentScores) #using line_chart st call to plot polarity for each sentence
    

	# Entity Extraction
	if st.checkbox("Get the Named Entities of your text"):
		st.subheader("Identify Entities in your text")

		message = st.text_area("Enter Text","Type Here..")
		if st.button("Extract"):
			entity_result = entity_analyzer(message)
			st.json(entity_result)

	# Tokenization
	if st.checkbox("Get the Tokens and Lemma of text"):
		st.subheader("Tokenize Your Text")

		message = st.text_area("Enter Text","Type Here.")
		if st.button("Analyze"):
			nlp_result = text_analyzer(message)
			st.json(nlp_result)



	st.sidebar.subheader("About the App")
	logobottom=Image.open('img/logo.png')
	st.sidebar.image(logobottom,width=150)
	st.sidebar.text("PatientCom via REDDIT 🤖")
	st.sidebar.info("Examination of Digital Community Conversations Within Specific Disease States Via Reddit")   
	st.sidebar.markdown("[Data Source API](https://praw.readthedocs.io/en/stable/")
	st.sidebar.info("Linkedin [Reda Mastouri](https://www.linkedin.com/in/reda-mastouri/) ")
	st.sidebar.info("Linkedin [Kalyani Pavuluri](https://www.linkedin.com/in/kalyani-pavuluri-30416519) ")
	st.sidebar.text("PationCom™ - Copyright © 2021")




if __name__ == '__main__':
	main()


# In[ ]:






[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rmastour\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
import pandas as pd
df = pd.read_csv('dataset/cancer.csv')
df

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,UID,comments
0,I’m not ready.,632,9isza1,cancer,https://www.reddit.com/r/cancer/comments/9isza...,82,In 2016 I was diagnosed with stage 4 of a seve...,2018-09-25 15:13:21+00:00,2,...
1,"I’ve got 2-4 weeks to live, we’re in the end g...",590,m1h5h5,cancer,https://www.reddit.com/r/cancer/comments/m1h5h...,149,Hey everyone you might have seen my post on he...,2021-03-09 21:42:48+00:00,3,...
2,Onto Hospice. End of journey.,458,8y27xr,cancer,https://www.reddit.com/r/cancer/comments/8y27x...,108,"The last treatment option, Immunotherapy, for ...",2018-07-11 18:41:20+00:00,9,...
3,"Diagnosed in June, Dead in August",438,cuo28h,cancer,https://www.reddit.com/r/cancer/comments/cuo28...,49,My wonderful husband was diagnosed with cancer...,2019-08-24 03:00:29+00:00,10,...
4,Goodbye my sweet angel. I Lost my 5 year old d...,441,e1o110,cancer,https://www.reddit.com/r/cancer/comments/e1o11...,47,We had an incredible six months together after...,2019-11-25 22:31:46+00:00,11,...
...,...,...,...,...,...,...,...,...,...,...
66,I am getting a Bone Marrow Transplant tonight!!!,204,asu1lg,cancer,https://www.reddit.com/r/cancer/comments/asu1l...,67,Please give me those good reddit vibes. Quick ...,2019-02-20 21:29:55+00:00,94,...
67,"They say you die twice: once when you pass, an...",207,n9fmjd,cancer,https://www.reddit.com/r/cancer/comments/n9fmj...,29,"Maybe I'm posting this for my own benefit, but...",2021-05-10 20:46:10+00:00,95,...
68,Today is my 10th Cancerversay - diagnosed with...,203,jlosty,cancer,https://www.reddit.com/r/cancer/comments/jlost...,35,I (38M) got my biopsy results 10 years ago on ...,2020-10-31 19:46:51+00:00,98,...
69,"Hi, I'm NED!!",204,l7xc0p,cancer,https://www.reddit.com/r/cancer/comments/l7xc0...,44,Last year I was diagnosed with Stage 4 bowel c...,2021-01-29 16:46:41+00:00,99,...


In [18]:
import seaborn as sns
sns.histplot(df['score']);

In [20]:
sns.histplot(df['num_comments']);

In [29]:
#Scattertext
Description_Scattertext='''
Scattertext is a Python package that lets you interactively visualize how two categories of text are different from each other (Kessler 2017). 
Most of the work I’ve done on Scattertext focuses on how you can visualize the differences in how single words and (and bigrams) are used with different frequencies across categories.
'''

def scattertextvisualizer(dataframe):
    import pytextrank, spacy
    import scattertext
    nlp = spacy.load('en_core_web_sm')
    corpus_dataframe = dataframe
    
    corpus = (scattertext.CorpusFromPandas(dataframe,
                                           category_col='body', 
                                           text_col='comments',
                                           nlp=nlp)
              .build()
              .remove_terms(nlp.Defaults.stop_words, ignore_absences=True)
              )
    html = scattertext.produce_scattertext_explorer(
                       corpus,
                       category='comments',
                       category_name='body',
                       not_category_name='score',
                       width_in_pixels=1000,
                       metadata=corpus_dataframe['title'])
    return html

In [64]:
def mywordcloud(dataframe):
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import warnings
    warnings.filterwarnings("ignore")
    from collections import Counter
    c = Counter()    
    
    plt.figure(figsize = (20,20))
    W_C = WordCloud(min_font_size=3, max_words=3200, width=1600, height=850, stopwords=STOPWORDS).generate(str(" ".join(dataframe.title)))
    return plt.imshow(W_C, interpolation='bilinear')
    #return W_C

In [66]:
mywordcloud(df)

<matplotlib.image.AxesImage at 0x1f9b7350880>

In [61]:
plt.figure(figsize=[20,10])
plt.imshow(mywordcloud(df), interpolation='bilinear')
plt.axis("off")
plt.show()

In [63]:
plt.savefig("img/cancer_wordcloud.png", format="png")
plt.show()

In [104]:
#LDA
def ldavisualizer(dataset):
    #librairies
    import warnings
    warnings.filterwarnings("ignore")
    # Run in python console
    import nltk; 
    nltk.download('stopwords')

    # Gensim
    '''
    NLP Librairies
    '''
    # Gensim
    import gensim
    import gensim.corpora as corpora
    from gensim.utils import simple_preprocess
    from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel

    # spacy for lemmatization
    import spacy

    # Plotting tools
    import pyLDAvis
    #import pyLDAvis.gensim 
    import pyLDAvis.gensim_models
    import matplotlib.pyplot as plt
    %matplotlib inline

    # Enable logging for gensim - optional
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

    import warnings
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    
    #Body pkgs
    def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
    
    #wordings
    df_words = list(sent_to_words(dataset))
    
    # Build the bigram and trigram models

    bigram = gensim.models.Phrases(df_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[df_words], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    

    
    #remove stop words
    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent)) 
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out
    # NLTK Stop words

    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use','a','about', 'above', 'across'])
    
    st1= ['after', 'afterwards','again','against', 'all', 'almost','alone','along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides',
           'between',
           'beyond',
           'bill',
           'both',
           'bottom',
           'but',
           'by',
           'call',
           'can',
           'cannot',
           'cant',
           'co',
           'con',
           'could',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'do',
           'done',
           'down',
           'due',
           'during',
           'each',
           'eg',
           'eight',
           'either',
           'eleven',
           'else',
           'elsewhere',
           'empty',
           'enough',
           'etc',
           'even',
           'ever',
           'every',
           'everyone',
           'everything',
           'everywhere',
           'except',
           'few',
           'fifteen',
           'fifty',
           'fill',
           'find',
           'fire',
           'first',
           'five',
           'for',
           'former',
           'formerly',
           'forty',
           'found',
           'four',
           'from',
           'front',
           'full',
           'further',
           'get',
           'give',
           'go',
           'had',
           'has',
           'hasnt',
           'have',
           'he',
           'hence',
           'her',
           'here',
           'hereafter',
           'hereby',
           'herein',
           'hereupon',
           'hers',
           'herself',
           'him',
           'himself',
           'his',
           'how',
           'however',
           'hundred',
           'i',
           'ie',
           'if',
           'in',
           'inc',
           'indeed',
           'interest',
           'into',
           'is',
           'it',
           'its',
           'itself',
           'keep',
           'last',
           'latter',
           'latterly',
           'least',
           'less',
           'ltd',
           'made',
           'many',
           'may',
           'me',
           'meanwhile',
           'might',
           'mill',
           'mine',
           'more',
           'moreover',
           'most',
           'mostly',
           'move',
           'much',
           'must',
           'my',
           'myself',
           'name',
           'namely',
           'neither',
           'never',
           'nevertheless',
           'next',
           'nine',
           'no',
           'nobody',
           'none',
           'noone',
           'nor',
           'not',
           'nothing',
           'now',
           'nowhere',
           'of',
           'off',
           'often',
           'on',
           'once',
           'one',
           'only',
           'onto',
           'or',
           'other',
           'others',
           'otherwise',
           'our',
           'ours',
           'ourselves',
           'out',
           'over',
           'own',
           'part',
           'per',
           'perhaps',
           'please',
           'put',
           'rather',
           're',
           'same',
           'see',
           'seem',
           'seemed',
           'seeming',
           'seems',
           'serious',
           'several',
           'she',
           'should',
           'show',
           'side',
           'since',
           'sincere',
           'six',
           'sixty',
           'so',
           'some',
           'somehow',
           'someone',
           'something',
           'sometime',
           'sometimes',
           'somewhere',
           'still',
           'such',
           'system',
           'take',
           'ten',
           'than',
           'that',
           'the',
           'their',
           'them',
           'themselves',
           'then',
           'thence',
           'there',
           'thereafter',
           'thereby',
           'therefore',
           'therein',
           'thereupon',
           'these',
           'they',
           'thick',
           'thin',
           'third',
           'this',
           'those',
           'though',
           'three',
           'through',
           'throughout',
           'thru',
           'thus',
           'to',
           'together',
           'too',
           'top',
           'toward',
           'towards',
           'twelve',
           'twenty',
           'two',
           'un',
           'under',
           'until',
           'up',
           'upon',
           'us',
           'very',
           'via',
           'was',
           'we',
           'well',
           'were',
           'what',
           'whatever',
           'when',
           'whence',
           'whenever',
           'where',
           'whereafter',
           'whereas',
           'whereby',
           'wherein',
           'whereupon',
           'wherever',
           'whether',
           'which',
           'while',
           'whither',
           'who',
           'whoever',
           'whole',
           'whom',
           'whose',
           'why',
           'will',
           'with',
           'within',
           'without',
           'would',
           'yet',
           'you',
           'your',
           'yours',
           'yourself',
           'yourselves']
    
    stop_words.extend(st1)
    # Remove Stop Words
    data_words_nostops = remove_stopwords(df_words)
    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
   # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    # Create Corpus
    texts = data_lemmatized
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    
    # Compute Perplexity

    print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 

    # Compute Coherence Score

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='tsne', sort_topics=True)
    #topic_data =  pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds = 'tsne', sort_topics=True)
    return pyLDAvis.display(panel)

In [105]:
ldavisualizer(df['comments'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rmastour\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-11-09 01:12:38.959 collecting all words and their counts
2021-11-09 01:12:38.960 PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-11-09 01:12:38.985 collected 10551 word types from a corpus of 14434 words (unigram + bigrams) and 71 sentences
2021-11-09 01:12:38.986 using 10551 counts as vocab in Phrases<0 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2021-11-09 01:12:38.987 collecting all words and their counts
2021-11-09 01:12:38.989 PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-11-09 01:12:39.071 collected 10560 word types from a corpus of 14391 words (unigram + bigrams) and 71 sentences
2021-11-09 01:12:39.072 using 10560 counts as vocab in Phrases<0 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2021-11-09 01:12:39.073 source_vocab length 10551
2021-11-0

2021-11-09 01:12:41.169 topic #3 (0.085): 0.094*"sorry" + 0.049*"loss" + 0.024*"cancer" + 0.019*"love" + 0.017*"thank" + 0.013*"go" + 0.012*"heart" + 0.012*"read" + 0.011*"send" + 0.010*"wife"
2021-11-09 01:12:41.170 topic #4 (0.123): 0.070*"congratulation" + 0.050*"congrat" + 0.049*"news" + 0.034*"happy" + 0.032*"good" + 0.024*"awesome" + 0.020*"cancer" + 0.020*"amazing" + 0.020*"great" + 0.015*"hear"
2021-11-09 01:12:41.170 topic diff=0.131388, rho=0.408248
2021-11-09 01:12:41.206 -6.057 per-word bound, 66.6 perplexity estimate based on a held-out corpus of 71 documents with 6039 words
2021-11-09 01:12:41.207 PROGRESS: pass 5, at document #71/71
2021-11-09 01:12:41.226 optimized alpha [0.04826741, 0.06012559, 0.04217103, 0.077903286, 0.11459062]
2021-11-09 01:12:41.228 topic #0 (0.048): 0.021*"dad" + 0.021*"cancer" + 0.015*"sorry" + 0.015*"stage" + 0.012*"thank" + 0.011*"happy" + 0.010*"good" + 0.010*"get" + 0.009*"body" + 0.009*"diagnose"
2021-11-09 01:12:41.229 topic #1 (0.060): 0.


Perplexity:  -6.032702148232087


2021-11-09 01:12:44.235 7 accumulators retrieved from output queue
2021-11-09 01:12:44.247 accumulated word occurrence stats for 426 virtual documents



Coherence Score:  0.3784475488646925
