# Patient advocacy via topic modeling using reviews from drugs.com

Topic modeling is a form of unsupervised learning (akin to clustering), so the set of possible topics is unknown a priori. The topics discovered are unlabeled; it takes a well-trained human to label them.

In [1]:
from datetime import datetime

# Stamp the notebook with the date of this run, formatted as YYMMDD.
date = f"{datetime.today():%y%m%d}"
print(f'Last modified by Xiaoqing: {date}')

Last modified by Xiaoqing: 220203


In [2]:
# environment: nlp_basics
# in terminal, do this:
# python3 -m spacy download en_core_web_sm
# pip install --upgrade gensim
# pip install pyldavis

# import nltk
# nltk.download('stopwords')

In [1]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import pandas as pd
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis # updated from pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read data and remove useless text

In [2]:
df = pd.read_csv('mirena.csv')
data = df['review'].tolist()  # one raw review string per row


def _strip_review_header(text, marker='“'):
    """Return ``text`` starting at the first ``marker`` character.

    Each review appears to start with a header before the opening curly
    quote (presumably of the form "<drug> (<generic>) for <condition>:" —
    verify against the CSV). The cut position is located per review because
    the header is not guaranteed to have the same length in every row.

    Args:
        text: the raw review string.
        marker: character that opens the review body.

    Returns:
        The review from ``marker`` onward, or ``text`` unchanged when the
        marker is absent (``str.find`` returns -1, and slicing from -1 would
        keep only the last character).
    """
    start = text.find(marker)
    return text[start:] if start != -1 else text


data = [_strip_review_header(e) for e in data]  # drop the header in front of each review
print(data[0][0:90])

“I have had my IUD for over a year now and I think it's the best thing ever. When I first 


In [3]:
# Store a lower-cased copy of the header-stripped reviews next to the raw
# text; the keyword searches later in the notebook match against this column.
df['review_clean'] = [review.lower() for review in data]

# Lemmatize

In [4]:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each text, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of raw strings, one per document.
        allowed_postags: collection of coarse POS tags (compared against
            ``token.pos_``); tokens with any other tag are discarded.
            The default is an immutable tuple so it cannot be altered
            between calls.

    Returns:
        A list with one string per input text: the lemmas of the kept
        tokens joined by single spaces, in their original order.
    """
    # Parser and NER are disabled because only POS tags and lemmas are read.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    keep = frozenset(allowed_postags)  # constant-time membership per token
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # making one full pipeline call per document.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in keep)
        for doc in nlp.pipe(texts)
    ]


# Lemmatize every review, then preview the first 90 characters of the first one.
lemmatized_texts = lemmatization(data)
first_lemmatized = lemmatized_texts[0]
print(first_lemmatized[:90])

year now think good thing ever first get put in only bleed about week little cramping at a


# Tokenize each review into words (note: stop words are not actually removed in this step)

In [5]:
def gen_words(texts):
    """Tokenize every text into a list of words.

    Each text is run through gensim's ``simple_preprocess`` with
    ``deacc=True``; the resulting token lists are returned in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# One token list per lemmatized review.
data_words = gen_words(lemmatized_texts)

# Preview the first 20 tokens of the first review.
first_review_tokens = data_words[0]
print(first_review_tokens[:20])

['year', 'now', 'think', 'good', 'thing', 'ever', 'first', 'get', 'put', 'in', 'only', 'bleed', 'about', 'week', 'little', 'cramping', 'at', 'all', 'period', 'about']


# Identify bigrams and trigrams

Phrase detection works better with long documents. It works poorly on short reviews, which rarely repeat the same phrase, so we found essentially no bigrams or trigrams here. Any that were detected would appear as a single token joined with underscores, e.g. "short_term".

In [6]:
# bigram is like "french revolution"
# trigram is like "topical steroid withdrawal"

#BIGRAMS AND TRIGRAMS
# Fit the phrase detectors: bigrams on the raw token lists, then trigrams on
# the token stream after the bigram model has been applied to it.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigrammed_stream = bigram_phrases[data_words]
trigram_phrases = gensim.models.Phrases(bigrammed_stream, threshold=100)

# Wrap each fitted model in a Phraser; these are what get applied to documents.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram`` and then ``trigram`` (module-level phrasers) to each document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

# Apply the fitted phrasers to every review's token list.
data_bigrams = make_bigrams(data_words)
# NOTE(review): make_trigrams itself runs bigram[doc] before trigram[...], so
# the bigram phraser is applied a second time here to already-merged tokens —
# confirm this is intended (passing data_words would apply it only once).
data_bigrams_trigrams = make_trigrams(data_bigrams)



In [7]:
# Dump every review's token list to check by eye whether any phrases
# (tokens joined with "_") were detected. This prints the whole corpus.
print(data_bigrams_trigrams)



# Bag of words: get unique words and their frequency

The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus.

Gensim creates a unique id for each word in the collection. The corpus printed below is a list of (word_id, word_frequency) pairs per document. For example, (0, 2) means that word id 0 occurs twice in the first document; likewise, (1, 1) means that word id 1 occurs once, and so on.

In [8]:
# Build the token <-> integer-id dictionary from the phrase-merged reviews.
texts = data_bigrams_trigrams
id2word = corpora.Dictionary(texts)

# Bag-of-words: one list of (token_id, count) pairs per review.
corpus = list(map(id2word.doc2bow, texts))

# Sanity check: the token behind id 0 and the first 20 pairs of the first review.
print(id2word[0])
print(corpus[0][:20])

about
[(0, 2), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1)]


# Remove super-frequent words like "iud" that don't provide much value

- some words appear everywhere, like "mirena", and they are not very helpful because they are too obvious
- check to see what words are being dropped, and include them back

In [9]:
#TF-IDF REMOVAL
# TF-IDF REMOVAL
# Drop, from every review's bag-of-words, the tokens whose TF-IDF weight is
# below `low_value` or that the TF-IDF model leaves out altogether, and keep
# a record of everything dropped in `words`.
from gensim.models import TfidfModel

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03  # tokens with a TF-IDF weight below this are dropped
words = []  # audit trail: every dropped token, one entry per review it was dropped from
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    weights = tfidf[bow]  # (token_id, tfidf_weight) pairs; computed once per review
    scored_ids = {token_id for token_id, _ in weights}
    low_value_words = [token_id for token_id, weight in weights if weight < low_value]
    # Tokens with a TF-IDF score of 0 are missing from `weights`. This must be
    # computed BEFORE recording the drops; the previous version used the list
    # left over from the preceding review, so `words` logged the wrong tokens.
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in scored_ids]

    words.extend(id2word[token_id] for token_id in low_value_words + words_missing_in_tfidf)

    # Replace the review's bag-of-words in place with the filtered version.
    dropped = set(low_value_words) | set(words_missing_in_tfidf)
    corpus[i] = [pair for pair in bow if pair[0] not in dropped]

# Main model

In [10]:
# Settings for the LDA topic model, gathered in one place before fitting.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,  # choose the number of topics you wish to identify
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [13]:
# test_doc = corpus[-1]

# vector = lda_model[test_doc]
# print (vector)

# def Sort(sub_li):
#     sub_li.sort(key = lambda x: x[1])
#     sub_li.reverse()
#     return (sub_li)
# new_vector = Sort(vector)
# print (new_vector)

In [14]:
# # save model
# lda_model.save("topic_model.model")


In [15]:
# # open model
# new_model = gensim.models.ldamodel.LdaModel.load("topic_model.model")

# Visualize the data

This visualization provides a global view of the topics (and how they differ from each other),
while at the same time allowing for an
inspection of the terms most highly associated with each individual topic

In [11]:
# Render the interactive topic explorer inline in the notebook.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
    mds="mmds",
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


# Implications
- topic 1: insertion was painful
- topic 2: the iud affects your period
- topic 3: removed due to depression, anxiety, hair loss etc.
- topic 4: experienced weight gain and acne
- topic 5: it has to do with surgery and uterine

# Find frequency of topics

In [17]:
# Fraction of reviews whose cleaned text contains 'weigh', 'lb' or 'pounds'.
weight_mask = df['review_clean'].str.contains('weigh|lb|pounds')
df2 = df[weight_mask]
print(len(df2) / len(df))
# df2.to_csv('weight.csv', index = False)  

0.31913746630727763


In [18]:
# Fraction of reviews whose cleaned text contains any of the mood-related keywords.
mood_mask = df['review_clean'].str.contains('mood|depressed|anxious|angry|cry')
df3 = df[mood_mask]
print(len(df3) / len(df))


0.23234501347708894


In [19]:
# Fraction of reviews whose cleaned text contains the substring 'leg'.
leg_mask = df['review_clean'].str.contains('leg')
df4 = df[leg_mask]
print(len(df4) / len(df))
# df4.to_csv('leg.csv', index = False)  

0.020485175202156335


In [20]:
# Fraction of reviews whose cleaned text contains the substring 'hair'.
hair_mask = df['review_clean'].str.contains('hair')
df5 = df[hair_mask]
print(len(df5) / len(df))
# df5.to_csv('hair.csv', index = False)  

0.07223719676549865


# Other common side effects

In [21]:
# Fraction of reviews whose cleaned text contains any of the nausea/stomach keywords.
nausea_mask = df['review_clean'].str.contains('nausea|sick|indigestion|vomit|stomach')
df6 = df[nausea_mask]
print(len(df6) / len(df))
# df6.to_csv('nausea.csv', index = False)  

0.07978436657681941


In [22]:
# Fraction of reviews whose cleaned text contains 'migraine' or 'headache'.
headache_mask = df['review_clean'].str.contains('migraine|headache')
df6 = df[headache_mask]
print(len(df6) / len(df))

0.09649595687331536


In [23]:
# Fraction of reviews whose cleaned text contains 'discharge' or 'irritation'.
discharge_mask = df['review_clean'].str.contains('discharge|irritation')
df6 = df[discharge_mask]
print(len(df6) / len(df))


0.03827493261455526


In [24]:
# Fraction of reviews whose cleaned text contains 'breast', 'boob' or 'tender'.
breast_mask = df['review_clean'].str.contains('breast|boob|tender')
df6 = df[breast_mask]
print(len(df6) / len(df))


0.04366576819407008


In [25]:
# Fraction of reviews whose cleaned text contains 'acne', 'skin' or 'pimple'.
skin_mask = df['review_clean'].str.contains('acne|skin|pimple')
df6 = df[skin_mask]
print(len(df6) / len(df))


0.19838274932614555


In [26]:
# Fraction of reviews whose cleaned text contains 'drive' or 'libido'.
libido_mask = df['review_clean'].str.contains('drive|libido')
df6 = df[libido_mask]
print(len(df6) / len(df))


0.138544474393531


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [27]:
# Scratch check: summing a (4, 3) array along axis 1 with keepdims=True
# keeps the reduced axis, giving one column of row sums.
A = np.random.randn(4, 3)
B = A.sum(axis=1, keepdims=True)

B.shape

(4, 1)

In [28]:
# Echo B (the per-row sums of A) as the cell output.
B

array([[ 2.21799307],
       [ 3.02508096],
       [ 0.50555175],
       [-2.30303137]])

In [29]:
# Echo A (the random 4x3 input array) as the cell output.
A

array([[ 0.07356712,  1.31875643,  0.82566952],
       [ 1.72546059,  1.57427145, -0.27465107],
       [-0.41544845,  0.62216842,  0.29883178],
       [ 0.34419889, -0.21288635, -2.43434391]])