In [1]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install corextopic


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting corextopic
  Downloading corextopic-1.1-py3-none-any.whl (27 kB)
Installing collected packages: corextopic
Successfully installed corextopic-1.1


In [2]:
import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt
import pandas as pd

import corextopic.corextopic as ct
import corextopic.vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [3]:
# Mount Google Drive at /content/drive; the spreadsheet loaded later is read from a path under this mount point.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the 'Unvaccinated' sheet of the Reddit posts workbook and expose its "data" column as "Text"
df = pd.read_excel(
    "/content/drive/MyDrive/Minmin 506/Project/redditPosts_v4.xlsx",
    sheet_name='Unvaccinated',
).rename(columns={"data": "Text"})

In [5]:
# Take an explicit copy of the single text column so the assignment below writes to an
# independent frame instead of a slice of `df` (the cause of pandas' SettingWithCopyWarning).
df1 = df[['Text']].copy()
# Normalise every post to a lower-case string.
# NOTE(review): astype(str) turns missing cells into the literal text "nan" — confirm that is acceptable downstream.
df1['Text'] = df1['Text'].astype(str).str.lower()
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Text'] = df1['Text'].astype(str).str.lower()


Unnamed: 0,Text
0,"tested positive, not vaccinated. how to qualif..."
1,tested positive today and on day 3 of symptoms...
2,not vaccinated - tested positive 3 days ago \n...
3,f/48 positive 10/09/21( not vaccinated) \nmy d...
4,when will my smell and taste come back roughly...


In [6]:
# Encode the post text as a binary document-term matrix (word present / absent),
# dropping English stop words and capping the vocabulary at 20,000 terms
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
doc_word = ss.csr_matrix(vectorizer.fit_transform(df1.Text))

doc_word.shape  # (number of documents, number of vocabulary terms)



(747, 7552)

In [7]:
# Vocabulary terms that label the matrix columns; used to print readable topics and to name anchor words
vocab_terms = np.asarray(vectorizer.get_feature_names_out())
words = list(vocab_terms)

In [8]:
# Remove vocabulary entries made up solely of digits, keeping the matrix columns
# and the word list aligned by selecting both with the same index list
not_digit_inds = [ind for ind, word in enumerate(words) if not word.isdigit()]
words = [words[ind] for ind in not_digit_inds]
doc_word = doc_word[:, not_digit_inds]

doc_word.shape  # (number of documents, number of remaining terms)

(747, 7407)

CorEx Topic Model

In [9]:
# One anchor list per topic, in topic order: the first list is anchored to the first
# topic, the second list to the second topic, and so on. The model is built with
# 7 hidden topics while only 5 anchor lists are supplied.
anchor_words = [
    ['fever', 'cough', 'cold', 'congestion', 'fatigue'],
    ['paxlovid', 'vitamin', 'hospital', 'advil', 'acetaminophen'],
    ['friend', 'school', 'club', 'family', 'mask'],
    ['isolation', 'miss', 'away', 'avoid', 'work'],
    ['nervous', 'anxiety', 'worry', 'depress', 'stress'],
]
anchored_topic_model = ct.Corex(n_hidden=7, seed=2)
# Trailing semicolon suppresses the notebook's display of the call's return value
anchored_topic_model.fit(
    doc_word,
    words=words,
    anchors=anchor_words,
    anchor_strength=6,
);



In [10]:
# Print one line per anchored topic: its index followed by the topic's words
for n, _anchor_list in enumerate(anchor_words):
    topic_words, _, _ = zip(*anchored_topic_model.get_topics(topic=n))
    print(f"{n}: {', '.join(topic_words)}")

0: cough, fever, fatigue, congestion, cold, woke, headache, day, night, sore
1: hospital, paxlovid, vitamin, oxygen, acetaminophen, said, told, know, doctor, blood
2: mask, family, friend, school, wear, wearing, club, people, doing, stay
3: work, away, avoid, miss, home, test, isolation, got, asked, hours
4: anxiety, worry, nervous, stress, gone, really, breathe, use, hope, good


In [11]:
# Inspect the ten entries returned for topic index 4 (the topic anchored with the
# 'nervous' / 'anxiety' / 'worry' / 'depress' / 'stress' list). Each entry is a
# (word, score, sign) tuple — presumably the score is the word's weight in the topic; confirm against the corextopic docs.
anchored_topic_model.get_topics(topic=4, n_words=10, print_words=True)

[('anxiety', 1.0181675226213571, 1.0),
 ('worry', 0.2773512260451011, 1.0),
 ('nervous', 0.14865600957061026, 1.0),
 ('stress', 0.10205705297769008, 1.0),
 ('gone', 0.05681780357504174, 1.0),
 ('really', 0.04180685010979469, 1.0),
 ('breathe', 0.04107799107663047, 1.0),
 ('use', 0.039125545374433045, 1.0),
 ('hope', 0.03629449120800433, 1.0),
 ('good', 0.035270184799037325, 1.0)]

In [13]:
# Collect the word tuples for all topics at once (a list with one inner list per topic);
# `abc` is read by the scoring cell that follows.
abc=anchored_topic_model.get_topics(print_words=True)
print(abc)

[[('cough', 1.65300955816092, 1.0), ('fever', 1.3728052059363343, 1.0), ('fatigue', 0.9766352920882871, 1.0), ('congestion', 0.970944078092679, 1.0), ('cold', 0.4223110586361557, 1.0), ('woke', 0.14553015914099968, 1.0), ('headache', 0.1328287579111775, 1.0), ('day', 0.12934216612841876, 1.0), ('night', 0.12572672493609172, 1.0), ('sore', 0.11829293866674086, 1.0)], [('hospital', 1.033731280147118, 1.0), ('paxlovid', 0.2486668381769494, 1.0), ('vitamin', 0.15790087554019633, 1.0), ('oxygen', 0.07491665167677534, 1.0), ('acetaminophen', 0.06129951216418323, 1.0), ('said', 0.060221280794969974, 1.0), ('told', 0.05733801165358455, 1.0), ('know', 0.054453660184106696, 1.0), ('doctor', 0.05362406047990898, 1.0), ('blood', 0.04713401756050826, 1.0)], [('mask', 1.9856763907045463, 1.0), ('family', 0.48757342288747463, 1.0), ('friend', 0.30206317938041183, 1.0), ('school', 0.18807563687668705, 1.0), ('wear', 0.07009022507311621, 1.0), ('wearing', 0.06358336392264692, 1.0), ('club', 0.060782043

In [14]:
# Mean of the per-word scores (second element of each tuple) for every anchored topic.
# The number of topics follows `anchor_words` instead of a literal 5, matching the
# topic-printing loop above and staying correct if an anchor list is added or removed.
unvacscore2 = [
    np.mean([entry[1] for entry in abc[i]])
    for i in range(len(anchor_words))
]
print(unvacscore2)

[0.6047425939697805, 0.18492861883783007, 0.33103307965450224, 0.26245748856245055, 0.17966246773577]
