In [1]:
# Import package
import wikipedia
# Look the article up by title; `wiki` is the page object returned by the client.
wiki = wikipedia.page(title='1721 Boston smallpox outbreak')
# Keep the page's plain-text content as the raw input for the later cells.
data = wiki.content
# Echo the raw text as the cell output.
data

'In 1721, Boston experienced its worst outbreak of smallpox (also known as variola). 5,759 people out of around 10,600 in Boston were infected and 844 were recorded to have died between April 1721 and February 1722. The outbreak would motivate Puritan minister Cotton Mather and Harvard physician Zabdiel Boylston to variolate hundreds of Bostonians in the Thirteen Colonies\' earliest experiment with public inoculation. Their efforts would inspire further use and research of variolation for immunizing people from smallpox, placing the Massachusetts Bay Colony at the epicenter of the Colonies\' first inoculation debate and profoundly impacting Western society\'s medical treatment of the disease. The outbreak also permanently changed social and religious public discourse about disease, as Boston\'s newspapers published various pamphlets opposing and supporting the inoculation efforts.\n\n\n== Smallpox in Boston ==\n \nOn 22 April 1721 the British passenger ship HMS Seahorse arrived at Bost

In [3]:
import string

import gensim
import numpy as np
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK's English stop-word list, stored as a set.
# NOTE(review): no later cell shown here reads `stop_words`; `preprocess`
# filters with gensim's STOPWORDS instead.
stop_words = {*stopwords.words('english')}
def rev_punc(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    # A translation table whose third argument lists the characters to delete.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)
def cre_low(text):
    """Tokenise ``text`` with ``word_tokenize`` and return the lower-cased tokens joined by single spaces."""
    return ' '.join(token.lower() for token in word_tokenize(text))

# Strip punctuation first, then tokenise and lower-case what is left.
# NOTE(review): the next preprocessing cell starts again from `data` and
# reassigns `data_clean`, so this value does not feed the topic models.
data_clean = cre_low(rev_punc(data))

In [4]:
# One English Snowball stemmer shared by every call below.
stemmer = SnowballStemmer("english")


def lemmatize_stemming(text):
    """Lemmatise ``text`` as a verb with WordNet, then return the Snowball stem of the lemma."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenise ``text`` and return the lemmatised, stemmed form of each kept token.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's ``STOPWORDS``.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in STOPWORDS
    ]

# Tokenise, filter and stem the raw article text. `preprocess` already applies
# `lemmatize_stemming` to every token, so the word-level helper is no longer
# run on the whole document string beforehand.
# The token list is wrapped in an outer list because the cells below iterate
# over `data_clean` as a corpus of documents; here it holds one document.
data_clean = [preprocess(data)]

In [5]:
# Inspect the preprocessed corpus: a list holding one list of stemmed tokens.
data_clean

[['boston',
  'experi',
  'worst',
  'outbreak',
  'smallpox',
  'know',
  'variola',
  'peopl',
  'boston',
  'infect',
  'record',
  'die',
  'april',
  'februari',
  'outbreak',
  'motiv',
  'puritan',
  'minist',
  'cotton',
  'mather',
  'harvard',
  'physician',
  'zabdiel',
  'boylston',
  'variol',
  'hundr',
  'bostonian',
  'thirteen',
  'coloni',
  'earliest',
  'experi',
  'public',
  'inocul',
  'effort',
  'inspir',
  'research',
  'variol',
  'immun',
  'peopl',
  'smallpox',
  'place',
  'massachusett',
  'coloni',
  'epicent',
  'coloni',
  'inocul',
  'debat',
  'profound',
  'impact',
  'western',
  'societi',
  'medic',
  'treatment',
  'diseas',
  'outbreak',
  'perman',
  'chang',
  'social',
  'religi',
  'public',
  'discours',
  'diseas',
  'boston',
  'newspap',
  'publish',
  'pamphlet',
  'oppos',
  'support',
  'inocul',
  'effort',
  'smallpox',
  'boston',
  'april',
  'british',
  'passeng',
  'ship',
  'seahors',
  'arriv',
  'boston',
  'barbado',
  's

In [7]:
# Build the gensim vocabulary (integer id <-> token) from the tokenised corpus.
dictionary = gensim.corpora.Dictionary(documents=data_clean)

In [9]:
# Preview the first eleven (id, token) pairs of the vocabulary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    # Stop once more than ten entries have been printed.
    if shown > 10:
        break

0 academ
1 accept
2 account
3 accredit
4 action
5 adult
6 affect
7 africa
8 aftermath
9 alarm
10 ambassador


In [10]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
# Apply the vocabulary's doc2bow to every token list in the corpus.
bow_corpus = list(map(dictionary.doc2bow, data_clean))

In [13]:
# Inspect the bag-of-words corpus: one list of (token id, count) pairs per document.
bow_corpus

[[(0, 3),
  (1, 2),
  (2, 3),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 2),
  (20, 2),
  (21, 3),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 3),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 3),
  (30, 3),
  (31, 3),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 22),
  (38, 4),
  (39, 1),
  (40, 19),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 2),
  (47, 2),
  (48, 5),
  (49, 1),
  (50, 7),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 2),
  (55, 1),
  (56, 2),
  (57, 2),
  (58, 1),
  (59, 1),
  (60, 5),
  (61, 1),
  (62, 1),
  (63, 3),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 3),
  (74, 2),
  (75, 1),
  (76, 10),
  (77, 3),
  (78, 2),
  (79, 1),
  (80, 1),
  (81, 5),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 2),
  (86, 2),
  (87, 1),
  (88, 1),
  (89, 2),
  (90, 1),
  (91,

In [15]:
# Fit an 8-topic gensim LDA model on the bag-of-words corpus, making ten passes
# over the data with two worker processes.
# random_state seeds the model's random initialisation so that a re-run starts
# from the same state instead of drawing a fresh one each time.
# NOTE(review): bow_corpus is built from a one-element `data_clean`, i.e. a
# single document. Consider splitting the article into sections or paragraphs
# so the model has several documents to contrast.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=8,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=42,
)

In [16]:
# Print each topic id with its weighted-term summary string, then a blank separator.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.047*"inocul" + 0.027*"smallpox" + 0.022*"boston" + 0.020*"mather" + 0.019*"boylston" + 0.013*"outbreak" + 0.011*"public" + 0.010*"peopl" + 0.010*"cotton" + 0.010*"procedur"


Topic: 1 
Words: 0.004*"inocul" + 0.004*"smallpox" + 0.004*"boylston" + 0.004*"boston" + 0.003*"mather" + 0.003*"outbreak" + 0.003*"public" + 0.003*"experi" + 0.003*"cotton" + 0.003*"procedur"


Topic: 2 
Words: 0.002*"inocul" + 0.002*"smallpox" + 0.002*"boston" + 0.002*"boylston" + 0.002*"mather" + 0.002*"procedur" + 0.002*"outbreak" + 0.002*"public" + 0.002*"harvard" + 0.002*"douglass"


Topic: 3 
Words: 0.002*"inocul" + 0.002*"boylston" + 0.002*"smallpox" + 0.002*"boston" + 0.002*"mather" + 0.002*"cotton" + 0.002*"outbreak" + 0.002*"peopl" + 0.002*"public" + 0.002*"patient"


Topic: 4 
Words: 0.002*"inocul" + 0.002*"mather" + 0.002*"smallpox" + 0.002*"boylston" + 0.002*"boston" + 0.002*"peopl" + 0.002*"outbreak" + 0.002*"procedur" + 0.002*"experi" + 0.002*"cotton"


Topic: 5 
Words: 0.003*"in

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer expects an iterable of strings, whereas `data_clean` holds
# lists of tokens. Join each token list back into one space-separated string
# so the vectoriser can tokenise it itself.
documents = [' '.join(tokens) for tokens in data_clean]
vectorizer = TfidfVectorizer().fit(documents)
data_vectorized = vectorizer.transform(documents)
# random_state makes the fitted topics repeatable between runs.
# NOTE: this rebinds `lda_model`, which previously held the gensim model, to
# the scikit-learn model.
lda_model = LatentDirichletAllocation(n_components=10, random_state=42).fit(data_vectorized)

def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    Parameters
    ----------
    model
        Object whose ``components_`` is iterated row by row; each row must
        support ``argsort()`` and integer indexing (e.g. a NumPy array).
    vectorizer
        Fitted vectoriser exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    n_top_words : int, default 10
        Number of terms printed per topic, strongest first.

    Returns
    -------
    None
        For each topic, a ``Topic <n>:`` line is printed, followed by a list
        of ``(term, weight)`` tuples.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back to the old one.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed term.
    feature_names = names_getter()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to take the largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

# Show the strongest terms of each topic in the scikit-learn LDA model fitted above.
print_topics(lda_model, vectorizer)