# Structural Topic Model Test
https://github.com/mkrcke/strutopy

## LDA model

In [2]:
pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/63/46/5feab9c524a380bfa9f9f1c0d065743280dca30b216ab4c7a231f22dbed7/gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting scipy>=1.7.0 (from gensim)
  Obtaining dependency information for scipy>=1.7.0 from https://files.pythonhosted.org/packages/50/8b/2057417a07a6fee8ed8be40e37bac4a502cae4cf44468a02962bbe81b8af/scipy-1.11.3-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Using cached scipy-1.11.3-cp311-cp311-macosx_12_0_arm64.whl.metadata (165 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/fc/d9/d97f1db64b09278aba64e8c81b5d322d436132df5741c518f3823824fae0/smart_open-6.4.0-py3-none-any.whl.metadata
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp311-

In [7]:
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora import Dictionary
from gensim import utils
from gensim.models import LdaModel
from gensim.test.utils import common_texts

In [4]:
doc_list = [
   "Hello, how are you?", "How do you do?", 
   "Hey what are you doing? yes you What are you doing?"
]

In [5]:
[remove_stopwords(doc) for doc in doc_list]

['Hello, you?', 'How do?', 'Hey doing? yes What doing?']

In [6]:
[preprocess_string(doc) for doc in doc_list]

[['hello'], [], ['hei', 'ye']]

In [8]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [9]:
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

In [12]:
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

In [13]:
# Create a new corpus, made of previously unseen documents.

other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'computer']
]

other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[0]

In [14]:
vector = lda[unseen_doc]  # get topic probability distribution for a document

In [15]:
vector

[(0, 0.025000058),
 (1, 0.025000062),
 (2, 0.025000062),
 (3, 0.025000062),
 (4, 0.2749837),
 (5, 0.5249823),
 (6, 0.025008382),
 (7, 0.025005536),
 (8, 0.025008382),
 (9, 0.02501146)]

https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [24]:
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv') #, error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [25]:
documents

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4
...,...,...
1244179,two aged care residents die as state records 2...,1244179
1244180,victoria records 5;919 new cases and seven deaths,1244180
1244181,wa delays adopting new close contact definition,1244181
1244182,western ringtail possums found badly dehydrate...,1244182


In [26]:
print(len(documents))
print(documents[:5])

1244184
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [28]:
pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting click (from nltk)
  Obtaining dependency information for click from https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl.metadata
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Obtaining dependency information for joblib from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Obtaining dependency information for regex>=2021.8.3 from https://files.pythonhosted.org/packages/4d/d3/38b09813a32618acd437906c4d0194119e27139dbcd7486e69d58e375a27/regex-2023.10.3-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading regex-2023.10.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [

In [38]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from nltk.stem import *
from nltk.stem.porter import *
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/signebendsen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [40]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [41]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [42]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 commun
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [43]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [44]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(162, 1), (240, 1), (292, 1), (589, 1), (839, 1), (3579, 1), (3580, 1)]

In [45]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 162 ("govt") appears 1 time.
Word 240 ("group") appears 1 time.
Word 292 ("vote") appears 1 time.
Word 589 ("local") appears 1 time.
Word 839 ("want") appears 1 time.
Word 3579 ("compulsori") appears 1 time.
Word 3580 ("ratepay") appears 1 time.


In [46]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5854395661274623),
 (1, 0.383252758688686),
 (2, 0.50230806644029),
 (3, 0.5080004367704987)]


In [47]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [48]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.027*"news" + 0.022*"market" + 0.018*"hospit" + 0.017*"morrison" + 0.016*"work" + 0.015*"fight" + 0.013*"countri" + 0.013*"close" + 0.012*"fall" + 0.012*"darwin"
Topic: 1 
Words: 0.051*"polic" + 0.035*"case" + 0.025*"charg" + 0.025*"court" + 0.021*"death" + 0.020*"murder" + 0.018*"face" + 0.016*"restrict" + 0.014*"trial" + 0.013*"investig"
Topic: 2 
Words: 0.031*"queensland" + 0.027*"live" + 0.022*"women" + 0.018*"victorian" + 0.012*"need" + 0.012*"deal" + 0.012*"budget" + 0.011*"rural" + 0.011*"road" + 0.011*"drum"
Topic: 3 
Words: 0.064*"covid" + 0.044*"trump" + 0.028*"vaccin" + 0.021*"bushfir" + 0.018*"test" + 0.015*"australia" + 0.013*"say" + 0.012*"coronaviru" + 0.012*"home" + 0.011*"support"
Topic: 4 
Words: 0.030*"victoria" + 0.019*"world" + 0.017*"coast" + 0.016*"south" + 0.016*"australia" + 0.015*"peopl" + 0.015*"canberra" + 0.015*"sydney" + 0.013*"queensland" + 0.013*"alleg"
Topic: 5 
Words: 0.033*"elect" + 0.019*"say" + 0.018*"minist" + 0.015*"miss" + 0.014

In [49]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.019*"crash" + 0.014*"drum" + 0.010*"stori" + 0.009*"tuesday" + 0.009*"fatal" + 0.008*"stab" + 0.008*"die" + 0.008*"peter" + 0.007*"polic" + 0.007*"truck"
Topic: 1 Word: 0.012*"andrew" + 0.012*"climat" + 0.009*"pandem" + 0.009*"david" + 0.009*"violenc" + 0.008*"chang" + 0.007*"domest" + 0.007*"octob" + 0.007*"grow" + 0.006*"kohler"
Topic: 2 Word: 0.021*"covid" + 0.017*"coronaviru" + 0.011*"vaccin" + 0.010*"countri" + 0.009*"govern" + 0.008*"health" + 0.007*"hour" + 0.006*"queensland" + 0.006*"royal" + 0.006*"rural"
Topic: 3 Word: 0.014*"restrict" + 0.009*"monday" + 0.009*"thursday" + 0.009*"bushfir" + 0.007*"white" + 0.006*"june" + 0.006*"coronaviru" + 0.005*"burn" + 0.005*"hous" + 0.005*"breach"
Topic: 4 Word: 0.016*"murder" + 0.015*"charg" + 0.015*"polic" + 0.010*"court" + 0.009*"woman" + 0.009*"assault" + 0.008*"alleg" + 0.008*"street" + 0.008*"guilti" + 0.008*"sentenc"
Topic: 5 Word: 0.013*"interview" + 0.008*"michael" + 0.008*"christma" + 0.007*"extend" + 0.007*"ho

In [50]:
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [51]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.887475311756134	 
Topic: 0.034*"govern" + 0.019*"border" + 0.015*"plan" + 0.015*"commun" + 0.015*"indigen" + 0.014*"water" + 0.013*"region" + 0.013*"concern" + 0.011*"industri" + 0.011*"local"

Score: 0.012507181614637375	 
Topic: 0.033*"elect" + 0.019*"say" + 0.018*"minist" + 0.015*"miss" + 0.014*"speak" + 0.013*"labor" + 0.013*"care" + 0.013*"call" + 0.013*"announc" + 0.012*"claim"

Score: 0.012502512894570827	 
Topic: 0.022*"donald" + 0.019*"year" + 0.018*"chang" + 0.015*"lockdown" + 0.015*"school" + 0.013*"warn" + 0.012*"high" + 0.011*"travel" + 0.011*"tasmanian" + 0.011*"price"

Score: 0.012502402998507023	 
Topic: 0.064*"covid" + 0.044*"trump" + 0.028*"vaccin" + 0.021*"bushfir" + 0.018*"test" + 0.015*"australia" + 0.013*"say" + 0.012*"coronaviru" + 0.012*"home" + 0.011*"support"

Score: 0.01250236202031374	 
Topic: 0.031*"queensland" + 0.027*"live" + 0.022*"women" + 0.018*"victorian" + 0.012*"need" + 0.012*"deal" + 0.012*"budget" + 0.011*"rural" + 0.011*"road" + 0.011*"

In [52]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7418569922447205	 
Topic: 0.021*"covid" + 0.017*"coronaviru" + 0.011*"vaccin" + 0.010*"countri" + 0.009*"govern" + 0.008*"health" + 0.007*"hour" + 0.006*"queensland" + 0.006*"royal" + 0.006*"rural"

Score: 0.15809398889541626	 
Topic: 0.013*"interview" + 0.008*"michael" + 0.008*"christma" + 0.007*"extend" + 0.007*"hobart" + 0.006*"daniel" + 0.006*"alan" + 0.006*"queensland" + 0.006*"grand" + 0.005*"prime"

Score: 0.01250715833157301	 
Topic: 0.008*"financ" + 0.007*"turnbul" + 0.007*"coronaviru" + 0.007*"rise" + 0.006*"energi" + 0.006*"say" + 0.005*"know" + 0.005*"fiji" + 0.005*"rate" + 0.005*"australia"

Score: 0.012506548315286636	 
Topic: 0.011*"lockdown" + 0.010*"morrison" + 0.008*"kill" + 0.006*"biden" + 0.006*"explain" + 0.006*"presid" + 0.005*"protest" + 0.005*"video" + 0.005*"say" + 0.005*"septemb"

Score: 0.012506474740803242	 
Topic: 0.012*"andrew" + 0.012*"climat" + 0.009*"pandem" + 0.009*"david" + 0.009*"violenc" + 0.008*"chang" + 0.007*"domest" + 0.007*"octob" + 0

In [53]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.32987621426582336	 Topic: 0.027*"news" + 0.022*"market" + 0.018*"hospit" + 0.017*"morrison" + 0.016*"work"
Score: 0.2036464810371399	 Topic: 0.033*"elect" + 0.019*"say" + 0.018*"minist" + 0.015*"miss" + 0.014*"speak"
Score: 0.18344080448150635	 Topic: 0.031*"queensland" + 0.027*"live" + 0.022*"women" + 0.018*"victorian" + 0.012*"need"
Score: 0.18296894431114197	 Topic: 0.064*"covid" + 0.044*"trump" + 0.028*"vaccin" + 0.021*"bushfir" + 0.018*"test"
Score: 0.016678903251886368	 Topic: 0.034*"govern" + 0.019*"border" + 0.015*"plan" + 0.015*"commun" + 0.015*"indigen"
Score: 0.0166777390986681	 Topic: 0.051*"polic" + 0.035*"case" + 0.025*"charg" + 0.025*"court" + 0.021*"death"
Score: 0.0166777390986681	 Topic: 0.030*"victoria" + 0.019*"world" + 0.017*"coast" + 0.016*"south" + 0.016*"australia"
Score: 0.0166777390986681	 Topic: 0.064*"coronaviru" + 0.027*"china" + 0.022*"kill" + 0.019*"die" + 0.014*"australia"
Score: 0.0166777390986681	 Topic: 0.022*"donald" + 0.019*"year" + 0.018*"