In [5]:
from nltk import word_tokenize, pos_tag
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

import pandas as pd
data_df = pd.read_pickle('./data-model/scrapped_df_other.pkl')

import re
import string

def clean_text_round1(text):
    text = str(text)
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

data_clean = pd.DataFrame(data_df.description.apply(round1))

def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

data_clean = pd.DataFrame(data_clean.description.apply(round2))

data_nouns_adj = pd.DataFrame(data_clean.description.apply(nouns_adj))

data_nouns_adj

Unnamed: 0,description
0,everything recent bushfires financial assistan...
1,police taskforce kimberley death sex workers b...
2,troops concussion symptoms january attack firs...
3,human rights groups lebanese security forces c...
4,term violent protests acquittal christian woma...
...,...
658,chinese face mask manufacturers factories nati...
659,china news second central chinese city lockdow...
660,twoseater aircraft emergency eastern periphera...
661,uddhav thackeray ayodhya march completion days...


In [26]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.description)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna.description = data_nouns_adj.description

data_dtmna

  if sys.path[0] == '':


Unnamed: 0,aaa,aaron,abilities,ability,able,abortion,abrogation,absolute,abu,abuse,...,youtuber,youve,zagreb,zaniolo,zapata,zelensky,zelenskys,zero,zion,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
from gensim import matutils, models
import scipy.sparse

corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.008*"company" + 0.006*"new" + 0.005*"latest" + 0.004*"today" + 0.004*"game" + 0.004*"united" + 0.004*"day" + 0.004*"states" + 0.004*"bitcoin" + 0.003*"security"'),
 (1,
  '0.010*"new" + 0.006*"wednesday" + 0.005*"president" + 0.005*"trump" + 0.004*"trial" + 0.004*"presidential" + 0.004*"year" + 0.004*"joe" + 0.004*"city" + 0.003*"washington"'),
 (2,
  '0.010*"new" + 0.009*"nan" + 0.009*"thursday" + 0.004*"health" + 0.004*"friday" + 0.004*"president" + 0.003*"china" + 0.003*"phone" + 0.003*"theres" + 0.003*"morning"'),
 (3,
  '0.014*"president" + 0.012*"new" + 0.012*"impeachment" + 0.009*"senate" + 0.009*"trial" + 0.007*"china" + 0.005*"donald" + 0.005*"trump" + 0.005*"coronavirus" + 0.004*"wuhan"')]

In [46]:
corpus_transformed = ldana[corpusna]
for item in corpus_transformed:
    for it in item: 
        print(it)
#list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

(0, 0.8912045)
(1, 0.036266662)
(2, 0.036614522)
(3, 0.035914373)
(0, 0.022963142)
(1, 0.022751587)
(2, 0.023218457)
(3, 0.9310669)
(0, 0.042748004)
(1, 0.8686888)
(2, 0.04330577)
(3, 0.045257416)
(0, 0.9235282)
(1, 0.025252312)
(2, 0.025694624)
(3, 0.025524847)
(0, 0.028391378)
(1, 0.028229611)
(2, 0.028303176)
(3, 0.9150759)
(0, 0.05089025)
(1, 0.84897715)
(2, 0.050072156)
(3, 0.05006048)
(0, 0.042911727)
(1, 0.8725131)
(2, 0.042867035)
(3, 0.041708082)
(0, 0.05004784)
(1, 0.8492463)
(2, 0.050391283)
(3, 0.05031452)
(0, 0.0424031)
(1, 0.043877047)
(2, 0.86735046)
(3, 0.046369407)
(0, 0.08345021)
(1, 0.08800945)
(2, 0.083451256)
(3, 0.74508905)
(0, 0.065042876)
(1, 0.064014815)
(2, 0.8032171)
(3, 0.067725174)
(0, 0.05194825)
(1, 0.8433856)
(2, 0.052433033)
(3, 0.05223314)
(0, 0.890102)
(1, 0.03614709)
(2, 0.03730313)
(3, 0.036447775)
(0, 0.028019736)
(1, 0.91611665)
(2, 0.027815368)
(3, 0.028048199)
(0, 0.03626192)
(1, 0.036981564)
(2, 0.036771584)
(3, 0.8899849)
(0, 0.014779482)
(1, 

(0, 0.9357208)
(1, 0.0213559)
(2, 0.02129027)
(3, 0.021633072)
(0, 0.037184358)
(1, 0.03730942)
(2, 0.036233004)
(3, 0.8892732)
(0, 0.017955791)
(1, 0.017948784)
(2, 0.9460326)
(3, 0.018062819)
(0, 0.068156146)
(1, 0.06457945)
(2, 0.06261776)
(3, 0.8046467)
(0, 0.03135764)
(1, 0.032013044)
(2, 0.9052604)
(3, 0.03136894)
(0, 0.04192515)
(1, 0.87265664)
(2, 0.043709733)
(3, 0.041708488)
(0, 0.03574675)
(1, 0.89276516)
(2, 0.035746574)
(3, 0.035741527)
(0, 0.03723842)
(1, 0.041193306)
(2, 0.88412565)
(3, 0.037442673)
(0, 0.035761606)
(1, 0.035762485)
(2, 0.892722)
(3, 0.035753936)
(0, 0.06254633)
(1, 0.06254765)
(2, 0.062546715)
(3, 0.8123593)
(0, 0.8741862)
(1, 0.042387895)
(2, 0.04171683)
(3, 0.041709013)
(0, 0.014913099)
(1, 0.014771621)
(2, 0.9551235)
(3, 0.0151918195)
(0, 0.01830944)
(1, 0.01833912)
(2, 0.017971164)
(3, 0.9453803)
(0, 0.9457414)
(1, 0.018180571)
(2, 0.017957322)
(3, 0.018120686)
(0, 0.017087273)
(1, 0.016922036)
(2, 0.017002387)
(3, 0.9489883)
(0, 0.022759194)
(1, 0.

(0, 0.015972368)
(1, 0.015783867)
(2, 0.015806451)
(3, 0.95243734)
(0, 0.9159498)
(1, 0.028038833)
(2, 0.028030893)
(3, 0.027980521)
(0, 0.026020013)
(1, 0.9230033)
(2, 0.025827976)
(3, 0.025148753)
(0, 0.021503624)
(1, 0.020325992)
(2, 0.93884045)
(3, 0.019329933)
(0, 0.021485124)
(1, 0.021948641)
(2, 0.021818813)
(3, 0.93474746)
(0, 0.025034867)
(1, 0.924901)
(2, 0.025034722)
(3, 0.025029346)
(0, 0.02129903)
(1, 0.9358227)
(2, 0.021359146)
(3, 0.021519147)
(0, 0.9241914)
(1, 0.025399946)
(2, 0.02532726)
(3, 0.025081327)
(0, 0.01925603)
(1, 0.019660441)
(2, 0.9416378)
(3, 0.019445723)
(0, 0.031816557)
(1, 0.03216883)
(2, 0.033603985)
(3, 0.9024106)
(0, 0.54759043)
(1, 0.02612147)
(2, 0.026198728)
(3, 0.40008932)
(0, 0.020919329)
(1, 0.021433817)
(2, 0.19504431)
(3, 0.76260257)
(0, 0.014825213)
(1, 0.015268436)
(2, 0.28472224)
(3, 0.6851841)
(3, 0.9814446)
(1, 0.97792786)
(0, 0.023119807)
(1, 0.023014698)
(2, 0.02297965)
(3, 0.93088585)
(0, 0.025494713)
(1, 0.02566305)
(2, 0.9227953)
(

(0, 0.9401786)
(1, 0.01971253)
(2, 0.019766593)
(3, 0.020342322)
(0, 0.025217798)
(1, 0.025751268)
(2, 0.02600069)
(3, 0.92303026)
(0, 0.018128963)
(1, 0.018029297)
(2, 0.94556165)
(3, 0.018280102)
(0, 0.028026508)
(1, 0.028268734)
(2, 0.915552)
(3, 0.028152738)
(0, 0.9233646)
(1, 0.025103927)
(2, 0.026042618)
(3, 0.025488853)
(0, 0.042166848)
(1, 0.04173096)
(2, 0.041827906)
(3, 0.87427425)
(0, 0.015547575)
(1, 0.95412666)
(2, 0.014850215)
(3, 0.015475526)
(0, 0.01397971)
(1, 0.95781887)
(2, 0.014192156)
(3, 0.014009338)
(0, 0.960217)
(1, 0.013258891)
(2, 0.013268088)
(3, 0.013256022)
(0, 0.9737796)
(0, 0.014858445)
(1, 0.014896795)
(2, 0.9552173)
(3, 0.01502743)
(0, 0.0228314)
(1, 0.023119891)
(2, 0.023625111)
(3, 0.93042356)
(0, 0.9459996)
(1, 0.017948678)
(2, 0.01804701)
(3, 0.018004665)
(0, 0.8082961)
(1, 0.066578194)
(2, 0.06258412)
(3, 0.062541604)
(0, 0.8648353)
(1, 0.04227997)
(2, 0.046941813)
(3, 0.045942836)
(0, 0.86607707)
(1, 0.045643613)
(2, 0.042613905)
(3, 0.04566537)
(