In [1]:
import pandas as pd


In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
from sklearn.decomposition import TruncatedSVD


In [41]:
from sklearn.decomposition import NMF


In [2]:
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)


In [3]:
data.size

2372036

In [4]:
data=data[:500]

In [5]:
data.size

1000

In [6]:
print(len(data))


500


In [9]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
Since I am only intersted in headline_text

In [7]:
documents = data[['headline_text']]
documents['index'] = documents.index
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [8]:
#remove special characters
documents['clean_documents'] = documents['headline_text'].str.replace("[^a-zA-Z#]", " ")
#remove words have letters less than 3
documents['clean_documents'] = documents['clean_documents'].fillna('').apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
#lowercase all characters
documents['clean_documents'] = documents['clean_documents'].fillna('').apply(lambda x: x.lower())

In [12]:

stop_words = stopwords.words('english')
# tokenization
tokenized_doc = documents['clean_documents'].fillna('').apply(lambda x: x.split())
# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# de-tokenization
detokenized_doc = []
for i in range(len(documents)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
documents['clean_documents'] = detokenized_doc

In [14]:
documents.head()

Unnamed: 0,headline_text,index,clean_documents
0,aba decides against community broadcasting lic...,0,aba decides community broadcasting licence
1,act fire witnesses must be aware of defamation,1,act fire witnesses must aware defamation
2,a g calls for infrastructure protection summit,2,calls infrastructure protection summit
3,air nz staff in aust strike for pay rise,3,air staff aust strike pay rise
4,air nz strike to affect australian travellers,4,air strike affect australian travellers


In [34]:

tokenized_doc[:1]

0    [aba, decides, community, broadcasting, licence]
Name: clean_documents, dtype: object

In [35]:
# Create Dictionary
id2word = corpora.Dictionary(tokenized_doc)

# Create Corpus
texts = tokenized_doc

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [36]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [39]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.052*"korean" + 0.041*"continues" + 0.033*"help" + 0.033*"calls" + 0.028*"drug" + 0.028*"toll" + 0.026*"summit" + 0.022*"subway" + 0.017*"community" + 0.017*"climb"
Topic: 1 Word: 0.034*"air" + 0.032*"wins" + 0.023*"low" + 0.021*"home" + 0.021*"defamation" + 0.021*"river" + 0.019*"forces" + 0.019*"cuts" + 0.019*"aware" + 0.019*"must"
Topic: 2 Word: 0.133*"war" + 0.050*"aid" + 0.036*"million" + 0.031*"call" + 0.030*"ethanol" + 0.026*"iraq" + 0.023*"chief" + 0.019*"australia" + 0.015*"fuel" + 0.015*"become"
Topic: 3 Word: 0.039*"profit" + 0.026*"records" + 0.026*"spill" + 0.023*"firefighters" + 0.023*"policy" + 0.014*"leaves" + 0.012*"third" + 0.005*"net" + 0.005*"successive" + 0.005*"freedom"
Topic: 4 Word: 0.072*"injured" + 0.053*"council" + 0.038*"record" + 0.026*"security" + 0.023*"club" + 0.023*"decision" + 0.021*"take" + 0.015*"open" + 0.014*"smoking" + 0.010*"birthday"
Topic: 5 Word: 0.117*"fire" + 0.113*"govt" + 0.041*"still" + 0.028*"tells" + 0.022*"crean" + 0.01

In [17]:
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000, # keep top 1000 terms 
max_df = 0.5, 
smooth_idf=True)

X = vectorizer.fit_transform(documents['clean_documents'])

X.shape # check shape of the document-term matrix

(500, 1000)

In [21]:

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(X)

len(svd_model.components_)





10

In [22]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
        print(" ")

Topic 0: 
nsw
 
govt
 
opp
 
hanson
 
war
 
drought
 
iraq
 
Topic 1: 
war
 
council
 
iraq
 
anti
 
protesters
 
criticism
 
man
 
Topic 2: 
man
 
court
 
face
 
new
 
council
 
murder
 
injured
 
Topic 3: 
council
 
chief
 
offer
 
welcomes
 
land
 
decision
 
security
 
Topic 4: 
funds
 
help
 
korean
 
subway
 
allocated
 
victims
 
miss
 
Topic 5: 
new
 
high
 
british
 
claim
 
court
 
asylum
 
govt
 
Topic 6: 
injured
 
crash
 
head
 
highway
 
nightclub
 
new
 
claim
 
Topic 7: 
korean
 
subway
 
death
 
toll
 
missing
 
continues
 
south
 
Topic 8: 
iraq
 
plan
 
million
 
pay
 
second
 
british
 
police
 
Topic 9: 
claim
 
plan
 
raid
 
police
 
water
 
embassy
 
aboriginal
 


In [32]:
# Importing Gensim
import gensim
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc )

# Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_doc]


In [33]:
lda_model = gensim.models.LdaMulticore(X, num_topics=10, id2word=dictionary, passes=2, workers=2)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [24]:
from gensim import corpora, models


In [26]:
import gensim
