#Data loading and pre-processing

In [1]:
import numpy as np
import pandas as pd


In [2]:
dataframe = pd.read_csv("https://raw.githubusercontent.com/R4pidAce/PDNA8411/main/abcnews-date-text.csv")


In [3]:
dataframe

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1244179,20211231,two aged care residents die as state records 2...
1244180,20211231,victoria records 5;919 new cases and seven deaths
1244181,20211231,wa delays adopting new close contact definition
1244182,20211231,western ringtail possums found badly dehydrate...


In [4]:
text_data=dataframe[:5000][['headline_text']];

In [5]:
text_data['index']=text_data.index

In [6]:
docs=text_data

In [7]:
print(len(docs))

5000


### checking the data has been well loaded

In [8]:
docs.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [9]:
docs.isnull().sum()

headline_text    0
index            0
dtype: int64

# Remove stop words

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


##Standardization to lower case and stemming of the tokens

In [11]:
corpus=[]
for i in range(0,len(docs)):
    text=re.sub('[^a-zA-z]',' ',docs['headline_text'][i])
    text=text.lower()
    text=text.split()
    text=[PorterStemmer().stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)

In [12]:
for i in range(0,len(corpus)):
    docs['headline_text'][i]=corpus[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  docs['headline_text'][i]=corpus[i]


In [13]:
docs[["headline_text"]]

Unnamed: 0,headline_text
0,aba decid commun broadcast licenc
1,act fire wit must awar defam
2,g call infrastructur protect summit
3,air nz staff aust strike pay rise
4,air nz strike affect australian travel
...,...
4995,slater star blue day one
4996,soprano film delay contract disput
4997,souri outlin region road fund
4998,south east water license pay levi


In [14]:
processed_docs=docs['headline_text']

## Checking the Frequency of words or counting them

In [15]:
import gensim

In [16]:
for i in range(0,len(processed_docs)):
    processed_docs[i]=processed_docs[i].split(' ')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_docs[i]=processed_docs[i].split(' ')


In [17]:
processed_docs

0             [aba, decid, commun, broadcast, licenc]
1                 [act, fire, wit, must, awar, defam]
2           [g, call, infrastructur, protect, summit]
3           [air, nz, staff, aust, strike, pay, rise]
4       [air, nz, strike, affect, australian, travel]
                            ...                      
4995                   [slater, star, blue, day, one]
4996         [soprano, film, delay, contract, disput]
4997              [souri, outlin, region, road, fund]
4998         [south, east, water, license, pay, levi]
4999         [sri, lanka, hope, new, zealand, defeat]
Name: headline_text, Length: 5000, dtype: object

In [18]:
dictionary=gensim.corpora.Dictionary(processed_docs)
for k,v in dictionary.iteritems():
    print(k,v)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
126 toughen
127 bank
128 commonwealth
129 cut
130 fix
131 home
132 loan
133 rate
134 help
135 homeless
136 youth
137 chief
138 execut
139 fail
140 posit
141 contest
142 councillor
143 independ
144 wollongong
145 garden
146 heritag
147 ta
148 ambul
149 decis
150 levi
151 welcom
152 breakthrough
153 insur
154 alp
155 crean
156 critic
157 leadership
158 shut
159 tell
160 dargo
161 expect
162 threat
163 climb
164 continu
165 death
166 korean
167 south
168 subway
169 toll
170 conflict
171 dem
172 hold
173 iraqi
174 plebiscit
175 dent
176 down
177 philippoussi
178 thriller
179 tie
180 de
181 fate
182 learn
183 march
184 villier
185 becom
186 commonplac
187 digit
188 tv
189 anger
190 direct
191 govt
192 soldier
193 disput
194 plant
195 process
196 smithton
197 veget
198 dog
199 maul
200 month
201 nsw
202 old
203 toddler
204 die
205 passeng
206 phone
207 chang
208 england
209 three
210 wale
211 chemic
212 clean
213 cost
214 epa
2

In [19]:
len(dictionary)

5126

In [20]:
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=1000)

In [21]:
len(dictionary)

407

In [22]:
BoW_corpus=[dictionary.doc2bow(doc) for doc in processed_docs]

In [23]:
BoW_corpus[100]

[(46, 1), (160, 1)]

In [24]:
bow_corpus_100=BoW_corpus[100]
for i in range(len(bow_corpus_100)):
    print("Word {} (\"{}\") appears {} time.".format(bow_corpus_100[i][0],
                                                     dictionary[bow_corpus_100[i][0]],
                                                     bow_corpus_100[i][1]))

Word 46 ("urg") appears 1 time.
Word 160 ("women") appears 1 time.


## TF-IDF on our document set

In [25]:
from gensim import corpora,models
tfidf=models.TfidfModel(BoW_corpus)

In [26]:
import numpy as np
tfidf = models.TfidfModel(BoW_corpus)
for doc in tfidf[BoW_corpus]:
   print([[dictionary[id], np.around(freq,decimals=2)] for id, freq in doc])

[['commun', 1.0]]
[['act', 0.78], ['fire', 0.62]]
[['call', 0.56], ['protect', 0.83]]
[['air', 0.39], ['aust', 0.38], ['nz', 0.44], ['pay', 0.41], ['rise', 0.4], ['strike', 0.44]]
[['air', 0.45], ['nz', 0.51], ['strike', 0.51], ['australian', 0.53]]
[['win', 1.0]]
[['break', 0.74], ['record', 0.67]]
[['aussi', 0.6], ['four', 0.56], ['match', 0.57]]
[['aust', 0.48], ['council', 0.41], ['iraq', 0.36], ['secur', 0.5], ['un', 0.48]]
[['australia', 0.59], ['opp', 0.65], ['war', 0.48]]
[['iraq', 0.34], ['australia', 0.5], ['aid', 0.56], ['million', 0.56]]
[['record', 0.74], ['take', 0.67]]
[['ahead', 0.66], ['move', 0.6], ['plan', 0.45]]
[['big', 0.59], ['championship', 0.62], ['hope', 0.52]]
[['plan', 0.34], ['big', 0.51], ['boost', 0.44], ['suppli', 0.54], ['water', 0.39]]
[['bill', 0.59], ['state', 0.58], ['unit', 0.56]]
[['report', 0.65], ['troop', 0.76]]
[['troop', 0.68], ['british', 0.73]]
[['win', 0.49], ['doubl', 0.66], ['lead', 0.57]]
[['bushfir', 0.57], ['urg', 0.53], ['victim', 0.

In [None]:
lda_model=gensim.models.LdaMulticore(BoW_corpus,
                                    num_topics=10,
                                    id2word=dictionary,
                                    passes=20)

In [None]:
for idx,topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

In [None]:
initial_processed_docs

## Use a pipeline to automate the whole process of LDA - feeding everything to the pipeline

In [None]:
!pip install sklearn.pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler
import nltk
from sklearn.feature_extraction.text  import TfidfTransformer
nltk.download('punkt')

In [None]:
docz = dataframe['headline_text']
docz

### working with a few documents to speed up the process

In [None]:
fewer_docs = []

for i in range (5000):
  fewer_docs.append(docz[i])

# Define the stemmer
stemmer = PorterStemmer()
# Define the pipeline steps
pipeline = Pipeline([
    ('tokenize', CountVectorizer(tokenizer=word_tokenize)),  # Tokenization
    ('stem', TfidfTransformer(use_idf=True)),  # Stemming
     ('standardize', StandardScaler(with_mean=False)),
    ('lda', LatentDirichletAllocation(n_components=50))  # LDA model
])



# Fit the pipeline to the data
pipeline.fit(fewer_docs)

# Get the learned topics
feature_names = pipeline.named_steps['tokenize'].get_feature_names_out()
lda_model = pipeline.named_steps['lda']
num_top_words = 50  # Number of top words per topic

for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
    print(f"Topic #{topic_idx+1}: {' '.join(top_words)}")


In [None]:
# Calculate perplexity
perplexity = pipeline.score(fewer_docs)
print("Perplexity:", perplexity)

In [None]:
!pip install scikit-metrics

# Performance visualization of the LDA model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get the learned topics
feature_names = pipeline.named_steps['tokenize'].get_feature_names_out()
lda_model = pipeline.named_steps['lda']
num_top_words = 5  # Number of top words per topic

# Topic-Word Distribution
fig, ax = plt.subplots(figsize=(10, 6))
for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
    ax.barh(f"Topic #{topic_idx+1}", topic[:num_top_words][::-1], align='center')
    ax.set_xlabel("Word Importance")
    ax.set_ylabel("Topic")
    ax.set_yticks(range(len(lda_model.components_)))
    ax.set_yticklabels([f"Topic #{i+1}" for i in range(len(lda_model.components_))])
    ax.set_title("Topic-Word Distribution")
plt.tight_layout()
plt.show()



### Topic similarity matrix

In [None]:
import numpy as np

# Calculate topic similarity matrix
topic_similarity = np.zeros((len(lda_model.components_), len(lda_model.components_)))

for i in range(len(lda_model.components_)):
    for j in range(i, len(lda_model.components_)):
        similarity = np.dot(lda_model.components_[i], lda_model.components_[j]) / (
                    np.linalg.norm(lda_model.components_[i]) * np.linalg.norm(lda_model.components_[j]))
        topic_similarity[i][j] = similarity
        topic_similarity[j][i] = similarity

# Visualize topic similarity matrix as a heatmap
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(topic_similarity, cmap='Blues', annot=True, fmt=".2f", ax=ax)
ax.set_xlabel("Topic")
ax.set_ylabel("Topic")
ax.set_title("Topic Similarity Matrix")
plt.tight_layout()
plt.show()