##### Author: Weisi Chen
##### Last update: 1 May 2023

In [1]:
# Importing necessary Python libraries for this exercise
import pandas as pd
import xml.etree.ElementTree as et
import joblib

### Function to read one AFR XML file

In [2]:
# The function to read AFR news from a given XML file
# selected_type is an optional option that generates a filtered table by the news type;
# The value can be one of the following: 
#       'Domain Prestige', 'Companies and Markets', 'News',
#       'Chanticleer', 'Perspective', 'Weekend Fin', 'Opinion',
#       'Stock Tables', 'Smart Investor', 'Poster', 'World', 'Market Wrap',
#       'Property', 'Features', 'Life & Leisure', 'Financial Services',
#       'Review', 'Accounting', 'Marketing & Media', 'Education',
#       'Saleroom', 'Computers', 'Supplement'

def read_afr(xml_file, selected_type = ""):
    xtree = et.parse(xml_file)
    dates_all = []
    news_texts_all = []
    headlines_all = []
    sections_all = []

    for node in xtree.iter('TEXT'):
        news_text = ""
        for subnode in node.iter('p'):
            whole = subnode.itertext()
            for parts in whole:
                news_text += parts
        news_texts_all.append(news_text)

    for node in xtree.iter('SECTION'):
        sections_all.append(node.text)
        
    for node in xtree.iter('PUBLICATIONDATE'):
        dates_all.append(node.text)
        
    # print(len(headlines_all), len(dates_all), len(news_texts_all))
    news_df = pd.DataFrame(
        {'date': dates_all,
         # 'headline': headlines_all,
         'text': news_texts_all,
         'section': sections_all})
    
    if(selected_type):
        print("Selected News Types: ", selected_type)
        news_df = news_df.loc[news_df['section'] == selected_type]
    
    return news_df

### Read all XML files within a folder

In [3]:
import glob
import pandas as pd
import numpy as np
filenames = sorted(glob.glob('data/AFR*.xml'))
# filenames = filenames[0:1]
docs = []
flag = True
df = pd.DataFrame()
for filename in filenames:
    print(filename)
    news = read_afr(filename, "Companies and Markets")
    if flag:
        df = pd.DataFrame(news['date'],columns=(['date']))
        df['text']=news['text']
        df['section']=news['section']
        flag = False
    else:
        df = pd.concat([df,news],axis=0)
print(df)

data\AFR_20150101-20150131.xml
Selected News Types:  Companies and Markets
data\AFR_20150201-20150228.xml
Selected News Types:  Companies and Markets
data\AFR_20150301-20150331.xml
Selected News Types:  Companies and Markets
data\AFR_20150401-20150430.xml
Selected News Types:  Companies and Markets
data\AFR_20150501-20150531.xml
Selected News Types:  Companies and Markets
data\AFR_20150601-20150630.xml
Selected News Types:  Companies and Markets
data\AFR_20150701-20150731.xml
Selected News Types:  Companies and Markets
data\AFR_20150801-20150831.xml
Selected News Types:  Companies and Markets
data\AFR_20150901-20150930.xml
Selected News Types:  Companies and Markets
data\AFR_20151001-20151031.xml
Selected News Types:  Companies and Markets
data\AFR_20151101-20151130.xml
Selected News Types:  Companies and Markets
data\AFR_20151201-20151231.xml
Selected News Types:  Companies and Markets
data\AFR_20160101-20160131.xml
Selected News Types:  Companies and Markets
data\AFR_20160201-2016022

### Count of sentences and words in the texts

In [5]:
# average sentence and word length
sum = 0
for row in df['text']:
    sum += row.count('.')
print("sentence number:",sum/len(df['text']))

sum = 0
for row in df['text']:
    sum += row.count(' ')
print("word number:",sum/len(df['text']))

sentence number: 29.124973849372385
word number: 590.1399058577406


### Data preprocessing and cleaning

In [4]:
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

In [5]:
#clean all the text. 
FullData = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    lemma = WordNetLemmatizer()
    text = ' '.join(lemma.lemmatize(word) for word in text.split())
    return text
df['clean_text'] = df['text'].apply(clean_text).str.replace('\d+', '')

### Executing LDA

In [14]:
# Importing sklearn and functions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import time
start_time = time.time()
N_TOPIC = 100
# Using the Count vectorizer
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(df['clean_text'])

In [15]:
# Initialise the LDA model, and then fit the model using TF or TFIDF.
lda_tf = LatentDirichletAllocation(n_components=N_TOPIC, learning_method='online',
                                   random_state=42, max_iter=10) 

# TF DTM
lda_top_tf = lda_tf.fit_transform(dtm_tf)

print("--- %s seconds ---" % (time.time() - start_time))

# saving model
import joblib
model_filename = 'lda_tf_model_'.strip() + str(N_TOPIC).strip() + '.jl'.strip()
joblib.dump(lda_tf, model_filename)

--- 1950.5870633125305 seconds ---


['lda_tf_model_100.jl']

In [None]:
# Using the TF-IDF vectorizer
# tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features=1000)
# dtm_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])
# TFIDF DTM
# lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
# lda_top_tfidf = lda_tfidf.fit_transform(dtm_tfidf)

In [7]:
# loading model
lda_tf_20=joblib.load('lda_tf_model_20.jl')
#lda_tf_50=joblib.load('lda_tf_model_50.jl')
lda_tf_100=joblib.load('lda_tf_model_100.jl')
#lda_tf_500=joblib.load('lda_tf_model_500.jl')

In [None]:
# Print the LDA results using TF (Count Vectorizer)
print(lda_tf.components_)
print(lda_tf.components_.shape)
print('perplexity: ')
print(lda_tf.perplexity(dtm_tf, sub_sampling=False))

In [None]:
# Print the LDA results using TF-IDF
# print(lda_tfidf.components_)
# print(lda_tfidf.components_.shape)
# print('perplexity: ')
# print(lda_tfidf.perplexity(dtm_tfidf, sub_sampling=False))

In [9]:
print("Document 2: ")
for i,topic in enumerate(lda_top_tf[2]):
    print("Topic ",i,": ",topic*100,"%")

Document 2: 
Topic  0 :  0.011682243336815615 %
Topic  1 :  0.7469126235595284 %
Topic  2 :  0.011682243279743106 %
Topic  3 :  0.011682243302025268 %
Topic  4 :  27.65166032072448 %
Topic  5 :  0.011682243124796736 %
Topic  6 :  6.74410568424459 %
Topic  7 :  12.119470659605742 %
Topic  8 :  33.44822797360226 %
Topic  9 :  0.011682243208406669 %
Topic  10 :  0.011682243215414135 %
Topic  11 :  14.275721848642492 %
Topic  12 :  0.011682243163956564 %
Topic  13 :  0.011682243205807468 %
Topic  14 :  1.738998479924205 %
Topic  15 :  0.011682243195455065 %
Topic  16 :  1.8559061974896331 %
Topic  17 :  1.2904915369364034 %
Topic  18 :  0.01168224305176054 %
Topic  19 :  0.011682243186493033 %


In [20]:
vocab = tf_vectorizer.get_feature_names_out()
for i, comp in enumerate(lda_tf.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
say time way need make people like think going big 

Topic 1: 
adam soul sea suez brickwork sexual softbank flu california canada 

Topic 2: 
rate bank economy growth central economic inflation reserve fed policy 

Topic 3: 
wilson progressive foundation pearce exclude aberdeen beddow preston barker potter 

Topic 4: 
ubs autonomous lawcock decentralised magnetite sino citic grange mineralogy glyn 

Topic 5: 
crown casino star packer hotel gaming resort gambling sydney operator 

Topic 6: 
patts unanimously groundwater arizona delegation barlow sacred crouch spacex torres 

Topic 7: 
organic packaging plastic amcor farrell delia abotomey glove protective ansell 

Topic 8: 
woman csl female diversity recall hunt elder bramble men blood 

Topic 9: 
infection tomlinson hayes hospitalisation ballooning calvary socialist aws stockpiling taxonomy 

Topic 10: 
energy power solar electricity renewable generation plant agl storage battery 

Topic 11: 
wisetech pursuit memory adairs di

profit share earnings growth billion dividend result analyst cost revenue 

Topic 96: 
gallagher royalty contract village mongolia outflow sand renewal paint inflow 

Topic 97: 
examining gorge deceptive misleading wam giles golf bookie walton charitable 

Topic 98: 
blackmore bunting tdm cowan initiating unanswered bookmaking honan companya gasmr 

Topic 99: 
executive chief board director chairman group ceo financial role management 



In [20]:
vocab = tf_vectorizer.get_feature_names_out()
for i, comp in enumerate(lda_tf_20.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:30]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
financial commission report court regulator claim review action government case 

Topic 1: 
bank loan credit financial banking capital billion customer rate risk 

Topic 2: 
rate economy global china economic bond world central investor policy 

Topic 3: 
project group construction contract infrastructure road toll building billion contractor 

Topic 4: 
share shareholder board deal group investor offer capital executive director 

Topic 5: 
crown network medium telstra mobile casino nbn news service content 

Topic 6: 
share price growth profit earnings month billion stock analyst result 

Topic 7: 
say people time executive chief big like make think way 

Topic 8: 
energy power solar electricity vehicle car renewable battery generation wind 

Topic 9: 
coal port rail union queensland worker thermal aurizon terminal agreement 

Topic 10: 
sale store retailer brand retail woolworth online food customer product 

Topic 11: 
government project supply industry plant gas energy e

### Calculating Coherence Score

In [15]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
import gensim.corpora as corpora

In [16]:
def get_Cv(model, df_columnm):
    topics = model.components_
    n_top_words = 20
    texts = [[word for word in doc.split()] for doc in df_columnm]
    # create the dictionary
    dictionary = corpora.Dictionary(texts)
    # Create a gensim dictionary from the word count matrix
    # Create a gensim corpus from the word count matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    feature_names = [dictionary[i] for i in range(len(dictionary))]
    # Get the top words for each topic from the components_ attribute
    top_words = []
    for topic in topics:
        top_words.append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
    coherence_model = CoherenceModel(topics=top_words, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    return coherence

In [17]:
def get_Umass(model, df_columnm):
    topics = model.components_
    n_top_words = 20
    texts = [[word for word in doc.split()] for doc in df_columnm]
    # create the dictionary
    dictionary = corpora.Dictionary(texts)
    # Create a gensim dictionary from the word count matrix
    # Create a gensim corpus from the word count matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    feature_names = [dictionary[i] for i in range(len(dictionary))]
    # Get the top words for each topic from the components_ attribute
    top_words = []
    for topic in topics:
        top_words.append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
    coherence_model = CoherenceModel(topics=top_words, texts=texts, dictionary=dictionary, coherence='u_mass')
    coherence = coherence_model.get_coherence()
    return coherence

In [None]:
coherence_cv = get_Cv(lda_tf_20,df['clean_text'])
print(coherence_cv)
coherence_umass = get_Umass(lda_tf_20,df['clean_text'])
print(coherence_umass)

In [28]:
coherence_cv = get_Cv(lda_tf_50,df['clean_text'])
print(coherence_cv)
coherence_umass = get_Umass(lda_tf_50,df['clean_text'])
print(coherence_umass)

0.5856644661189552
-17.13975540062799


In [29]:
coherence_cv = get_Cv(lda_tf_100,df['clean_text'])
print(coherence_cv)
coherence_umass = get_Umass(lda_tf_100,df['clean_text'])
print(coherence_umass)

0.5864202981780567
-17.21704266897991


In [30]:
coherence_cv = get_Cv(lda_tf_500,df['clean_text'])
print(coherence_cv)
coherence_umass = get_Umass(lda_tf_500,df['clean_text'])
print(coherence_umass)

0.5905801352521686
-17.22191333979769


In [16]:
 # #------------------------  pyLDAvis visualisation   -------------------------
import pyLDAvis.lda_model
import pyLDAvis

pyLDAvis.enable_notebook()

data = pyLDAvis.lda_model.prepare(lda_tf, dtm_tf, tf_vectorizer)
print(data)

pyLDAvis.show(data, local = False)
pyLDAvis.save_json(data,'fileobj.html')

PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
0      0.209979 -0.181601       1        1  16.765831
95     0.256229 -0.120966       2        1   7.717867
49     0.267625 -0.104464       3        1   5.277460
21     0.251136 -0.081986       4        1   4.672997
99     0.218792 -0.077639       5        1   3.776451
...         ...       ...     ...      ...        ...
14    -0.184320 -0.117605      96        1   0.017992
69    -0.192510 -0.148727      97        1   0.014117
98    -0.191106 -0.142903      98        1   0.013660
45    -0.193451 -0.152595      99        1   0.012376
81    -0.194328 -0.157216     100        1   0.011732

[100 rows x 5 columns], topic_info=             Term          Freq         Total  Category  logprob  loglift
2193         bank  48591.000000  48591.000000   Default  30.0000  30.0000
24783       share  43732.000000  43732.000000   Default  29.0000  29.0000
11206    

127.0.0.1 - - [24/Apr/2023 11:57:28] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [24/Apr/2023 11:57:31] code 404, message Not Found
127.0.0.1 - - [24/Apr/2023 11:57:31] "GET /favicon.ico HTTP/1.1" 404 -



stopping Server...


In [17]:
pyLDAvis.save_json(data,'fileobj.html')

### (Exploratory) Train LDA using octis

In [None]:
test = df[:300]
test = test.drop('date', axis=1)

In [None]:
test.to_csv('testdata/corpus.tsv',sep='\t',index=None)

In [None]:
import os
import string
from octis.preprocessing.preprocessing import Preprocessing
os.chdir(os.path.pardir)

# Initialize preprocessing
preprocessor = Preprocessing(vocabulary=4, max_features=4,
                             remove_punctuation=True, punctuation=string.punctuation,
                             lemmatize=True, stopword_list='english',
                             min_chars=1, min_words_docs=0)


In [None]:
# preprocess
dataset = preprocessor.preprocess_dataset(documents_path='testdata/corpus.tsv', labels_path='testdata/vocabulary.txt')
# save the preprocessed dataset
dataset.save('hello_dataset')

In [None]:
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

model = LDA(num_topics=25)  # Create model
model_output = model.train_model(dataset) # Train the model

In [None]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
metric = TopicDiversity(topk=10) # Initialize metric
topic_diversity_score = metric.score(model_output) # Compute score of the metric