##### Author: Weisi Chen
##### Last update: 1 May 2023

In [1]:
# Importing necessary Python libraries for this Exercise
import pandas as pd
import xml.etree.ElementTree as et
import io
from bertopic import BERTopic

SystemError: initialization of _internal failed without raising an exception

### Function to read one AFR XML file

In [2]:
# The function to read AFR news from a given XML file
# selected_type is an optional option that generates a filtered table by the news type;
# The value can be one of the following: 
#       'Domain Prestige', 'Companies and Markets', 'News',
#       'Chanticleer', 'Perspective', 'Weekend Fin', 'Opinion',
#       'Stock Tables', 'Smart Investor', 'Poster', 'World', 'Market Wrap',
#       'Property', 'Features', 'Life & Leisure', 'Financial Services',
#       'Review', 'Accounting', 'Marketing & Media', 'Education',
#       'Saleroom', 'Computers', 'Supplement'

def read_afr(xml_file, selected_type = ""):
    xtree = et.parse(xml_file)
    dates_all = []
    news_texts_all = []
    headlines_all = []
    sections_all = []

    for node in xtree.iter('TEXT'):
        news_text = ""
        for subnode in node.iter('p'):
            whole = subnode.itertext()
            for parts in whole:
                news_text += parts
        news_texts_all.append(news_text)

    for node in xtree.iter('SECTION'):
        sections_all.append(node.text)
        
    for node in xtree.iter('PUBLICATIONDATE'):
        dates_all.append(node.text)
        
    # print(len(headlines_all), len(dates_all), len(news_texts_all))
    news_df = pd.DataFrame(
        {'date': dates_all,
         # 'headline': headlines_all,
         'text': news_texts_all,
         'section': sections_all})
    
    if(selected_type):
        print("Selected News Types: ", selected_type)
        news_df = news_df.loc[news_df['section'] == selected_type]
    
    return news_df

### Read all XML files within a folder

In [72]:
import glob

filenames = sorted(glob.glob('./data/AFR*.xml'))
# filenames = filenames[0:3]
docs = []
news = pd.DataFrame()

for filename in filenames:
    print(filename)
    temp = read_afr(filename, "Companies and Markets")
    news = pd.concat([news,temp], axis=0)
    # docs = docs + news['text'].values.tolist()
    
docs = news['text'].values.tolist()  
print("The total data size is: ", len(docs), ".")

./data\AFR_20150101-20150131.xml
Selected News Types:  Companies and Markets
./data\AFR_20150201-20150228.xml
Selected News Types:  Companies and Markets
./data\AFR_20150301-20150331.xml
Selected News Types:  Companies and Markets
./data\AFR_20150401-20150430.xml
Selected News Types:  Companies and Markets
./data\AFR_20150501-20150531.xml
Selected News Types:  Companies and Markets
./data\AFR_20150601-20150630.xml
Selected News Types:  Companies and Markets
./data\AFR_20150701-20150731.xml
Selected News Types:  Companies and Markets
./data\AFR_20150801-20150831.xml
Selected News Types:  Companies and Markets
./data\AFR_20150901-20150930.xml
Selected News Types:  Companies and Markets
./data\AFR_20151001-20151031.xml
Selected News Types:  Companies and Markets
./data\AFR_20151101-20151130.xml
Selected News Types:  Companies and Markets
./data\AFR_20151201-20151231.xml
Selected News Types:  Companies and Markets
./data\AFR_20160101-20160131.xml
Selected News Types:  Companies and Markets

In [90]:
news['date']

3       20150131
4       20150131
5       20150131
23      20150131
24      20150131
          ...   
2583    20211201
2584    20211201
2585    20211201
2586    20211201
2587    20211201
Name: date, Length: 38240, dtype: object

### Code for BERTopic

In [9]:
import time
start_time = time.time()
topic_model = BERTopic()
# topic_model = BERTopic(verbose=True, n_gram_range=(1, 3))
topics, probs = topic_model.fit_transform(docs)
print("--- %s seconds ---" % (time.time() - start_time))

topic_model.save("BERTopic_model")
fig = topic_model.visualize_topics()
fig.write_html("BERTopic_viz1.html")
fig2 = topic_model.visualize_hierarchy()
fig2.write_html("BERTopic_viz2.html")

--- 2632.646096229553 seconds ---


  self._set_arrayXarray(i, j, x)


In [84]:
topic_model = BERTopic.load("./BERTopic_model")

In [85]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,9806,-1_the_are_is_of
1,0,643,0_telstra_nbn_mobile_tpg
2,1,604,1_p28_p30_p26_p29
3,2,392,2_index_stocks_earnings_cent
4,3,354,3_chinese_chinas_china_yuan
...,...,...,...
604,603,10,603_catapult_crowd1_miggster_gaming
605,604,10,604_cfsgam_cfs_countplus_colonial
606,605,10,605_roden_westpac_johnston_traders
607,606,10,606_russia_putin_russian_ukraine


In [93]:
output_df = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
output_df['date'] = news['date'].values

In [119]:
output_df.loc[output_df['Topic'] == 66]

Unnamed: 0,Document,Topic,date,sentiment
2303,A former Westpac Bank finance manager is facin...,66,20150602,neutral
3030,The heads of Westpac and AMP have told a Senat...,66,20150811,neutral
4689,Westpac shareholders have hit the bank with a ...,66,20151212,negative
4704,Westpac's annual general meeting looms as a We...,66,20151211,neutral
6511,There’s probably a very good reason why most p...,66,20160407,neutral
...,...,...,...,...
37329,Reserve Bank of New Zealand deputy governor Ge...,66,20211126,negative
37343,It was Westpac chairman John McFarlane who bes...,66,20211126,negative
37945,"Westpac Banking Corp has received a ""first str...",66,20211216,negative
38027,Three of the big four banks will respond to gr...,66,20211213,negative


In [102]:
from transformers import pipeline
SA_model = pipeline(model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

Downloading (…)lve/main/config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [105]:
SA_result = SA_model(output_df.Document.values.tolist(), truncation=True)

In [109]:
output_df['sentiment'] = [d['label'] for d in SA_result]

In [110]:
output_df

Unnamed: 0,Document,Topic,date,sentiment
0,BC Iron managing director Morgan Ball says he ...,5,20150131,positive
1,"Australia's biggest gold miner, Newcrest Minin...",71,20150131,positive
2,Purchase gives Snowy a vital stake in the elec...,237,20150131,neutral
3,The Wiggins Island Coal Export ­Terminal will ...,378,20150131,negative
4,Seven Group Holdings has positioned itself to ...,117,20150131,positive
...,...,...,...,...
38235,This content is produced by The Australian Fin...,-1,20211201,positive
38236,Andrew Forrest's LNG import venture in Port Ke...,133,20211201,positive
38237,New York | Rising COVID-19 cases and the new o...,-1,20211201,negative
38238,"Cambridge, Massachusetts | The chief executive...",214,20211201,negative


In [116]:
pd.set_option('display.max_rows', 10)

In [120]:
negative_news = output_df.loc[(output_df['Topic'] == 66) & (output_df['sentiment'] == 'negative')]

In [121]:
negative_news.to_csv("neg_news.csv")

In [6]:
topic_model.get_topic(3)

[('chinese', 0.010126039473665744),
 ('chinas', 0.008502930202815237),
 ('china', 0.007933206973262191),
 ('yuan', 0.005144393063565393),
 ('beijing', 0.004660889551810391),
 ('hong', 0.004579232510015234),
 ('kong', 0.00372707774224844),
 ('evergrande', 0.0037170050498207875),
 ('alibaba', 0.003366758977274367),
 ('ant', 0.003317938694148476)]

In [18]:
topic_list = topic_model.get_topics()
topic_list_df = pd.DataFrame.from_dict(topic_list, orient="index")
topic_list_df.to_csv("BERTopic_topic_list.csv")

In [65]:
topic_list_df = pd.DataFrame.from_dict(topic_list, orient="index")
def get_first_term(pair):
    return pair[0]
topic_list_df.apply(get_first_term)
topic_list_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-1,"(the, 0.01722059541028919)","(to, 0.015492582295412064)","(of, 0.015094877016834329)","(and, 0.014884358905934677)","(in, 0.01403237488564945)","(is, 0.012185641669232763)","(that, 0.011954065780269979)","(for, 0.011068393823104443)","(on, 0.009974630620851375)","(it, 0.00955980983094078)"
0,"(the, 0.01679951890423731)","(to, 0.01526286676343492)","(and, 0.015115115041700749)","(in, 0.014857131362953045)","(of, 0.014787513060430253)","(that, 0.011221361454306862)","(is, 0.010988194067384528)","(for, 0.010740504543703904)","(on, 0.010676619813869224)","(its, 0.01021792198490261)"
1,"(the, 0.01813825102424484)","(to, 0.01566415535179648)","(of, 0.015169819307944777)","(and, 0.013640653055051475)","(in, 0.013017564923683974)","(that, 0.011911853448520287)","(is, 0.011837816049514751)","(banks, 0.011791723402384305)","(for, 0.011038977475394151)","(bank, 0.010782140987357542)"
2,"(the, 0.017415544350045494)","(amp, 0.016629085446176254)","(to, 0.016407971109180013)","(of, 0.016035505362838473)","(and, 0.014720139001459059)","(energy, 0.013910444292873289)","(in, 0.013549693494101287)","(that, 0.01342884926378335)","(coal, 0.012358138210459292)","(is, 0.012008025077212587)"
3,"(the, 0.018922518956706748)","(of, 0.016446888895966634)","(to, 0.015336093115184897)","(in, 0.01438908791036216)","(is, 0.014170053045237)","(china, 0.013774019230974923)","(that, 0.0134824976591348)","(and, 0.012762566051921283)","(chinese, 0.012535186865105557)","(us, 0.010456309084627073)"
4,"(qantas, 0.017307943257908038)","(the, 0.017083576087010117)","(airport, 0.015999634357125075)","(to, 0.015858913249548107)","(and, 0.014408764098195598)","(travel, 0.014183837887019895)","(virgin, 0.013937660338082815)","(in, 0.013842506007903344)","(airlines, 0.013838952227031975)","(of, 0.013772555737198305)"
5,"(telstra, 0.024572264951343695)","(nbn, 0.02152840934078943)","(mobile, 0.020549529795687315)","(to, 0.016882867384925657)","(the, 0.01678846341675678)","(network, 0.015829086230457784)","(and, 0.014576837025174911)","(of, 0.014139916655107724)","(tpg, 0.01298433169871761)","(in, 0.012925663289588536)"
6,"(health, 0.01731591145163922)","(the, 0.01606969836780853)","(and, 0.01517801325421624)","(to, 0.015052663537022908)","(in, 0.014909469353895506)","(of, 0.013957248334921839)","(is, 0.011806345577754838)","(for, 0.011792270920389635)","(per, 0.01161639929974799)","(hospital, 0.011541964839780035)"
7,"(to, 0.01605416448925903)","(and, 0.016018979639540968)","(the, 0.01595975047204715)","(of, 0.014749784371169873)","(in, 0.013402982057241875)","(facebook, 0.012158011814431007)","(is, 0.011918961146730559)","(its, 0.01180010039035488)","(for, 0.01118986931612259)","(that, 0.011102356797635307)"
8,"(p28, 0.05934086007083249)","(p30, 0.05180227689285307)","(p26, 0.04785285204479147)","(asx, 0.04369400845790957)","(p29, 0.03316799826756565)","(p27, 0.03271783414950857)","(insideretail, 0.031192076569032112)","(p20, 0.02637024099945963)","(maley, 0.021922014702453873)","(p32, 0.021115208191121837)"


In [6]:
fig3 = topic_model.visualize_barchart()
fig3.write_html("BERTopic_viz3.html")
fig4 = topic_model.visualize_topics(top_n_topics=50)
fig4.write_html("BERTopic_viz4.html")

In [19]:
topic_model.reduce_topics(docs, nr_topics=50)

<bertopic._bertopic.BERTopic at 0x1ace34d0a30>

In [21]:
topic_list = topic_model.get_topics()
topic_list_df = pd.DataFrame.from_dict(topic_list, orient="index")
# topic_list_df.to_csv("BERTopic_topic_list_50.csv")

# df = pd.DataFrame.from_dict(d,orient='index').transpose()
# df = df.fillna(method='ffill')
print(topic_list_df)

                            -1                             0    
0    (the, 0.01722059541028919)    (the, 0.01679951890423731)  \
1    (to, 0.015492582295412064)     (to, 0.01526286676343492)   
2    (of, 0.015094877016834329)   (and, 0.015115115041700749)   
3   (and, 0.014884358905934677)    (in, 0.014857131362953045)   
4     (in, 0.01403237488564945)    (of, 0.014787513060430253)   
5    (is, 0.012185641669232763)  (that, 0.011221361454306862)   
6  (that, 0.011954065780269979)    (is, 0.010988194067384528)   
7   (for, 0.011068393823104443)   (for, 0.010740504543703904)   
8    (on, 0.009974630620851375)    (on, 0.010676619813869224)   
9     (it, 0.00955980983094078)    (its, 0.01021792198490261)   

                              1                               2    
0     (the, 0.01813825102424484)     (the, 0.017415544350045494)  \
1      (to, 0.01566415535179648)     (amp, 0.016629085446176254)   
2     (of, 0.015169819307944777)      (to, 0.016407971109180013)   
3    (and, 0

In [9]:
topics = topic_model.topics_
topics

[0,
 0,
 0,
 0,
 0,
 -1,
 0,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 0,
 0,
 -1,
 -1,
 0,
 -1,
 0,
 0,
 30,
 0,
 23,
 10,
 0,
 0,
 0,
 1,
 2,
 0,
 -1,
 0,
 30,
 0,
 28,
 0,
 -1,
 0,
 -1,
 0,
 0,
 -1,
 1,
 0,
 16,
 1,
 -1,
 -1,
 5,
 -1,
 3,
 9,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 0,
 -1,
 1,
 -1,
 7,
 0,
 0,
 0,
 0,
 -1,
 3,
 0,
 3,
 0,
 -1,
 -1,
 -1,
 0,
 5,
 -1,
 6,
 1,
 1,
 0,
 0,
 -1,
 -1,
 1,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 36,
 0,
 20,
 -1,
 1,
 0,
 -1,
 -1,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 -1,
 0,
 2,
 0,
 -1,
 0,
 -1,
 1,
 0,
 0,
 10,
 4,
 0,
 0,
 -1,
 0,
 -1,
 -1,
 0,
 0,
 1,
 0,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 -1,
 42,
 32,
 0,
 -1,
 15,
 -1,
 30,
 -1,
 0,
 6,
 0,
 -1,
 -1,
 0,
 -1,
 1,
 3,
 0,
 0,
 0,
 -1,
 -1,
 -1,
 -1,
 0,
 27,
 -1,
 -1,
 0,
 5,
 0,
 1,
 1,
 -1,
 0,
 -1,
 0,
 5,
 3,
 0,
 -1,
 -1,
 4,
 7,
 -1,
 0,
 0,
 -1,
 4,
 0,
 0,
 0,
 2,
 -1,
 -1,
 -1,
 5,
 -1,
 9,
 18,
 -1,
 0,
 7,
 0,
 0,
 17,
 -1,
 -1,
 -1,
 2,
 0,
 6,
 0,
 -1,
 -1,
 -1,
 

In [11]:
topic_model.update_topics(docs, n_gram_range=(1, 3))

In [23]:
topic_model.get_topic(7)[:10]

[('crown', 0.016020017409080826),
 ('casino', 0.00875838828382906),
 ('packer', 0.0064031258234296924),
 ('crowns', 0.0061657968896585685),
 ('resorts', 0.004229219935083247),
 ('melco', 0.0038440084615654997),
 ('macau', 0.0036332715367634006),
 ('gaming', 0.003405275864480119),
 ('crown resorts', 0.0032401175782722966),
 ('packers', 0.003040038879040304)]

In [28]:
from umap import UMAP
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

In [30]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
fig3 = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig3.write_html("BERTopic_viz3.html")

### Calculate Coherence Score

In [4]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [13]:
#topics= topic_model._map_predictions(topic_model.hdbscan_model.labels_)
topics = topic_model.topics_

[5,
 71,
 237,
 378,
 117,
 -1,
 358,
 -1,
 -1,
 -1,
 522,
 2,
 99,
 20,
 35,
 5,
 -1,
 -1,
 117,
 -1,
 27,
 593,
 185,
 543,
 149,
 6,
 435,
 334,
 182,
 20,
 56,
 344,
 -1,
 319,
 185,
 260,
 136,
 586,
 -1,
 91,
 -1,
 528,
 79,
 -1,
 80,
 91,
 561,
 445,
 -1,
 -1,
 112,
 -1,
 168,
 487,
 -1,
 352,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 139,
 589,
 -1,
 80,
 -1,
 452,
 413,
 10,
 319,
 303,
 -1,
 168,
 5,
 275,
 196,
 -1,
 -1,
 -1,
 10,
 0,
 -1,
 283,
 2,
 99,
 31,
 271,
 -1,
 -1,
 444,
 55,
 548,
 -1,
 548,
 22,
 325,
 354,
 175,
 82,
 -1,
 205,
 10,
 45,
 -1,
 80,
 82,
 -1,
 -1,
 274,
 65,
 -1,
 65,
 63,
 356,
 288,
 436,
 -1,
 166,
 115,
 -1,
 228,
 72,
 130,
 -1,
 102,
 -1,
 80,
 130,
 319,
 6,
 42,
 212,
 23,
 -1,
 31,
 -1,
 -1,
 182,
 559,
 21,
 35,
 -1,
 -1,
 -1,
 225,
 -1,
 -1,
 -1,
 392,
 95,
 147,
 -1,
 415,
 -1,
 185,
 -1,
 5,
 73,
 587,
 -1,
 -1,
 548,
 -1,
 522,
 168,
 31,
 354,
 23,
 -1,
 -1,
 -1,
 -1,
 65,
 103,
 -1,
 -1,
 31,
 0,
 35,
 2,
 99,
 -1,
 65,
 -1,
 130,
 0,
 1

In [None]:
# Wenqi's version of CV score
cv = topic_model.vectorizer_model
X = cv.fit_transform(docs)
doc_tokens = [text.split(" ") for text in docs]

id2word = corpora.Dictionary(doc_tokens)
texts = doc_tokens
corpus = [id2word.doc2bow(text) for text in texts]

topic_words = []
for i in range(len(topic_model.get_topic_freq())-1):
    interim = []
    interim = [t[0] for t in topic_model.get_topic(i)]
    topic_words.append(interim)

coherence_model = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
coherence_model.get_coherence()

In [15]:
# Official version of coherence score
# Preprocess Documents
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
# words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 #coherence='u_mass')
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

In [12]:
coherence # u_mass

-1.156293251998221

In [16]:
coherence # c_v

0.8227582065434376