In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
from nltk.corpus import stopwords
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import pandas as pd

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ttonny0326/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ttonny0326/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ttonny0326/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ttonny0326/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
news_data = pd.read_excel('/Users/ttonny0326/Data_Project/Topic_Modelling/News/Bloom_Berg_News.xlsx')

In [4]:

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator)

nltk_stopwords = set(stopwords.words('english'))
my_stop_words = text.ENGLISH_STOP_WORDS.union(nltk_stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Lemmatize the text
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stop words
    tokens = [token for token in tokens if token not in my_stop_words]
    
    # Remove punctuation and non-alphabetic tokens
    tokens = [token for token in tokens if token.isalpha()]
    
    return tokens

# Apply preprocessing
news_data['processed_content'] = news_data['Content'].apply(preprocess_text)


In [5]:
news_data.head()

Unnamed: 0,Title,Sub_Title,Total_title,Author,Posted_Time,Content,processed_content
0,Xi Urges Open Supply Chains After Curb on Key ...,,Xi Urges Open Supply Chains After Curb on Key ...,Bloomberg News,2023/7/5T00:00:05.250Z,Chinese leader Xi Jinping called on nations to...,"[chinese, leader, xi, jinping, called, nation,..."
1,Xi Urges Open Supply Chains After Curb on Key ...,,Xi Urges Open Supply Chains After Curb on Key ...,Bloomberg News,2023/7/5T00:00:05.250Z,Chinese leader Xi Jinping called on nations to...,"[chinese, leader, xi, jinping, called, nation,..."
2,China’s Chip-Metals Producers Surge on Export ...,,China’s Chip-Metals Producers Surge on Export ...,Bloomberg News,2023/7/4T00:00:05.250Z,The shares of Chinese metals producers surged ...,"[share, chinese, metal, producer, surged, beij..."
3,What Are Gallium and Germanium? Niche Metals H...,,What Are Gallium and Germanium? Niche Metals H...,Archie Hunter+Follow,2023/7/3T00:00:05.250Z,China is clamping down on exports of two obscu...,"[china, clamping, export, obscure, crucial, me..."
4,Xi’s Metal Curbs Risk Backfiring as G-7 Seeks ...,,Xi’s Metal Curbs Risk Backfiring as G-7 Seeks ...,Bloomberg News,2023/7/4T00:00:05.250Z,China’s decision to control the export of two ...,"[china, decision, control, export, key, metal,..."


In [6]:
# Rejoin the processed tokens to create processed text
news_data['processed_text'] = news_data['processed_content'].str.join(' ')

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Fit and transform the processed titles
dtm = vectorizer.fit_transform(news_data['processed_text'])

# Convert to pandas DataFrame for better visual
dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

# Display the document-term matrix
dtm_df.head()



Unnamed: 0,aaa,aaron,ab,abadia,abandoned,abandoning,abate,abated,abating,abbas,...,zinc,zloty,zoe,zoltan,zone,zoopla,zuckerberg,zuma,zunyi,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Define the number of topics
n_topics = 10

# Initialize LDA model
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)

# Fit the model to the data
lda_model.fit(dtm)

# For each topic, print the top 15 most representative words
for index, topic in enumerate(lda_model.components_):
    print(f"Top words for Topic #{index}")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print("\n")


Top words for Topic #0
['according', 'dollar', 'wa', 'financial', 'currency', 'company', 'government', 'debt', 'market', 'billion', 'ha', 'year', 'said', 'china', 'bank']


Top words for Topic #1
['big', 'people', 'right', 'thing', 'kind', 'year', 'lot', 'really', 'like', 'new', 'going', 'ha', 'know', 'think', 'wa']


Top words for Topic #2
['parliament', 'candidate', 'new', 'mitsotakis', 'country', 'coalition', 'sunak', 'vote', 'prime', 'said', 'ha', 'election', 'minister', 'party', 'government']


Top words for Topic #3
['june', 'hike', 'economist', 'bloomberg', 'policy', 'month', 'economy', 'ha', 'year', 'price', 'central', 'said', 'bank', 'rate', 'inflation']


Top words for Topic #4
['bloomberg', 'according', 'month', 'global', 'ha', 'data', 'billion', 'rate', 'bond', 'yield', 'investor', 'stock', 'said', 'market', 'year']


Top words for Topic #5
['european', 'wa', 'year', 'eu', 'wagner', 'new', 'president', 'gas', 'country', 'putin', 'russian', 'ukraine', 'ha', 'russia', 'said']

In [8]:
# Transform the data
lda_transformed = lda_model.transform(dtm)

# Create a DataFrame
topic_df = pd.DataFrame(lda_transformed, columns=[f"Topic_{i}" for i in range(n_topics)])

# Add a column with the dominant topic per document
topic_df["dominant_topic"] = topic_df.idxmax(axis=1)

# Add the dominant topic data to the original news data
news_data = pd.concat([news_data, topic_df["dominant_topic"]], axis=1)

# Display the first few rows of the updated news data
news_data.head()


Unnamed: 0,Title,Sub_Title,Total_title,Author,Posted_Time,Content,processed_content,processed_text,dominant_topic
0,Xi Urges Open Supply Chains After Curb on Key ...,,Xi Urges Open Supply Chains After Curb on Key ...,Bloomberg News,2023/7/5T00:00:05.250Z,Chinese leader Xi Jinping called on nations to...,"[chinese, leader, xi, jinping, called, nation,...",chinese leader xi jinping called nation spurn ...,Topic_6
1,Xi Urges Open Supply Chains After Curb on Key ...,,Xi Urges Open Supply Chains After Curb on Key ...,Bloomberg News,2023/7/5T00:00:05.250Z,Chinese leader Xi Jinping called on nations to...,"[chinese, leader, xi, jinping, called, nation,...",chinese leader xi jinping called nation spurn ...,Topic_6
2,China’s Chip-Metals Producers Surge on Export ...,,China’s Chip-Metals Producers Surge on Export ...,Bloomberg News,2023/7/4T00:00:05.250Z,The shares of Chinese metals producers surged ...,"[share, chinese, metal, producer, surged, beij...",share chinese metal producer surged beijing im...,Topic_6
3,What Are Gallium and Germanium? Niche Metals H...,,What Are Gallium and Germanium? Niche Metals H...,Archie Hunter+Follow,2023/7/3T00:00:05.250Z,China is clamping down on exports of two obscu...,"[china, clamping, export, obscure, crucial, me...",china clamping export obscure crucial metal es...,Topic_6
4,Xi’s Metal Curbs Risk Backfiring as G-7 Seeks ...,,Xi’s Metal Curbs Risk Backfiring as G-7 Seeks ...,Bloomberg News,2023/7/4T00:00:05.250Z,China’s decision to control the export of two ...,"[china, decision, control, export, key, metal,...",china decision control export key metal showed...,Topic_6


In [9]:
news_data.to_excel('LDA_bloomberg_result.xlsx', index=False)

In [10]:
# Define function to get top words for each topic
def get_top_words(lda_model, vectorizer, n_top_words):
    top_words = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words[f"Topic_{topic_idx}"] = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    return pd.DataFrame(top_words)

# Get top 15 words for each topic
n_top_words = 15
top_words_df = get_top_words(lda_model, vectorizer, n_top_words)

# Display the DataFrame
print(top_words_df)


       Topic_0 Topic_1     Topic_2    Topic_3    Topic_4    Topic_5   Topic_6  \
0         bank      wa  government  inflation       year       said     china   
1        china   think       party       rate     market     russia   chinese   
2         said    know    minister       bank       said         ha      said   
3         year      ha    election       said      stock    ukraine   beijing   
4           ha   going          ha    central   investor    russian        ha   
5      billion     new        said      price      yield      putin       new   
6       market    like       prime       year       bond    country        wa   
7         debt  really        vote         ha       rate        gas    export   
8   government     lot       sunak    economy    billion  president    yellen   
9      company    year   coalition      month       data        new  economic   
10    currency    kind     country     policy         ha     wagner      kong   
11   financial   thing  mits

In [25]:
top_words_df.to_excel('Top_Words_bloomberg.xlsx', index=False)