In [2]:
from bertopic import BERTopic
from datetime import datetime
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd

In [6]:
# Load the BBC news CSV into `df` and print a column/dtype summary.
df = pd.read_csv(filepath_or_buffer='../../Datasets/bbc_news.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819 entries, 0 to 33818
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        33819 non-null  object
 1   pubDate      33819 non-null  object
 2   guid         33819 non-null  object
 3   link         33819 non-null  object
 4   description  33819 non-null  object
dtypes: object(5)
memory usage: 1.3+ MB


In [24]:
%%capture
ctfidfModel = ClassTfidfTransformer(reduce_frequent_words=True)
topicModel = BERTopic(min_topic_size = 100, ctfidf_model=ctfidfModel)
docs = df['title'] + ' ' + df['description']
topics, probs = topicModel.fit_transform(docs)

In [25]:
def convertDateFormat(dateStr):
    """Reformat a timestamp string as zero-padded month and year (``MM/YYYY``).

    Args:
        dateStr: Text matching ``'%a, %d %b %Y %H:%M:%S %Z'``,
            e.g. ``'Mon, 07 Mar 2022 08:01:56 GMT'``.

    Returns:
        The month and year of the parsed timestamp, e.g. ``'03/2022'``.

    Raises:
        ValueError: Propagated from ``datetime.strptime`` when ``dateStr``
            does not match the expected pattern.
    """
    inputPattern = '%a, %d %b %Y %H:%M:%S %Z'
    outputPattern = '%m/%Y'
    return datetime.strptime(dateStr, inputPattern).strftime(outputPattern)

In [26]:
def getTopicName(topicNumber):
    """Build a short display label for a topic of the notebook's ``topicModel``.

    Args:
        topicNumber: Topic id handed to ``topicModel.get_topic``.

    Returns:
        A string made of the topic number, a space, and the first element of
        each of the first three entries returned for that topic, joined by
        single spaces.
    """
    # Each entry is indexed with [0]; presumably entries are (term, score)
    # pairs -- verify against the BERTopic version in use.
    leadingEntries = topicModel.get_topic(topicNumber)[:3]
    keywords = ' '.join(entry[0] for entry in leadingEntries)
    return f"{topicNumber} {keywords}"

In [27]:
# Attach the derived columns to `df` in place; the assignment order below is
# the order in which the new columns appear in the frame.

# Publication timestamp reduced to month/year by convertDateFormat.
df['shortPubDate'] = df['pubDate'].map(convertDateFormat)

# Per-row topic id from the fitted model, plus a label built from that id.
df['topics'] = topics
df['topicName'] = df['topics'].map(getTopicName)

# Values returned alongside the topics by fit_transform, and the fitted text.
df['probs'] = probs
df['docs'] = docs

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819 entries, 0 to 33818
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         33819 non-null  object 
 1   pubDate       33819 non-null  object 
 2   guid          33819 non-null  object 
 3   link          33819 non-null  object 
 4   description   33819 non-null  object 
 5   shortPubDate  33819 non-null  object 
 6   topics        33819 non-null  int64  
 7   topicName     33819 non-null  object 
 8   probs         33819 non-null  float64
 9   docs          33819 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 2.6+ MB


In [28]:
# Write the enriched frame to CSV without the index column.
# NOTE(review): the input was read from '../../Datasets/' but this writes
# under './Datasets/' -- confirm the output directory is the intended one.
df.to_csv(path_or_buf='./Datasets/bbc_news_preprocessed.csv', index=False)