In [40]:
from bertopic import BERTopic
from datetime import datetime
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd

In [41]:
# Load the raw BBC news export; the path is relative to the notebook's folder.
df = pd.read_csv(filepath_or_buffer='../../Datasets/bbc_news.csv')

# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819 entries, 0 to 33818
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        33819 non-null  object
 1   pubDate      33819 non-null  object
 2   guid         33819 non-null  object
 3   link         33819 non-null  object
 4   description  33819 non-null  object
dtypes: object(5)
memory usage: 1.3+ MB


In [42]:
%%capture
# One document per article: headline and summary joined by a single space.
docs = df['title'] + ' ' + df['description']

# c-TF-IDF weighting with the library's frequent-word reduction switched on.
ctfidfModel = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

# min_topic_size=100 requests topics of at least 100 documents each.
topicModel = BERTopic(
    min_topic_size=100,
    ctfidf_model=ctfidfModel,
)

# NOTE(review): no random seed is set anywhere in this cell, so topic ids and
# assignments may differ from run to run -- confirm whether that is acceptable.
# `topics` holds one topic id per document; `probs` is assigned row-wise to
# the frame in a later cell.
topics, probs = topicModel.fit_transform(docs)

In [43]:
def getTopicName(topicNumber, model=None):
    """Build a short label for a topic: its number followed by its top terms.

    Parameters
    ----------
    topicNumber : int
        Id of the topic to label.
    model : optional
        Any object exposing ``get_topic(topicNumber)``. When omitted, the
        notebook-level ``topicModel`` is used, so existing one-argument calls
        (e.g. ``Series.apply(getTopicName)``) behave as before.

    Returns
    -------
    str
        ``"<topicNumber> <term> <term> <term>"`` built from the first element
        of up to three leading entries returned by ``get_topic``. If
        ``get_topic`` yields nothing usable (BERTopic returns ``False`` for an
        id it does not know), only ``"<topicNumber>"`` is returned.
    """
    if model is None:
        model = topicModel

    topicTerms = model.get_topic(topicNumber)

    # Guard against the "unknown topic" sentinel (False) and empty term lists;
    # slicing False would otherwise raise a TypeError.
    if not topicTerms:
        return str(topicNumber)

    topTerms = [entry[0] for entry in topicTerms[:3]]
    return str(topicNumber) + ' ' + ' '.join(topTerms)

In [None]:
# Parse the publication strings (weekday, day, month name, year, time, zone
# name) into pandas timestamps.
df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %Z')

# Month bucket: first day of the publication month at midnight, without a
# timezone. Vectorized replacement for a per-row datetime(year, month, 1).
df['shortPubDate'] = (
    df['pubDate'].dt.tz_localize(None).dt.to_period('M').dt.to_timestamp()
)

# Attach the model outputs row by row (same order as the documents fitted).
df['topics'] = topics

# Label each distinct topic once and map the labels back onto the rows,
# instead of querying the model once per article.
df['topicName'] = df['topics'].map(
    {topicId: getTopicName(topicId) for topicId in df['topics'].unique().tolist()}
)

df['probs'] = probs
df['docs'] = docs

In [57]:
# Inclusive lower bound of the analysis window (1 March 2022). Despite its
# name, rows on or AFTER this date are the ones that are kept.
beforeDate = pd.Timestamp(year=2022, month=3, day=1)

df = (
    df.loc[df['shortPubDate'].ge(beforeDate)]
    .sort_values(by='shortPubDate')
    # Drop rows labelled -1 (presumably BERTopic's outlier bucket -- confirm).
    .loc[lambda frame: frame['topics'].ne(-1)]
)

# Summarise what is left after filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22144 entries, 56 to 33048
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   title         22144 non-null  object             
 1   pubDate       22144 non-null  datetime64[ns, UTC]
 2   guid          22144 non-null  object             
 3   link          22144 non-null  object             
 4   description   22144 non-null  object             
 5   shortPubDate  22144 non-null  datetime64[ns]     
 6   topics        22144 non-null  int64              
 7   topicName     22144 non-null  object             
 8   probs         22144 non-null  float64            
 9   docs          22144 non-null  object             
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 1.9+ MB


In [59]:
# Persist the filtered, topic-annotated frame; the row index is not written.
df.to_csv(
    path_or_buf='../../Datasets/bbc_news_preprocessed.csv',
    index=False,
)