## Import Libraries

In [1]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

## Load Dataset

In [3]:
# Read the BBC news CSV; the relative path is resolved against the current
# working directory.
csv_path = "bbc_news.csv"
df = pd.read_csv(csv_path)

# Quick sanity check: preview the first rows, then list the column labels.
print(df.head())
print("Columns:", df.columns)

                                               title  \
0  Ukraine: Angry Zelensky vows to punish Russian...   
1  War in Ukraine: Taking cover in a town under a...   
2         Ukraine war 'catastrophic for global food'   
3  Manchester Arena bombing: Saffie Roussos's par...   
4  Ukraine conflict: Oil price soars to highest l...   

                         pubDate  \
0  Mon, 07 Mar 2022 08:01:56 GMT   
1  Sun, 06 Mar 2022 22:49:58 GMT   
2  Mon, 07 Mar 2022 00:14:42 GMT   
3  Mon, 07 Mar 2022 00:05:40 GMT   
4  Mon, 07 Mar 2022 08:15:53 GMT   

                                               guid  \
0  https://www.bbc.co.uk/news/world-europe-60638042   
1  https://www.bbc.co.uk/news/world-europe-60641873   
2      https://www.bbc.co.uk/news/business-60623941   
3            https://www.bbc.co.uk/news/uk-60579079   
4      https://www.bbc.co.uk/news/business-60642786   

                                                link  \
0  https://www.bbc.co.uk/news/world-europe-606380...   
1  

## Download stopwords

In [11]:
# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download("stopwords")

# `stopwords` is imported with the rest of the imports in the first cell.
# The English list is stored as a set so that the per-token membership test
# in clean_text() is a constant-time lookup.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rachi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing function

In [12]:
def clean_text(text, stop_word_set=None):
    """Normalise one document into a space-separated string of kept tokens.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    split on whitespace, and drop tokens found in the stop-word set.

    Parameters
    ----------
    text : object
        The raw document. Missing values (None, NaN, pd.NA, NaT) produce an
        empty string instead of the literal token "nan"/"none". Any other
        non-string value is converted with ``str()``.
    stop_word_set : collection of str, optional
        Tokens to remove. When omitted, the notebook-level ``stop_words``
        set is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if not isinstance(text, str):
        # A missing description must not turn into the word "nan" and leak
        # into the vocabulary, so short-circuit before str() is applied.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    if stop_word_set is None:
        stop_word_set = stop_words

    # Characters are deleted (not replaced by a space), so "year-old"
    # becomes the single token "yearold".
    text = re.sub(r'[^a-z\s]', '', text.lower())
    return " ".join(w for w in text.split() if w not in stop_word_set)

## Apply preprocessing to the 'description' column

In [17]:
# Run clean_text over every description and keep the result in a new column
# next to the raw text.
df["clean_text"] = df["description"].map(clean_text)

## Convert Text → Numbers

In [14]:
# Bag-of-words counts over the cleaned text.
vectorizer = CountVectorizer(
    max_df=0.9,            # ignore terms present in more than 90% of documents
    min_df=2,              # ignore terms present in fewer than 2 documents
    stop_words="english",  # scikit-learn's built-in English stop-word list
)
# Learn the vocabulary and build the document-term count matrix in one step.
X = vectorizer.fit_transform(df["clean_text"])

## Apply LDA (Topic Modeling)

In [15]:
# LDA with five topics; random_state is pinned so repeated runs use the same
# seed.
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
)
# Fit on the document-term counts produced by the vectorizer.
lda_model.fit(X)

## Show Top Words per Topic

In [16]:
# Vocabulary terms, indexed the same way as the columns of each topic row.
words = vectorizer.get_feature_names_out()

for i, weights in enumerate(lda_model.components_):
    # argsort() is ascending, so the last ten positions are the ten heaviest
    # terms; they are printed from lowest to highest weight.
    heaviest = weights.argsort()[-10:]
    top_words = [words[idx] for idx in heaviest]
    print(f"Topic {i+1}: {', '.join(top_words)}")

Topic 1: new, people, bbc, election, government, say, minister, president, uk, says
Topic 2: year, test, years, uk, papers, people, england, bbc, new, says
Topic 3: united, womens, final, says, manchester, win, england, league, world, cup
Topic 4: russian, attack, killed, man, yearold, died, says, people, say, police
Topic 5: royal, new, years, grand, year, wales, world, king, open, says
