In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from gensim.models import LdaModel
from gensim.corpora import Dictionary

import warnings
warnings.filterwarnings("ignore")
import pandas as pd

In [8]:
def preprocess (textstring):
    """A simple preprocess function that takes a textstring as input and then
    tokenize it using word_tokenizer,
    removes stopwords,
    lowercase each tokens,
    removes non-alphabatic characters
    """
    
    stops = set(stopwords.words("english"))
    tokens = word_tokenize(textstring)
    return [
        token.lower() for token in tokens
        if token.isalpha() and token not in stops
    ]

In [11]:
data_path = "../Get Data/Data/all_news/all_news_2022_10_09.csv"

In [12]:
df = pd.read_csv(data_path,encoding="utf-8")

In [13]:
df.head(5)

Unnamed: 0,URLS,Headline,Article
0,/news/bangladesh/crime-justice/news/bangladesh...,"2 Bangladeshis killed by “BSF” along Satkhira,...",Two Bangladeshis have been shot dead along the...
1,/sports/football/news/militao-takes-real-top-l...,"Militao takes Real top of La Liga, Atletico sc...",Eder Militao's early goal earned Real Madrid a...
2,/business/global-economy/news/india-kicks-bank...,India kicks off bank divestments,The Indian government is looking to sell a 60....
3,/news/bangladesh/news/their-guardian-angel-313...,Their guardian angel,"Setara was a playful child, curious about the ..."
4,/news/bangladesh/accidents-fires/news/2-dead-w...,2 dead while working inside septic tank in Cox...,Two construction workers have died and another...


In [15]:
type(df['Article'])

pandas.core.series.Series

In [18]:
articles = []
for article in df['Article']:
    articles.append(preprocess(article))

In [19]:
articles[0]

['two',
 'bangladeshis',
 'shot',
 'dead',
 'along',
 'border',
 'india',
 'satkhira',
 'satkhira',
 'bangladeshi',
 'shot',
 'dead',
 'india',
 'border',
 'security',
 'force',
 'along',
 'kaliani',
 'border',
 'sadar',
 'upazila',
 'today',
 'october',
 'family',
 'however',
 'denied',
 'firing',
 'taken',
 'place',
 'around',
 'reports',
 'local',
 'deceased',
 'identified',
 'abu',
 'hasan',
 'dakshin',
 'kushkhali',
 'village',
 'sadar',
 'father',
 'haider',
 'ali',
 'sheikh',
 'said',
 'my',
 'son',
 'crossed',
 'border',
 'went',
 'india',
 'last',
 'night',
 'bsf',
 'personnel',
 'dubli',
 'camp',
 'india',
 'basirhat',
 'opposite',
 'kaliani',
 'border',
 'fired',
 'later',
 'someone',
 'brought',
 'bangladesh',
 'territory',
 'admitted',
 'satkhira',
 'sadar',
 'hospital',
 'around',
 'bgb',
 'commander',
 'lt',
 'col',
 'mohammad',
 'al',
 'mahmud',
 'said',
 'when',
 'bsf',
 'asked',
 'hasan',
 'death',
 'denied',
 'firing',
 'however',
 'letter',
 'flag',
 'meeting',
 'se

In [20]:
# Create a dictionary representation of the articles
dictionary = Dictionary(articles)

In [21]:
# Filtering infrequent of too frequent words
dictionary.filter_extremes(no_below=10,no_above=0.5)

In [22]:
len(dictionary)

257

In [28]:
corpus = [
    dictionary.doc2bow(article)
    for article in articles
]

In [30]:
len(corpus)

110

In [31]:
dictionary[0]

'al'

In [32]:
# Making an index to word-dictionary
temp = dictionary[0]
id2word = dictionary.id2token

In [33]:
id2word

{0: 'al',
 1: 'along',
 2: 'around',
 3: 'asked',
 4: 'bangladesh',
 5: 'came',
 6: 'death',
 7: 'director',
 8: 'family',
 9: 'force',
 10: 'found',
 11: 'given',
 12: 'hasan',
 13: 'hospital',
 14: 'however',
 15: 'identified',
 16: 'in',
 17: 'information',
 18: 'it',
 19: 'killed',
 20: 'know',
 21: 'last',
 22: 'later',
 23: 'local',
 24: 'look',
 25: 'man',
 26: 'meeting',
 27: 'night',
 28: 'october',
 29: 'officer',
 30: 'place',
 31: 'police',
 32: 'reports',
 33: 'security',
 34: 'seen',
 35: 'sheikh',
 36: 'side',
 37: 'station',
 38: 'take',
 39: 'taken',
 40: 'this',
 41: 'time',
 42: 'today',
 43: 'told',
 44: 'took',
 45: 'two',
 46: 'union',
 47: 'upazila',
 48: 'went',
 49: 'when',
 50: 'able',
 51: 'area',
 52: 'back',
 53: 'could',
 54: 'early',
 55: 'find',
 56: 'first',
 57: 'following',
 58: 'friday',
 59: 'get',
 60: 'half',
 61: 'help',
 62: 'higher',
 63: 'home',
 64: 'lead',
 65: 'league',
 66: 'long',
 67: 'make',
 68: 'new',
 69: 'next',
 70: 'one',
 71: 'ro

In [34]:
# train the topic model
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    iterations=400,
    num_topics=10
)

In [35]:
model.top_topics(corpus)

[([(0.03558791, 'league'),
   (0.028100094, 'two'),
   (0.025031075, 'would'),
   (0.019448943, 'i'),
   (0.018942153, 'tk'),
   (0.01517504, 'also'),
   (0.014437637, 'hours'),
   (0.014376664, 'it'),
   (0.014376618, 'us'),
   (0.014315946, 'bangladesh'),
   (0.013572684, 'business'),
   (0.0135447495, 'time'),
   (0.013164731, 'president'),
   (0.012915394, 'group'),
   (0.012266346, 'set'),
   (0.012019671, 'last'),
   (0.011745663, 'dhaka'),
   (0.011476818, 'new'),
   (0.011154065, 'around'),
   (0.011049236, 'and')],
  -1.5231625177414636),
 ([(0.028470874, 'hospital'),
   (0.020639904, 'tk'),
   (0.019282253, 'also'),
   (0.01741522, 'bangladesh'),
   (0.0149968695, 'two'),
   (0.014333311, 'three'),
   (0.013461171, 'new'),
   (0.013460116, 'we'),
   (0.013292181, 'dhaka'),
   (0.012827906, 'percent'),
   (0.012531609, 'i'),
   (0.012504477, 'last'),
   (0.011914824, 'day'),
   (0.011141631, 'process'),
   (0.011085078, 'around'),
   (0.010525288, 'health'),
   (0.00991102, 'd