## Keyword extraction using TFIDF and Topic modeling using LDA


In [9]:
import pandas as pd

from gensim import models, corpora
from gensim.models import CoherenceModel

In [2]:
df = pd.read_csv("datasets/PDF_text_data.csv")
df.drop(['text'], axis=1, inplace=True)
df.head()

Unnamed: 0,processed_text
0,understand random forestthis article publish d...
1,2 boost – combine weak learner strong learner ...
2,model model 01 model 02 model 03 obtain bootst...
3,step involve random forest algorithm step 1 ra...
4,important features random forest1 diversity- a...


### Keyword extraction using TFIDF:

In [3]:
documents = df['processed_text'].apply(lambda x: x.split()).tolist()    
documents

[['understand',
  'random',
  'forestthis',
  'article',
  'publish',
  'data',
  'science',
  'blogathonintroductionrandom',
  'forest',
  'supervised',
  'machine',
  'learning',
  'algorithm',
  'use',
  'widely',
  'classification',
  'andregression',
  'problem',
  'build',
  'decision',
  'tree',
  'different',
  'sample',
  'majority',
  'vote',
  'forclassification',
  'average',
  'case',
  'regression',
  'important',
  'feature',
  'random',
  'forest',
  'algorithm',
  'handle',
  'datum',
  'setcontaine',
  'continuous',
  'variable',
  'case',
  'regression',
  'categorical',
  'variable',
  'case',
  'ofclassification',
  'perform',
  'result',
  'classification',
  'problem',
  'real',
  'life',
  'analogylet',
  'dive',
  'real',
  'life',
  'analogy',
  'understand',
  'concept',
  'far',
  'student',
  'x',
  'want',
  'choose',
  'acourse',
  '10',
  '2',
  'confused',
  'choice',
  'course',
  'base',
  'skill',
  'set',
  'decidesto',
  'consult',
  'people',
  'l

In [4]:
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 2),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 1),
  (27, 5),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 2),
  (38, 1),
  (39, 3),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 5),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 2),
  (58, 1),
  (59, 1),
  (60, 2),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 3),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 4),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 1),
  (75, 5),
  (76, 2),
  (77, 2),
  (78, 1),
  (79, 1),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 2),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

In [5]:
# Perform keyword extraction using TF-IDF
tfidf = models.TfidfModel(corpus)

keyword_threshold = 0.2
keywords = []

for doc in corpus:
    doc_tfidf = tfidf[doc]
    doc_keywords = [(dictionary[word_id], tfidf_value) for word_id, tfidf_value in doc_tfidf if tfidf_value > keyword_threshold]
    keywords.append(doc_keywords)
    
keywords 

[[('case', 0.24939118224988546),
  ('course', 0.29970878542280766),
  ('people', 0.3325215763331806)],
 [('combine', 0.23639560580249785),
  ('replacement', 0.23639560580249785),
  ('aggregation', 0.21856393968340285),
  ('boost', 0.21856393968340285),
  ('bootstrap', 0.3048343457420213),
  ('know', 0.21856393968340285),
  ('learner', 0.21856393968340285)],
 [('model', 0.27355967595212805),
  ('emoji', 0.5884207079986862),
  ('happy', 0.3922804719991241),
  ('obtain', 0.3922804719991241)],
 [('decision', 0.21127590791103987),
  ('tree', 0.2104535128544048),
  ('figure', 0.25843212331870363),
  ('output', 0.25254421542528577),
  ('step', 0.4307202055311727),
  ('basket', 0.23893821046941827),
  ('construct', 0.23893821046941827),
  ('fruit', 0.23893821046941827),
  ('n', 0.23893821046941827)],
 [('decision', 0.28395207820677065), ('tree', 0.32684517946777325)],
 [('variable', 0.21650194078258567),
  ('x', 0.2098685243461261),
  ('split', 0.2098685243461261),
  ('import', 0.3002540252895

In [6]:
for idx, doc_keywords in enumerate(keywords):
    print(f"Document {idx+1} keywords:")
    
    for keyword, tfidf_value in doc_keywords:
        print(keyword, "-", tfidf_value)
    print("-------------------------------------")

Document 1 keywords:
case - 0.24939118224988546
course - 0.29970878542280766
people - 0.3325215763331806
-------------------------------------
Document 2 keywords:
combine - 0.23639560580249785
replacement - 0.23639560580249785
aggregation - 0.21856393968340285
boost - 0.21856393968340285
bootstrap - 0.3048343457420213
know - 0.21856393968340285
learner - 0.21856393968340285
-------------------------------------
Document 3 keywords:
model - 0.27355967595212805
emoji - 0.5884207079986862
happy - 0.3922804719991241
obtain - 0.3922804719991241
-------------------------------------
Document 4 keywords:
decision - 0.21127590791103987
tree - 0.2104535128544048
figure - 0.25843212331870363
output - 0.25254421542528577
step - 0.4307202055311727
basket - 0.23893821046941827
construct - 0.23893821046941827
fruit - 0.23893821046941827
n - 0.23893821046941827
-------------------------------------
Document 5 keywords:
decision - 0.28395207820677065
tree - 0.32684517946777325
-----------------------

### Topic modeling using Latent Dirichlet Allocation (LDA)

In [7]:
required_num_topics = 5
lda_model = models.LdaModel(corpus, num_topics=required_num_topics, id2word=dictionary, passes=10)

print("LDA Topics:-\n")

for idx in range(required_num_topics):
    print(f"Topic {idx+1}:")
    topic_keywords = lda_model.show_topic(idx)
    
    for keyword, prob in topic_keywords:
        print(keyword, "-", prob)
    print("-------------------------------------")

LDA Topics:-

Topic 1:
import - 0.032486834
use - 0.018063974
let - 0.018037416
random - 0.015177302
datum - 0.015157547
forest - 0.0122818565
variable - 0.012209837
train - 0.0122062275
feature - 0.009333784
sample - 0.009323869
-------------------------------------
Topic 2:
sample - 0.029253094
bootstrap - 0.024488205
model - 0.02448809
random - 0.01514961
datum - 0.015113642
combine - 0.015078198
replacement - 0.015063466
forest - 0.010438185
output - 0.010421505
majority - 0.010404459
-------------------------------------
Topic 3:
random - 0.020195553
forest - 0.020065168
course - 0.01990265
use - 0.016189571
people - 0.016059281
model - 0.012311964
article - 0.012288909
understand - 0.012254553
case - 0.012240262
ensemble - 0.012239718
-------------------------------------
Topic 4:
tree - 0.03848133
random - 0.032764364
decision - 0.028963197
forest - 0.025160909
datum - 0.021384787
model - 0.019505922
use - 0.0118291285
majority - 0.009939866
– - 0.009908254
consider - 0.00989917

In [8]:
# Coherence score for the LDA model

coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score:", coherence_score)

Coherence Score: 0.4479193717421756
