<a href="https://colab.research.google.com/github/SRIKAR-SILUVERI/NLP/blob/main/2403A52240_ASSIGNMENT(6_3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

** LDA with Sample Data and BoW**

# **LDA-Data.xlsx**

**1. PREPARE CORPUS**

In [None]:
import pandas as pd

**2.LOAD TEXT DATA**

In [None]:
# Load Excel file
df = pd.read_excel("LDA-Data.xlsx")

# View first rows
df.head()

Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumra took 5 wicket in a match
3,Congress form state government


In [None]:
corpus = df['News'].astype(str).tolist()

**3. TEXT PREPROCESSING**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**PREPROCESSING FUNCTION**

In [None]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove numbers & special characters
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Stopword removal & Lemmatization
    tokens = [lemmatizer.lemmatize(word)
              for word in tokens if word not in stop_words]

    # Rejoin words
    return " ".join(tokens)

APPLY PREPROCESSING

In [None]:
nltk.download('punkt_tab')
clean_corpus = [preprocess_text(doc) for doc in corpus]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**4.BAG OF WORDS**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.9, min_df=2)
bow = vectorizer.fit_transform(clean_corpus)

**5. BOW AS DATAFRAME**

In [None]:
bow_df = pd.DataFrame(
    bow.toarray(),
    columns=vectorizer.get_feature_names_out()
)

bow_df.head()

Unnamed: 0,match
0,1
1,0
2,1
3,0


**6.APPLY LDA**

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(
    n_components=3,    # number of topics
    random_state=42
)

lda_model.fit(bow)

**7.IDENTIFY WORDS FOR EACH TOPIC**

In [None]:
def display_topics(model, feature_names, num_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]
        ))

In [None]:
display_topics(lda_model, vectorizer.get_feature_names_out(), 10)


Topic 1:
match

Topic 2:
match

Topic 3:
match


**8.TOPIC MODELLING**

In [None]:
topic_distribution = lda_model.transform(bow)

topic_df = pd.DataFrame(
    topic_distribution,
    columns=[f"Topic {i+1}" for i in range(lda_model.n_components)]
)

topic_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333


**LDA with Kaggle Data and BoW**

# **ArXiv paper abstracts dataset from Kaggle.**

## 1. PREPARE CORPUS

In [None]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

**2.LOAD TEXT DATA**

In [None]:
# Replace with your actual file path
df = pd.read_csv("arxiv_data.csv", engine='python', on_bad_lines='warn')
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [None]:
df.columns

Index(['titles', 'summaries', 'terms'], dtype='object')

**3. TEXT PREPROCESSING**

*Download NLTK Dependencies*

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

*Preprocessing Function*

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove special characters & digits
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords + lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Rejoin
    return " ".join(tokens)

*Apply Preprocessing*

In [None]:
clean_abstracts = df['summaries'].astype(str).apply(preprocess_text)

**4. BAG OF WORDS**

In [None]:
vectorizer = CountVectorizer(
    max_df=0.9,    # ignore extremely common words
    min_df=5       # ignore rare words
)

bow_matrix = vectorizer.fit_transform(clean_abstracts)
print(bow_matrix)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 176222 stored elements and shape (2384, 3800)>
  Coords	Values
  (0, 3293)	5
  (0, 2087)	4
  (0, 2378)	2
  (0, 3766)	1
  (0, 3641)	2
  (0, 3434)	1
  (0, 1761)	1
  (0, 902)	1
  (0, 1662)	2
  (0, 2451)	1
  (0, 2993)	1
  (0, 3251)	2
  (0, 325)	1
  (0, 2056)	2
  (0, 3496)	1
  (0, 2928)	1
  (0, 3177)	1
  (0, 1352)	1
  (0, 186)	2
  (0, 286)	1
  (0, 1054)	1
  (0, 2989)	1
  (0, 2272)	1
  (0, 2813)	1
  (0, 2069)	2
  :	:
  (2382, 2868)	1
  (2382, 2346)	1
  (2382, 568)	1
  (2382, 523)	1
  (2382, 2138)	4
  (2382, 2397)	1
  (2382, 219)	1
  (2382, 2059)	1
  (2382, 220)	1
  (2382, 1793)	1
  (2382, 3121)	3
  (2382, 504)	1
  (2382, 3343)	1
  (2382, 2752)	1
  (2382, 1790)	1
  (2382, 56)	1
  (2382, 924)	1
  (2382, 329)	6
  (2382, 3181)	1
  (2382, 165)	1
  (2382, 304)	1
  (2382, 2304)	1
  (2382, 2985)	5
  (2382, 2186)	9
  (2383, 2320)	1


**5. BOW AS DATAFRAME**

In [None]:
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)

bow_df.head()

Unnamed: 0,abdominal,ability,ablation,able,abnormal,abnormality,absence,absolute,abstract,abstraction,...,xray,xrays,year,yet,yield,yielded,yielding,youtube,zero,zeroshot
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


**6. APPLY LDA**

In [None]:
lda_model = LatentDirichletAllocation(
    n_components = 10,
    random_state = 42,
    learning_method = "batch"
)

lda_model.fit(bow_matrix)

**7. IDENTIFY WORDS FOR EACH TOPIC**

In [None]:
def display_topics(model, feature_names, no_top_words):
    for i, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        print(f"Topic {i+1}: {', '.join(top_words)}")

# Display top 15 words per topic
display_topics(lda_model, vectorizer.get_feature_names_out(), 15)

Topic 1: graph, representation, learning, node, network, method, information, structure, propose, embedding, feature, model, task, neural, proposed
Topic 2: segmentation, image, network, medical, deep, method, neural, model, convolutional, performance, result, training, proposed, learning, architecture
Topic 3: learning, representation, sample, task, contrastive, model, negative, policy, state, loss, method, rl, agent, latent, reinforcement
Topic 4: segmentation, image, object, method, boundary, region, pixel, semantic, proposed, model, label, annotation, mask, contour, result
Topic 5: image, segmentation, model, method, data, learning, domain, task, training, performance, feature, propose, network, approach, label
Topic 6: image, segmentation, algorithm, clustering, method, proposed, result, based, cluster, paper, using, region, used, color, fuzzy
Topic 7: learning, representation, data, model, task, method, training, deep, approach, show, framework, datasets, unsupervised, network, p

**8. TOPIC MODELLING**

In [None]:
topic_dist = lda_model.transform(bow_matrix)

topic_df = pd.DataFrame(
    topic_dist,
    columns=[f"Topic {i+1}" for i in range(lda_model.n_components)]
)

topic_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
0,0.001282,0.001282,0.001282,0.174077,0.001282,0.001282,0.001282,0.001282,0.41453,0.402417
1,0.071707,0.400928,0.000813,0.000813,0.000813,0.01506,0.425921,0.082318,0.000813,0.000813
2,0.00069,0.00069,0.00069,0.14633,0.715236,0.00069,0.00069,0.00069,0.00069,0.133605
3,0.000834,0.000834,0.000834,0.000833,0.903061,0.000834,0.000834,0.000834,0.09027,0.000834
4,0.000637,0.057624,0.000637,0.104765,0.000637,0.000637,0.191856,0.130012,0.117911,0.395284


# **FOR NMF**

**NMF with Sample Data and BoW**

`**SAMPLE DATA:** Apply NMF,Identify words for each topic, topic modeling `

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
df = pd.read_excel("/content/LDA-Data.xlsx")
texts = df['News'].dropna()

`APPLY NMF`

In [None]:
nmf_model = NMF(n_components=5, random_state=42)
nmf_model.fit(tfidf_matrix)

`Identify words for each topic`

In [None]:
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(nmf_model.components_):
    print(f"\nTopic {topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))


Topic 1:
in match

Topic 2:
match in

Topic 3:
match in

Topic 4:
in match

Topic 5:
match in


`Topic modeling `

In [None]:
W = nmf_model.transform(tfidf_matrix)
df['Dominant_Topic'] = W.argmax(axis=1)
df[['News', 'Dominant_Topic']].head()

Unnamed: 0,News,Dominant_Topic
0,Virat scored century in match,0
1,BJP won in elections,0
2,Bumra took 5 wicket in a match,0
3,Congress form state government,0


**NMF with Kaggle Data and BoW**

`**kaggle dataset: **apply nmf,identify words for each topic,Topic modelling`

APPLY NMF

In [None]:
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.9,    # ignore extremely common words
    min_df=5       # ignore rare words
)
tfidf_matrix_kaggle = tfidf_vectorizer.fit_transform(clean_abstracts)

nmf_model = NMF(n_components=6, random_state=42)
W = nmf_model.fit_transform(tfidf_matrix_kaggle)   # Document–Topic matrix
H = nmf_model.components_                   # Topic–Word matrix

Identify words for each topic

In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(H):
    print(f"\nTopic {topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))


Topic 1:
image segmentation algorithm region method object based clustering pixel proposed

Topic 2:
representation learning contrastive data selfsupervised task video model unsupervised learn

Topic 3:
graph node representation embedding gnns learning structure network edge link

Topic 4:
segmentation medical image training annotation data deep label network method

Topic 5:
network feature semantic architecture segmentation convolutional layer neural map module

Topic 6:
domain adaptation target source uda data shift generalization model method


Topic modeling

In [None]:
df = pd.read_csv("arxiv_data.csv", engine='python', on_bad_lines='warn')
df = df.loc[clean_abstracts.index]
df['Dominant_Topic'] = W.argmax(axis=1)
df[['summaries', 'Dominant_Topic']].head()

Unnamed: 0,summaries,Dominant_Topic
0,Stereo matching is one of the widely used tech...,4
1,The recent advancements in artificial intellig...,3
2,"In this paper, we proposed a novel mutual cons...",3
3,Consistency training has proven to be an advan...,3
4,"To ensure safety in automated driving, the cor...",0


# **NMF with Sample Data and TFIDF**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
df_sample = pd.read_excel("/content/LDA-Data.xlsx")
texts_sample = df_sample['News'].dropna()

TF-IDF Vectorization

In [None]:
tfidf_sample = TfidfVectorizer(
    stop_words='english',
    max_features=1000
)

tfidf_matrix_sample = tfidf_sample.fit_transform(texts_sample)

Apply NMF (Topic Modeling)

In [None]:
nmf_sample = NMF(n_components=5, random_state=42)

W_sample = nmf_sample.fit_transform(tfidf_matrix_sample)  # Document–Topic
H_sample = nmf_sample.components_                          # Topic–Word

Identify words for each topic

In [None]:
feature_names_sample = tfidf_sample.get_feature_names_out()

for topic_idx, topic in enumerate(H_sample):
    print(f"\nTopic {topic_idx + 1}:")
    print(" ".join([feature_names_sample[i]
                    for i in topic.argsort()[:-11:-1]]))


Topic 1:
state government form congress wicket won scored took virat match

Topic 2:
bumra wicket took match won virat scored state form government

Topic 3:
took wicket bumra match won virat scored state form government

Topic 4:
scored virat century match wicket won state took form government

Topic 5:
won elections bjp took virat wicket scored state government match


Assign Dominant Topic to Each Document

In [None]:
df_sample = df_sample.loc[texts_sample.index]
df_sample['Dominant_Topic'] = W_sample.argmax(axis=1)

df_sample[['News', 'Dominant_Topic']].head()

Unnamed: 0,News,Dominant_Topic
0,Virat scored century in match,3
1,BJP won in elections,4
2,Bumra took 5 wicket in a match,1
3,Congress form state government,0


# **NMF with Kaggle Data and TFIDF**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
df_kaggle = pd.read_csv("/content/arxiv_data.csv", engine='python', on_bad_lines='warn')
texts_kaggle = df_kaggle['summaries'].dropna()

TF-IDF Vectorization

In [None]:
tfidf_kaggle = TfidfVectorizer(
    stop_words='english',
    max_features=2000
)

tfidf_matrix_kaggle = tfidf_kaggle.fit_transform(texts_kaggle)

Apply NMF (Topic Modeling)

In [None]:
nmf_kaggle = NMF(n_components=6, random_state=42)

W_kaggle = nmf_kaggle.fit_transform(tfidf_matrix_kaggle)  # Document–Topic
H_kaggle = nmf_kaggle.components_                         # Topic–Word



Identify Words for Each Topic

In [None]:
feature_names_kaggle = tfidf_kaggle.get_feature_names_out()

for topic_idx, topic in enumerate(H_kaggle):
    print(f"\nTopic {topic_idx + 1}:")
    print(" ".join([feature_names_kaggle[i]
                    for i in topic.argsort()[:-11:-1]]))


Topic 1:
data learning models training model domain time deep series adversarial

Topic 2:
image segmentation images resolution method semantic color network quality proposed

Topic 3:
graph graphs node gnns nodes networks gnn neural representation structure

Topic 4:
policy learning rl reinforcement agent algorithm algorithms reward agents function

Topic 5:
3d point object detection cloud depth clouds objects 2d lidar

Topic 6:
attention video temporal features visual transformer feature model network information


Assign Dominant Topic to Each Document

In [None]:
df_kaggle = df_kaggle.loc[texts_kaggle.index]
df_kaggle['Dominant_Topic'] = W_kaggle.argmax(axis=1)

df_kaggle[['summaries', 'Dominant_Topic']].head()

Unnamed: 0,summaries,Dominant_Topic
0,Stereo matching is one of the widely used tech...,1
1,The recent advancements in artificial intellig...,1
2,"In this paper, we proposed a novel mutual cons...",1
3,Consistency training has proven to be an advan...,1
4,"To ensure safety in automated driving, the cor...",1
