<a href="https://colab.research.google.com/github/Niharika9948/NLP/blob/main/2403A52234_Lab_Assignment_6_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. LDA with Sample Data and BoW**

In [2]:
import pandas as pd

# Load data (Excel or CSV)
df = pd.read_excel("/content/LDA-Data.xlsx")   # or pd.read_csv("data.csv")

# Select the text column as a list of strings. Missing cells become "" here:
# a plain astype(str) would turn NaN into the literal word "nan", which is
# three letters long and not a stopword, so it would end up in the vocabulary.
documents = ["" if pd.isna(value) else str(value) for value in df["News"]]


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below; NLTK reports already-present packages
# as up-to-date. 'punkt_tab' is on the list because the tokenizer raised an
# error until it was added.
for _resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# Shared by preprocess_text below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn one raw document into a space-separated string of lemmas.

    The text is lower-cased, every character other than a-z or whitespace is
    deleted, and the remainder is tokenized. Tokens that are stopwords or have
    two characters or fewer are discarded; the rest are lemmatized.

    Note: unwanted characters are deleted rather than replaced by a space, so
    words joined only by punctuation (for example a hyphen) fuse into a
    single token.
    """
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in word_tokenize(cleaned):
        if len(token) <= 2 or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

# Clean every document; the result stays index-aligned with `documents`.
processed_docs = list(map(preprocess_text, documents))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from the cleaned documents and build the
# document-by-term count matrix in a single call.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(processed_docs)


In [6]:
# Labelled dense view of the counts: one row per document, one column per term.
term_columns = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=term_columns)

print(bow_df.head())


   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [56]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract.
num_topics = 2

# random_state is pinned so repeated runs start from the same seed.
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42
)

# Fit on the bag-of-words counts; as the cell's last expression this also
# displays the fitted estimator.
lda_model.fit(bow_matrix)


In [57]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: object exposing ``components_``; each row is treated as one
            topic and must provide ``argsort()``.
        feature_names: sequence indexed by column position of
            ``components_`` to look up the term for each weight.
        num_top_words: number of terms to list per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"\nTopic {topic_number}:")
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:num_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print(", ".join(top_terms))

# List the ten highest-weighted vocabulary terms of each LDA topic.
display_topics(lda_model, vectorizer.get_feature_names_out(), num_top_words=10)



Topic 1:
network, detection, neural, object, adversarial, image, using, generative, model, video

Topic 2:
learning, image, segmentation, deep, reinforcement, graph, representation, network, using, via


In [58]:
# Topic mixture of every document: one row per document, one column per topic.
doc_topic_dist = lda_model.transform(bow_matrix)

topic_columns = [f"Topic {k}" for k in range(1, num_topics + 1)]
topic_df = pd.DataFrame(doc_topic_dist, columns=topic_columns)

print(topic_df.head())


    Topic 1   Topic 2
0  0.068178  0.931822
1  0.155052  0.844948
2  0.094403  0.905597
3  0.246500  0.753500
4  0.263268  0.736732


In [60]:
# Assign dominant topic to each document.
#
# The bag-of-words matrix and the LDA model are rebuilt here from
# processed_docs under sample_* names, so this cell does not rely on
# vectorizer / bow_matrix / lda_model still holding the sample-data versions
# (later sections rebind those names). processed_docs itself must still be
# the sample data.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

sample_vectorizer = CountVectorizer()
sample_bow_matrix = sample_vectorizer.fit_transform(processed_docs)

sample_num_topics = 2  # same topic count as the LDA cell for the sample data
sample_lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=sample_num_topics,
)
sample_lda_model.fit(sample_bow_matrix)

# Topic mixture per document, then the column index of the largest weight.
doc_topic_dist = sample_lda_model.transform(sample_bow_matrix)
dominant_topic = doc_topic_dist.argmax(axis=1)

# Pair every original sentence with its topic number (+1 so numbering starts at 1).
sentence_topics = {
    "Sentence": documents,
    "Assigned Topic": dominant_topic + 1,
}
sentence_topic_df = pd.DataFrame(sentence_topics)

print(sentence_topic_df)


                         Sentence  Assigned Topic
0   Virat scored century in match               2
1            BJP won in elections               1
2  Bumra took 5 wicket in a match               2
3  Congress form state government               1


**NMF with Sample Data and BoW**

In [10]:
import pandas as pd

# Load the given dataset
df = pd.read_excel("/content/LDA-Data.xlsx")

# Prepare corpus as a list of strings. Missing cells become "" rather than
# the literal word "nan" that astype(str) produces for NaN, which would
# otherwise pass preprocessing and enter the vocabulary as a term.
corpus = ["" if pd.isna(value) else str(value) for value in df['News']]


In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required resources (NLTK reports already-present packages as
# up-to-date). 'punkt_tab' is requested too: the first section of this
# notebook had to add it after the tokenizer raised an error, and this
# section should not depend on that earlier cell having run.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Return `text` as a space-joined string of lemmatized content words.

    Steps: lower-case, delete every character that is not a-z or whitespace,
    tokenize, drop stopwords and tokens of length <= 2, lemmatize the rest.
    Deleted characters are not replaced by a space, so punctuation-joined
    words merge into one token.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept = (
        token
        for token in word_tokenize(letters_only)
        if not (token in stop_words or len(token) <= 2)
    )
    return " ".join(map(lemmatizer.lemmatize, kept))

# Cleaned texts, index-aligned with `corpus`.
processed_corpus = list(map(preprocess_text, corpus))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the cleaned sample corpus (vocabulary learned here).
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(processed_corpus)


In [13]:
# Dense, labelled copy of the count matrix, printed in full.
term_columns = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(bow.toarray(), columns=term_columns)

print(bow_df)


   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [14]:
from sklearn.decomposition import NMF

# Number of NMF components (topics) to extract.
num_topics = 2

# random_state is pinned so repeated runs start from the same seed.
nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)

# Factorize the bag-of-words matrix; as the cell's last expression this also
# displays the fitted estimator.
nmf_model.fit(bow)


In [15]:
def display_topics(model, feature_names, top_words):
    """Print, for each topic, its `top_words` highest-weighted terms.

    Args:
        model: object with a ``components_`` attribute whose rows (one per
            topic) support ``argsort()``.
        feature_names: sequence mapping a column index to its term.
        top_words: number of terms to print per topic.
    """
    topic_number = 0
    for weights in model.components_:
        topic_number += 1
        print(f"\nTopic {topic_number}:")
        # Indices ordered from largest weight to smallest.
        descending = weights.argsort()[::-1]
        labels = (feature_names[pos] for pos in descending[:top_words])
        print(", ".join(labels))

# Five highest-weighted terms of each NMF topic.
display_topics(nmf_model, vectorizer.get_feature_names_out(), top_words=5)



Topic 1:
match, virat, took, scored, wicket

Topic 2:
state, form, congress, government, election


In [16]:
# Per-document topic weights from the fitted NMF model.
doc_topic_matrix = nmf_model.transform(bow)

topic_columns = [f"Topic {k}" for k in range(1, num_topics + 1)]
topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_columns)

print(topic_df)


        Topic 1   Topic 2
0  6.915205e-01  0.000000
1  2.809504e-07  0.000099
2  6.915205e-01  0.000000
3  0.000000e+00  1.090551


In [54]:
from sklearn.decomposition import NMF

# Re-fit NMF on this section's bag-of-words matrix so the model's feature
# count matches `bow`.
num_topics = 2  # same value as the NMF cell for the sample data
nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)
nmf_model.fit(bow)

# Assign dominant topic to each document: column index of the largest weight.
doc_topic_matrix = nmf_model.transform(bow)
dominant_topics = doc_topic_matrix.argmax(axis=1)

# Map each sentence to its topic. `corpus` is the list `bow` was built from in
# this section; the earlier version read `documents`, a variable left over
# from the LDA section, which made this cell depend on another section's state.
sentence_topic_df = pd.DataFrame({
    "Sentence": corpus,
    "Assigned Topic": dominant_topics + 1   # +1 for readable topic numbers
})

print(sentence_topic_df)


                         Sentence  Assigned Topic
0   Virat scored century in match               1
1            BJP won in elections               2
2  Bumra took 5 wicket in a match               1
3  Congress form state government               2


**NMF with Sample Data and TFIDF**

In [17]:
import pandas as pd

# Load dataset
df = pd.read_excel("/content/LDA-Data.xlsx")

# Prepare corpus as a list of strings. Missing cells become "" instead of the
# literal word "nan" that astype(str) yields for NaN, which would otherwise
# be kept by preprocessing and show up as a vocabulary term.
corpus = ["" if pd.isna(value) else str(value) for value in df['News']]


In [18]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required resources (NLTK reports already-present packages as
# up-to-date). 'punkt_tab' is included because the notebook's first section
# needed it to get past a tokenizer error; requesting it here keeps this
# section runnable without that earlier cell.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a document and return its lemmas joined by single spaces.

    Lower-cases the input, removes every character outside a-z and
    whitespace (without inserting a space in its place), tokenizes, filters
    out stopwords and tokens of at most two characters, and lemmatizes what
    is left.
    """
    normalized = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in word_tokenize(normalized):
        is_informative = token not in stop_words and len(token) > 2
        if is_informative:
            lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

# Cleaned texts, index-aligned with `corpus`.
processed_corpus = list(map(preprocess_text, corpus))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the cleaned sample corpus (vocabulary learned here).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)


In [20]:
# Dense, labelled copy of the TF-IDF matrix, printed in full.
term_columns = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=term_columns)

print(tfidf_df)


        bjp     bumra   century  congress  election  form  government  \
0  0.000000  0.000000  0.525473       0.0  0.000000   0.0         0.0   
1  0.707107  0.000000  0.000000       0.0  0.707107   0.0         0.0   
2  0.000000  0.525473  0.000000       0.0  0.000000   0.0         0.0   
3  0.000000  0.000000  0.000000       0.5  0.000000   0.5         0.5   

      match    scored  state      took     virat    wicket  
0  0.414289  0.525473    0.0  0.000000  0.525473  0.000000  
1  0.000000  0.000000    0.0  0.000000  0.000000  0.000000  
2  0.414289  0.000000    0.0  0.525473  0.000000  0.525473  
3  0.000000  0.000000    0.5  0.000000  0.000000  0.000000  


In [21]:
from sklearn.decomposition import NMF

# Number of NMF components (topics) to extract.
num_topics = 2

# random_state is pinned so repeated runs start from the same seed.
nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)

# Factorize the TF-IDF matrix; as the cell's last expression this also
# displays the fitted estimator.
nmf_model.fit(tfidf_matrix)


In [22]:
def display_topics(model, feature_names, top_words):
    """Print each topic's heading followed by its strongest terms.

    Args:
        model: object exposing ``components_``; every row is one topic and
            must provide ``argsort()``.
        feature_names: sequence giving the term for each column index.
        top_words: how many terms to show for each topic.
    """
    for position, weights in enumerate(model.components_):
        print(f"\nTopic {position + 1}:")
        # Reverse the ascending argsort so the heaviest terms come first.
        best_first = weights.argsort()[::-1]
        chosen = [feature_names[col] for col in best_first[:top_words]]
        print(", ".join(chosen))

# Five highest-weighted terms of each NMF topic, using the TF-IDF vocabulary.
display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out(), top_words=5)



Topic 1:
match, bumra, wicket, took, virat

Topic 2:
election, bjp, form, government, state


In [23]:
# Per-document topic weights from the NMF model fitted on TF-IDF features.
doc_topic_distribution = nmf_model.transform(tfidf_matrix)

topic_columns = [f"Topic {k}" for k in range(1, num_topics + 1)]
topic_df = pd.DataFrame(doc_topic_distribution, columns=topic_columns)

print(topic_df)


    Topic 1   Topic 2
0  0.579747  0.000000
1  0.000000  0.783491
2  0.579747  0.000000
3  0.000000  0.391745


In [55]:
# Dominant topic of a sentence = column index of its largest topic weight.
dominant_topics = doc_topic_distribution.argmax(axis=1)

# Pair every sentence with its topic number (+1 so numbering starts at 1).
topic_assignments = {
    "Sentence": corpus,
    "Assigned Topic": dominant_topics + 1,
}
sentence_topic_df = pd.DataFrame(topic_assignments)

print(sentence_topic_df)


                                                Sentence  Assigned Topic
0      Survey on Semantic Stereo Matching / Semantic ...               2
1      FUTURE-AI: Guiding Principles and Consensus Re...               2
2      Enforcing Mutual Consistency of Hard Regions f...               1
3      Parameter Decoupling Strategy for Semi-supervi...               2
4      Background-Foreground Segmentation for Interio...               1
...                                                  ...             ...
51769  Hierarchically-coupled hidden Markov models fo...               2
51770                         Blinking Molecule Tracking               2
51771  Towards a Mathematical Foundation of Immunolog...               2
51772  A Semi-Automatic Graph-Based Approach for Dete...               2
51773  SparseCodePicking: feature extraction in mass ...               2

[51774 rows x 2 columns]


**LDA with Kaggle Data and BoW**

In [26]:
import pandas as pd

# Load CSV dataset; rows the parser cannot read are skipped.
df = pd.read_csv("/content/arxiv_data.csv", on_bad_lines='skip', engine='python')

# Prepare corpus as a list of strings. Missing titles become "" rather than
# the literal word "nan" that astype(str) produces for NaN, which would
# otherwise pass preprocessing and be counted as a vocabulary term.
corpus = ["" if pd.isna(title) else str(title) for title in df['titles']]


In [27]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required resources (NLTK reports already-present packages as
# up-to-date). 'punkt_tab' is requested as well: the notebook's first section
# added it after the tokenizer raised an error, and this section should work
# without that earlier cell having run.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Reduce a title to its lemmatized content words, space-separated.

    The title is lower-cased and stripped of every character that is not a-z
    or whitespace (characters are removed, not replaced by spaces), then
    tokenized. Stopwords and tokens of two characters or fewer are dropped
    and the remaining tokens are lemmatized.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept = (
        token
        for token in word_tokenize(letters_only)
        if not (token in stop_words or len(token) <= 2)
    )
    return " ".join(map(lemmatizer.lemmatize, kept))

# Cleaned titles, index-aligned with `corpus`.
processed_corpus = list(map(preprocess_text, corpus))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the cleaned Kaggle titles (vocabulary learned here).
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(processed_corpus)


In [29]:
# Preview the first rows of the count matrix as a labelled table.
# Only the previewed rows are converted to a dense array: calling .toarray()
# on the whole matrix allocates one cell per (document, vocabulary term) pair
# just to print five rows, which can exhaust memory on this dataset.
bow_df = pd.DataFrame(
    bow_matrix[:5].toarray(),   # 5 rows = what DataFrame.head() shows by default
    columns=vectorizer.get_feature_names_out()
)

print(bow_df.head())


   aabo  aacp  aadnet  aaformer  aag  aam  aaseg  abandoned  abc  abcdp  ...  \
0     0     0       0         0    0    0      0          0    0      0  ...   
1     0     0       0         0    0    0      0          0    0      0  ...   
2     0     0       0         0    0    0      0          0    0      0  ...   
3     0     0       0         0    0    0      0          0    0      0  ...   
4     0     0       0         0    0    0      0          0    0      0  ...   

   zoom  zoomin  zooming  zoomintocheck  zoomnet  zooplankton  zootuning  \
0     0       0        0              0        0            0          0   
1     0       0        0              0        0            0          0   
2     0       0        0              0        0            0          0   
3     0       0        0              0        0            0          0   
4     0       0        0              0        0            0          0   

   zope  zsslr  zstgan  
0     0      0       0  
1     0     

In [30]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract.
num_topics = 2

# random_state is pinned so repeated runs start from the same seed.
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42
)

# Fit on the Kaggle bag-of-words counts; as the cell's last expression this
# also displays the fitted estimator.
lda_model.fit(bow_matrix)


In [31]:
def display_topics(model, feature_names, top_words):
    """Print the `top_words` highest-weighted terms for every topic.

    Args:
        model: object with ``components_``; rows are topics and must provide
            ``argsort()``.
        feature_names: sequence indexed by column position to get each term.
        top_words: number of terms listed per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"\nTopic {topic_number}:")
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print(", ".join(top_terms))

# Five highest-weighted terms of each LDA topic.
display_topics(lda_model, vectorizer.get_feature_names_out(), top_words=5)



Topic 1:
detection, image, object, learning, segmentation

Topic 2:
network, learning, adversarial, generative, deep


In [32]:
# Topic mixture of every title: one row per document, one column per topic.
doc_topic_distribution = lda_model.transform(bow_matrix)

topic_columns = [f"Topic {k}" for k in range(1, num_topics + 1)]
topic_df = pd.DataFrame(doc_topic_distribution, columns=topic_columns)

print(topic_df.head())


    Topic 1   Topic 2
0  0.930293  0.069707
1  0.812159  0.187841
2  0.618477  0.381523
3  0.487752  0.512248
4  0.928091  0.071909


In [61]:
# Dominant topic of a sentence = column index of its largest topic weight.
dominant_topics = doc_topic_distribution.argmax(axis=1)

# Pair every title with its topic number (+1 so numbering starts at 1).
topic_assignments = {
    "Sentence": corpus,
    "Assigned Topic": dominant_topics + 1,
}
sentence_topic_df = pd.DataFrame(topic_assignments)

print(sentence_topic_df)


                                                Sentence  Assigned Topic
0      Survey on Semantic Stereo Matching / Semantic ...               2
1      FUTURE-AI: Guiding Principles and Consensus Re...               2
2      Enforcing Mutual Consistency of Hard Regions f...               1
3      Parameter Decoupling Strategy for Semi-supervi...               2
4      Background-Foreground Segmentation for Interio...               1
...                                                  ...             ...
51769  Hierarchically-coupled hidden Markov models fo...               2
51770                         Blinking Molecule Tracking               2
51771  Towards a Mathematical Foundation of Immunolog...               2
51772  A Semi-Automatic Graph-Based Approach for Dete...               2
51773  SparseCodePicking: feature extraction in mass ...               2

[51774 rows x 2 columns]


 **NMF with Kaggle Data and BoW**

In [35]:
import pandas as pd

# Load CSV dataset; rows the parser cannot read are skipped.
df = pd.read_csv("/content/arxiv_data.csv", on_bad_lines='skip', engine='python')

# Prepare corpus as a list of strings. Missing titles become "" instead of
# the literal word "nan" that astype(str) yields for NaN; an empty title then
# cleans to an empty document, which the filter in the next cell removes.
corpus = ["" if pd.isna(title) else str(title) for title in df['titles']]


In [65]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download resources (NLTK reports already-present packages as up-to-date).
# 'punkt_tab' is included because the notebook's first section needed it to
# get past a tokenizer error; requesting it here keeps this section runnable
# without that earlier cell.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a title and return its lemmas joined by single spaces.

    Lower-cases the input, deletes every character outside a-z and
    whitespace (no space is inserted in its place), tokenizes, filters out
    stopwords and tokens of at most two characters, and lemmatizes the rest.
    The result may be an empty string when nothing survives the filter.
    """
    normalized = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in word_tokenize(normalized):
        if len(token) <= 2 or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

# Clean every title first; filtering happens afterwards so each cleaned text
# can still be paired with the title it came from.
processed_corpus_temp = list(map(preprocess_text, corpus))

# Keep only the (title, cleaned text) pairs whose cleaned text is non-blank.
non_empty_pairs = [
    (title, cleaned)
    for title, cleaned in zip(corpus, processed_corpus_temp)
    if cleaned.strip()
]
filtered_corpus_titles = [title for title, _ in non_empty_pairs]
filtered_processed_corpus = [cleaned for _, cleaned in non_empty_pairs]

processed_corpus = filtered_processed_corpus  # cleaned texts for downstream use
corpus_for_df = filtered_corpus_titles        # matching original titles for the result table


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the filtered, cleaned Kaggle titles.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(processed_corpus)


In [38]:
# Preview the first rows of the count matrix as a labelled table.
# Only the previewed rows are converted to a dense array: calling .toarray()
# on the whole matrix allocates one cell per (document, vocabulary term) pair
# just to print five rows, which can exhaust memory on this dataset.
bow_df = pd.DataFrame(
    bow_matrix[:5].toarray(),   # 5 rows = what DataFrame.head() shows by default
    columns=vectorizer.get_feature_names_out()
)

print(bow_df.head())


   aaa  aabo  aacp  aad  aadnet  aaformer  aag  aam  aamdrl  aanet  ...  \
0    0     0     0    0       0         0    0    0       0      0  ...   
1    0     0     0    0       0         0    0    0       0      0  ...   
2    0     0     0    0       0         0    0    0       0      0  ...   
3    0     0     0    0       0         0    0    0       0      0  ...   
4    0     0     0    0       0         0    0    0       0      0  ...   

   zoomintocheck  zoomnet  zoomtoinpaint  zooplankton  zootuning  zope  zorro  \
0              0        0              0            0          0     0      0   
1              0        0              0            0          0     0      0   
2              0        0              0            0          0     0      0   
3              0        0              0            0          0     0      0   
4              0        0              0            0          0     0      0   

   zpd  zsslr  zstgan  
0    0      0       0  
1    0      0 

In [39]:
from sklearn.decomposition import NMF

# Number of NMF components (topics) to extract.
num_topics = 2

# random_state is pinned so repeated runs start from the same seed.
nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)

# Factorize the Kaggle bag-of-words matrix; as the cell's last expression
# this also displays the fitted estimator.
nmf_model.fit(bow_matrix)


In [40]:
def display_topics(model, feature_names, top_words):
    """Print, for each topic, its `top_words` highest-weighted terms.

    Args:
        model: object with a ``components_`` attribute whose rows (one per
            topic) support ``argsort()``.
        feature_names: sequence mapping a column index to its term.
        top_words: number of terms to print per topic.
    """
    topic_number = 0
    for weights in model.components_:
        topic_number += 1
        print(f"\nTopic {topic_number}:")
        # Indices ordered from largest weight to smallest.
        descending = weights.argsort()[::-1]
        labels = (feature_names[pos] for pos in descending[:top_words])
        print(", ".join(labels))

# Five highest-weighted terms of each NMF topic.
display_topics(nmf_model, vectorizer.get_feature_names_out(), top_words=5)



Topic 1:
learning, reinforcement, deep, representation, using

Topic 2:
network, neural, graph, image, using


In [41]:
# Per-document topic weights from the fitted NMF model.
doc_topic_matrix = nmf_model.transform(bow_matrix)

topic_columns = [f"Topic {k}" for k in range(1, num_topics + 1)]
topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_columns)

print(topic_df.head())


    Topic 1   Topic 2
0  0.004016  0.013649
1  0.001476  0.003155
2  0.005518  0.042458
3  0.002585  0.014308
4  0.001047  0.013692


In [67]:
from sklearn.decomposition import NMF

# Fit a fresh NMF model on the Kaggle bag-of-words matrix.
num_topics = 2  # same topic count as the NMF cell for the Kaggle data
nmf_model = NMF(random_state=42, n_components=num_topics)
nmf_model.fit(bow_matrix)

# Row-wise argmax over the topic weights gives each sentence's dominant topic.
doc_topic_matrix = nmf_model.transform(bow_matrix)
dominant_topics = doc_topic_matrix.argmax(axis=1)

# corpus_for_df holds the titles that survived the empty-document filter, so
# it lines up row-for-row with bow_matrix.
topic_assignments = {
    "Sentence": corpus_for_df,
    "Assigned Topic": dominant_topics + 1,   # +1 so numbering starts at 1
}
sentence_topic_df = pd.DataFrame(topic_assignments)

print(sentence_topic_df)


                                                Sentence  Assigned Topic
0      Survey on Semantic Stereo Matching / Semantic ...               2
1      FUTURE-AI: Guiding Principles and Consensus Re...               2
2      Enforcing Mutual Consistency of Hard Regions f...               2
3      Parameter Decoupling Strategy for Semi-supervi...               2
4      Background-Foreground Segmentation for Interio...               2
...                                                  ...             ...
51769  Hierarchically-coupled hidden Markov models fo...               1
51770                         Blinking Molecule Tracking               2
51771  Towards a Mathematical Foundation of Immunolog...               2
51772  A Semi-Automatic Graph-Based Approach for Dete...               1
51773  SparseCodePicking: feature extraction in mass ...               2

[51774 rows x 2 columns]


 **NMF with Kaggle Data and TFIDF**

In [43]:
import pandas as pd

# Load dataset; rows the parser cannot read are skipped.
df = pd.read_csv("/content/arxiv_data.csv", on_bad_lines='skip', engine='python')

# Prepare corpus as a list of strings. Missing titles become "" rather than
# the literal word "nan" that astype(str) produces for NaN, which would
# otherwise pass preprocessing and be weighted as a vocabulary term.
corpus = ["" if pd.isna(title) else str(title) for title in df['titles']]


In [44]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required resources (NLTK reports already-present packages as
# up-to-date). 'punkt_tab' is requested as well: the notebook's first section
# added it after the tokenizer raised an error, and this section should work
# without that earlier cell having run.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Reduce a title to its lemmatized content words, space-separated.

    The title is lower-cased and stripped of every character that is not a-z
    or whitespace (characters are removed, not replaced by spaces), then
    tokenized. Stopwords and tokens of two characters or fewer are dropped
    and the remaining tokens are lemmatized.
    """
    normalized = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in word_tokenize(normalized):
        is_informative = token not in stop_words and len(token) > 2
        if is_informative:
            lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

# Cleaned titles, index-aligned with `corpus`.
processed_corpus = list(map(preprocess_text, corpus))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the cleaned Kaggle titles (vocabulary learned here).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)


In [46]:
# Preview the first rows of the TF-IDF matrix as a labelled table.
# Only the previewed rows are converted to a dense array: calling .toarray()
# on the whole matrix allocates one float per (document, vocabulary term)
# pair just to print five rows, which can exhaust memory on this dataset.
tfidf_df = pd.DataFrame(
    tfidf_matrix[:5].toarray(),   # 5 rows = what DataFrame.head() shows by default
    columns=tfidf_vectorizer.get_feature_names_out()
)

print(tfidf_df.head())


   aaa  aabo  aacp  aad  aadnet  aads  aaformer  aag  aam  aamdrl  ...  \
0  0.0   0.0   0.0  0.0     0.0   0.0       0.0  0.0  0.0     0.0  ...   
1  0.0   0.0   0.0  0.0     0.0   0.0       0.0  0.0  0.0     0.0  ...   
2  0.0   0.0   0.0  0.0     0.0   0.0       0.0  0.0  0.0     0.0  ...   
3  0.0   0.0   0.0  0.0     0.0   0.0       0.0  0.0  0.0     0.0  ...   
4  0.0   0.0   0.0  0.0     0.0   0.0       0.0  0.0  0.0     0.0  ...   

   zoomtoinpaint  zoonosis  zooplankton  zootuning  zope  zorro  zpd  zscores  \
0            0.0       0.0          0.0        0.0   0.0    0.0  0.0      0.0   
1            0.0       0.0          0.0        0.0   0.0    0.0  0.0      0.0   
2            0.0       0.0          0.0        0.0   0.0    0.0  0.0      0.0   
3            0.0       0.0          0.0        0.0   0.0    0.0  0.0      0.0   
4            0.0       0.0          0.0        0.0   0.0    0.0  0.0      0.0   

   zsslr  zstgan  
0    0.0     0.0  
1    0.0     0.0  
2    0.0   

In [47]:
from sklearn.decomposition import NMF

# Number of NMF components (topics) to extract.
num_topics = 2

# random_state is pinned so repeated runs start from the same seed.
nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)

# Factorize the Kaggle TF-IDF matrix; as the cell's last expression this
# also displays the fitted estimator.
nmf_model.fit(tfidf_matrix)


In [48]:
def display_topics(model, feature_names, top_words):
    """Print each topic's heading followed by its strongest terms.

    Args:
        model: object exposing ``components_``; every row is one topic and
            must provide ``argsort()``.
        feature_names: sequence giving the term for each column index.
        top_words: how many terms to show for each topic.
    """
    for position, weights in enumerate(model.components_):
        print(f"\nTopic {position + 1}:")
        # Reverse the ascending argsort so the heaviest terms come first.
        best_first = weights.argsort()[::-1]
        chosen = [feature_names[col] for col in best_first[:top_words]]
        print(", ".join(chosen))

# Five highest-weighted terms of each NMF topic, using the TF-IDF vocabulary.
display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out(), top_words=5)



Topic 1:
network, neural, graph, convolutional, adversarial

Topic 2:
learning, reinforcement, deep, representation, unsupervised


In [49]:
# Per-document topic weights from the NMF model fitted on TF-IDF features.
doc_topic_distribution = nmf_model.transform(tfidf_matrix)

topic_columns = [f"Topic {k}" for k in range(1, num_topics + 1)]
topic_df = pd.DataFrame(doc_topic_distribution, columns=topic_columns)

print(topic_df.head())


    Topic 1   Topic 2
0  0.009739  0.011568
1  0.001660  0.002869
2  0.011662  0.010740
3  0.004855  0.005628
4  0.004064  0.002565


In [68]:
# Dominant topic of a sentence = column index of its largest topic weight.
dominant_topics = doc_topic_distribution.argmax(axis=1)

# Pair every title with its topic number (+1 so numbering starts at 1).
topic_assignments = {
    "Sentence": corpus,
    "Assigned Topic": dominant_topics + 1,
}
sentence_topic_df = pd.DataFrame(topic_assignments)

print(sentence_topic_df)


                                                Sentence  Assigned Topic
0      Survey on Semantic Stereo Matching / Semantic ...               2
1      FUTURE-AI: Guiding Principles and Consensus Re...               2
2      Enforcing Mutual Consistency of Hard Regions f...               1
3      Parameter Decoupling Strategy for Semi-supervi...               2
4      Background-Foreground Segmentation for Interio...               1
...                                                  ...             ...
51769  Hierarchically-coupled hidden Markov models fo...               2
51770                         Blinking Molecule Tracking               2
51771  Towards a Mathematical Foundation of Immunolog...               2
51772  A Semi-Automatic Graph-Based Approach for Dete...               2
51773  SparseCodePicking: feature extraction in mass ...               2

[51774 rows x 2 columns]
