<a href="https://colab.research.google.com/github/Sathwika2202/NLP/blob/main/2403A52250_BATCH_09_ASSIGNMENT_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

import matplotlib.pyplot as plt


In [9]:
import pandas as pd

abstracts = [
    "This research explores machine learning algorithms for predicting student performance using classification techniques.",
    "Deep learning models are applied to medical image analysis for early disease detection.",
    "The study focuses on network security using intrusion detection systems and anomaly detection methods.",
    "This paper discusses natural language processing techniques for sentiment analysis in social media data.",
    "A comparative analysis of data mining algorithms for large scale healthcare datasets is presented.",
    "The research investigates cloud computing architectures and resource optimization strategies.",
    "This work presents an efficient algorithm for big data analytics using distributed computing.",
    "Machine learning techniques are used to improve recommendation systems in e-commerce platforms.",
    "The study analyzes cyber security threats and mitigation techniques using artificial intelligence.",
    "This paper explores computer vision methods for object detection and image classification.",
    "Research focuses on blockchain technology and its applications in secure data sharing.",
    "The study evaluates different feature extraction techniques for text classification tasks.",
    "Artificial intelligence based chatbots are analyzed for customer support automation.",
    "The paper discusses predictive analytics for financial market forecasting.",
    "This research presents sentiment analysis techniques for opinion mining.",
    "The study applies neural networks to speech recognition systems.",
    "A review of supervised and unsupervised learning algorithms is presented.",
    "The research examines IoT based smart healthcare monitoring systems.",
    "This paper explores optimization techniques for neural network training.",
    "The study analyzes social network data using graph based algorithms."
]

# Repeat abstracts to reach 200 documents
dataset = abstracts * 10   # 20 x 10 = 200

df = pd.DataFrame({
    "abstract": dataset
})

# Save dataset
df.to_csv("research_abstracts.csv", index=False)

print("Dataset created successfully!")
print("Total documents:", len(df))


Dataset created successfully!
Total documents: 200


In [10]:
# Load dataset
df = pd.read_csv("research_abstracts.csv")   # CSV must contain 'abstract' column

# Extract text documents
documents = df['abstract'].dropna().tolist()

print("Total documents:", len(documents))


Total documents: 200


In [11]:
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()                            # Convert to lowercase
    text = re.sub(r'\d+', '', text)                # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)            # Remove punctuation
    tokens = word_tokenize(text)                   # Tokenization
    tokens = [word for word in tokens
              if word not in stop_words and len(word) > 2]
    return " ".join(tokens)


In [14]:
import nltk
nltk.download('punkt_tab')

processed_docs = [preprocess_text(doc) for doc in documents]

print(processed_docs[0])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...


research explores machine learning algorithms predicting student performance using classification techniques


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [15]:
count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2
)

doc_term_matrix = count_vectorizer.fit_transform(processed_docs)

print(doc_term_matrix.shape)


(200, 108)


In [16]:
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2
)

tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)

print(tfidf_matrix.shape)


(200, 108)


In [17]:
num_topics = 5

lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42
)


In [18]:
lda_model.fit(doc_term_matrix)


In [19]:
def display_topics(model, feature_names, top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx+1}:")
        print([feature_names[i] for i in topic.argsort()[:-top_words-1:-1]])


In [20]:
lda_features = count_vectorizer.get_feature_names_out()
display_topics(lda_model, lda_features, 10)



Topic 1:
['research', 'paper', 'image', 'learning', 'analysis', 'discusses', 'detection', 'analytics', 'models', 'medical']

Topic 2:
['research', 'data', 'analysis', 'computing', 'mining', 'presents', 'presented', 'healthcare', 'optimization', 'algorithms']

Topic 3:
['detection', 'systems', 'network', 'using', 'methods', 'study', 'security', 'artificial', 'intelligence', 'machine']

Topic 4:
['techniques', 'using', 'classification', 'algorithms', 'explores', 'study', 'learning', 'analyzes', 'predicting', 'performance']

Topic 5:
['paper', 'social', 'based', 'data', 'study', 'systems', 'neural', 'processing', 'media', 'natural']


In [21]:
nmf_model = NMF(
    n_components=num_topics,
    random_state=42
)


In [22]:
nmf_model.fit(tfidf_matrix)


In [23]:
nmf_features = tfidf_vectorizer.get_feature_names_out()
display_topics(nmf_model, nmf_features, 10)



Topic 1:
['detection', 'image', 'methods', 'vision', 'computer', 'object', 'explores', 'classification', 'models', 'medical']

Topic 2:
['analysis', 'sentiment', 'mining', 'data', 'presents', 'opinion', 'discusses', 'research', 'techniques', 'processing']

Topic 3:
['analyzes', 'intelligence', 'artificial', 'study', 'using', 'based', 'security', 'cyber', 'mitigation', 'threats']

Topic 4:
['learning', 'machine', 'algorithms', 'techniques', 'review', 'unsupervised', 'supervised', 'predicting', 'performance', 'student']

Topic 5:
['neural', 'optimization', 'training', 'network', 'explores', 'speech', 'networks', 'recognition', 'applies', 'paper']


In [25]:
print("LDA Topics are more probabilistic and overlapping.")
print("NMF Topics are sharper and more interpretable for short texts.")


LDA Topics are more probabilistic and overlapping.
NMF Topics are sharper and more interpretable for short texts.


In [24]:
doc_index = 0

lda_distribution = lda_model.transform(doc_term_matrix)
nmf_distribution = nmf_model.transform(tfidf_matrix)

print("LDA Topic Distribution:", lda_distribution[doc_index])
print("NMF Topic Distribution:", nmf_distribution[doc_index])


LDA Topic Distribution: [0.01676765 0.01683194 0.01680495 0.93279958 0.01679588]
NMF Topic Distribution: [0.01678957 0.00306901 0.01526776 0.41146792 0.02321583]
