In [1]:
# Title: Document Clustering

# Task 1: A news platform uses Latent Dirichlet Allocation (LDA) to group articles by topic. Implement LDA on a corpus of news articles to identify underlying topics.
# Task 2: A library uses k-means clustering to organize digital books based on content similarity. Use a text vectorization method (e.g., TF-IDF) and apply k-means.
# Task 3: A law firm leverages agglomerative clustering to organize legal documents related to similar cases. Perform hierarchical clustering and visualize using a dendrogram.

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure the NLTK data used by preprocess_text is available locally.
# nltk.data.find() signals a missing resource with LookupError, so that is the
# exception that must trigger the download.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the word_tokenize models from 'punkt_tab' rather than 'punkt'.
_NLTK_RESOURCES = (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

def preprocess_text(text):
    """Normalize raw text into a space-separated string of content words.

    The input is lowercased, every character other than a-z and whitespace is
    stripped, and the remainder is tokenized with NLTK's word_tokenize. Tokens
    that are English stopwords, or that have two or fewer characters, are
    discarded.

    Args:
        text: The raw document as a string.

    Returns:
        A single string containing the surviving tokens joined by spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    candidates = word_tokenize(letters_only)
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in candidates:
        if token in english_stopwords:
            continue
        if len(token) <= 2:
            continue
        kept.append(token)
    return " ".join(kept)

# Sample corpus of news articles
news_articles = [
    "The economy is showing signs of recovery with increased consumer spending and job growth.",
    "Central banks are debating interest rate hikes to curb inflation in several countries.",
    "New advancements in artificial intelligence are transforming various industries, from healthcare to finance.",
    "Machine learning algorithms are being used to predict stock market trends and optimize trading strategies.",
    "A major earthquake struck the region, causing widespread damage and requiring immediate humanitarian aid.",
    "Volcanic eruption in the Pacific caused disruptions to air travel and local evacuations.",
    "Scientists discovered a new exoplanet with potential for liquid water, fueling search for extraterrestrial life.",
    "Space agencies are planning new missions to Mars and beyond, pushing the boundaries of human exploration."
]

# Preprocess the articles
preprocessed_articles = [preprocess_text(article) for article in news_articles]

# Use CountVectorizer for LDA as it typically works with word counts.
# min_df is 1 rather than 2: with only eight short articles almost no term
# occurs in two or more documents, so min_df=2 would discard nearly the whole
# vocabulary and leave LDA with nothing meaningful to model.
# max_df=0.95 still drops terms that appear in (almost) every document.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=1)
dtm = count_vectorizer.fit_transform(preprocessed_articles)

# Apply LDA to identify underlying topics
num_topics = 3 # You can adjust this based on your dataset and desired granularity
# LatentDirichletAllocation does not accept an n_init argument (that option
# belongs to estimators such as KMeans); passing it raises TypeError, so only
# the supported parameters are given. random_state keeps the fit reproducible.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(dtm)

print("--- Task 1: News Article Topic Modeling with LDA ---")
print("\nIdentified Topics (Top 10 keywords per topic):")
feature_names = count_vectorizer.get_feature_names_out()
for topic_number, word_weights in enumerate(lda.components_, start=1):
    # Indices of the (up to) ten largest weights, heaviest first.
    heaviest_first = word_weights.argsort()[::-1][:10]
    print(f"Topic #{topic_number}:")
    print([feature_names[word_idx] for word_idx in heaviest_first])

print("\nDominant Topic for each Article:")
topic_distributions = lda.transform(dtm)
# Each row of topic_distributions holds one article's topic mixture.
for article_number, (article, topic_mix) in enumerate(zip(news_articles, topic_distributions), start=1):
    dominant_topic = np.argmax(topic_mix)
    preview = article[:70]
    # Topics are reported 1-based to match the "Topic #n" headings above.
    print(f"Article {article_number}: '{preview}...' -> Dominant Topic: {dominant_topic + 1}")

ModuleNotFoundError: No module named 'nltk'