In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import csv

# Download NLTK resources (needed for stopwords and lemmatization)
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Function to preprocess text
def preprocess_text(text):
    # Tokenize text into words
    words = word_tokenize(text)
    
    # Convert to lowercase and remove punctuation
    words = [word.lower() for word in words if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return " ".join(words)

# Function to perform LDA topic modeling
def perform_lda(text, num_topics):
    # Preprocess the text
    preprocessed_text = preprocess_text(text)
    
    # Create a document-term matrix using CountVectorizer
    vectorizer = CountVectorizer(min_df=1, max_df=1.0)  # Adjust min_df and max_df values to control redundancy
    dtm = vectorizer.fit_transform([preprocessed_text])
    
    # Apply LDA with the specified number of topics
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(dtm)
    
    # Get the most significant words and their probability scores for each topic
    feature_names = vectorizer.get_feature_names_out()
    topics_keywords = []
    for topic_idx, topic in enumerate(lda_model.components_):
        topic_keywords = [feature_names[i] for i in topic.argsort()[:-6:-1]]
        topic_prob_scores = [round(topic[i], 2) for i in topic.argsort()[:-6:-1]]
        topics_keywords.append((topic_idx, topic_keywords, topic_prob_scores))
    
    return topics_keywords

# Read user input dynamically
user_input = input("Enter your text passage or paragraph: ")

# Dynamically adjust the number of topics as needed
num_topics = min(5, len(word_tokenize(user_input)))

# Identify keywords using LDA topic modeling
topics_keywords = perform_lda(user_input, num_topics)

# Save the results to a CSV file
csv_file = "lda_results.csv"
with open(csv_file, mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Topic ID", "Keywords", "Probability Scores"])
    for topic_idx, keywords, prob_scores in topics_keywords:
        writer.writerow([topic_idx, ", ".join(keywords), ", ".join(map(str, prob_scores))])

print("Results saved to lda_results.csv")


[nltk_data] Downloading package punkt to /home/iiit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/iiit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/iiit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ValueError: max_df corresponds to < documents than min_df