# In [None]:
import csv

import joblib
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Download the NLTK resources used below: tokenizer models (word/sentence
# tokenization), the English stopword list, and WordNet (lemmatization).
nltk.download("punkt")
# Recent NLTK releases load the tokenizer models from "punkt_tab" rather than
# "punkt"; fetching both keeps word_tokenize/sent_tokenize working on either.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

# Install wordcloud package (if not installed)
# NOTE(review): `from wordcloud import WordCloud` in the import section runs
# before this check, so the fallback only takes effect if that import is moved
# below this block.
try:
    import wordcloud
except ImportError:
    import subprocess
    import sys

    # Invoke pip through the running interpreter so the package is installed
    # into the environment this code executes in; unlike the IPython-only
    # `!pip` shell escape this is also valid plain Python. check=False keeps
    # the step best-effort: a failed install does not raise here.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "wordcloud"],
        check=False,
    )


# Function to preprocess text
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is split with NLTK's ``word_tokenize``; tokens that are not purely
    alphabetic are dropped, the rest are lower-cased, English stopwords are
    removed, and each surviving word is lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned words joined by single spaces (an empty string when no
        word survives the filters).
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words("english"))
    lemmatize = WordNetLemmatizer().lemmatize

    cleaned = []
    for token in tokens:
        # Punctuation, numbers and mixed tokens such as "n't" are discarded.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in english_stopwords:
            continue
        cleaned.append(lemmatize(lowered))

    return " ".join(cleaned)

# Read the raw data (ebook) - replace "edata.txt" with the actual filename or path
with open("edata.txt", mode='r', encoding='utf-8') as file:
    ebook_text = file.read()

# Preprocess the data
preprocessed_data = preprocess_text(ebook_text)

# Save the preprocessed data to a CSV file (requires `import pandas as pd`).
# NOTE(review): the whole ebook is written as a single row, so the topic model
# trained from data.csv sees exactly one document. Splitting the text into
# passages before saving would give LDA something to contrast - confirm intent.
data = pd.DataFrame([preprocessed_data], columns=["Text"])
# Encoding is spelled out to match the explicit utf-8 used when the file is
# read back with the csv module.
data.to_csv("data.csv", index=False, encoding="utf-8")

# Function to train LDA model from raw data
def train_lda_from_raw_data(raw_data, num_topics):
    """Fit a bag-of-words vectorizer and an LDA topic model on raw passages.

    Args:
        raw_data: Iterable of raw text strings, one per document.
        num_topics: Number of topics (``n_components``) for the LDA model.

    Returns:
        A ``(lda_model, vectorizer)`` tuple: the fitted
        ``LatentDirichletAllocation`` and the fitted ``CountVectorizer`` whose
        vocabulary the model's topics refer to.
    """
    # Clean every passage with the shared preprocessing routine.
    documents = list(map(preprocess_text, raw_data))

    # min_df=1 / max_df=1.0 keep every term; tighten them to drop very rare
    # or very common words.
    vectorizer = CountVectorizer(min_df=1, max_df=1.0)
    term_counts = vectorizer.fit_transform(documents)

    # Fixed random_state makes the fitted topics reproducible between runs.
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(term_counts)

    return lda_model, vectorizer

# Function to find keywords from user input using the trained LDA model
def find_keywords_from_user_input(user_input, lda_model, vectorizer, top_n=5):
    """Return the top-weighted words of every topic plus the cleaned input.

    NOTE(review): the keywords come solely from ``lda_model.components_`` and
    are therefore the same for every ``user_input``; the input only affects the
    second return value. If input-specific keywords are wanted, the topics
    should be weighted by ``lda_model.transform`` of the vectorized input -
    confirm the intended behaviour before changing it.

    Args:
        user_input: Raw text entered by the user.
        lda_model: Fitted LDA model exposing ``components_`` (topics x terms).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: How many words to report per topic (default 5, the original
            hard-coded value).

    Returns:
        A tuple ``(topics_keywords, preprocessed_text)`` where
        ``topics_keywords`` is a list of
        ``(topic_idx, keywords, scores)`` triples ordered by topic index and
        ``preprocessed_text`` is ``preprocess_text(user_input)``. ``scores``
        are the raw ``components_`` weights rounded to two decimals - these
        are unnormalised topic-word weights, not probabilities.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Preprocess the user input
    preprocessed_text = preprocess_text(user_input)

    feature_names = vectorizer.get_feature_names_out()
    topics_keywords = []
    for topic_idx, topic in enumerate(lda_model.components_):
        # Rank the vocabulary once (highest weight first) and reuse the
        # ranking for both the words and their weights.
        top_indices = topic.argsort()[::-1][:top_n]
        topic_keywords = [feature_names[i] for i in top_indices]
        topic_prob_scores = [round(topic[i], 2) for i in top_indices]
        topics_keywords.append((topic_idx, topic_keywords, topic_prob_scores))

    return topics_keywords, preprocessed_text

# Read raw data from a file (e.g., CSV file containing text passages)
# The csv module rejects fields longer than 131072 characters by default, and
# data.csv stores an entire preprocessed ebook in a single field, so raise the
# limit before reading. 2**31 - 1 fits in a C long on every platform.
csv.field_size_limit(2**31 - 1)

raw_data = []
with open("data.csv", mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row (no error if the file is empty)
    for row in reader:
        if row:  # csv.reader yields [] for blank lines; row[0] would fail
            raw_data.append(row[0])  # Assuming the text data is in the first column

# Fit the topic model and its vectorizer on the passages loaded above
num_topics = 5  # You can adjust the number of topics as needed
lda_model, vectorizer = train_lda_from_raw_data(raw_data, num_topics=num_topics)

# Persist both fitted objects with joblib so they can be reused later
joblib.dump(value=lda_model, filename="lda_model.joblib")
joblib.dump(value=vectorizer, filename="vectorizer.joblib")

# Reload them from disk; everything below works on the reloaded copies
lda_model = joblib.load(filename="lda_model.joblib")
vectorizer = joblib.load(filename="vectorizer.joblib")

# Read user input dynamically
user_input = input("Enter your text passage or paragraph: ")

# Find keywords from user input using the loaded LDA model
topics_keywords, preprocessed_text = find_keywords_from_user_input(user_input, lda_model, vectorizer)

# Perform EDA on user input
num_sentences = len(nltk.sent_tokenize(user_input))
# word_tokenize emits punctuation marks as separate tokens, so only tokens
# containing at least one letter or digit are counted as words
# ("Hello, world." is 2 words, not 4).
num_words = sum(
    1
    for token in word_tokenize(user_input)
    if any(char.isalnum() for char in token)
)

print("\n[Exploratory Data Analysis on User Input]")
print(f"Number of Sentences: {num_sentences}")
print(f"Number of Words: {num_words}")

# Summarise the keyword lists returned for the topics
# Flatten the per-topic lists once; duplicates across topics are kept here.
all_keywords = [keyword for _, keywords, _ in topics_keywords for keyword in keywords]
num_keywords = len(all_keywords)
unique_keywords = set(all_keywords)

print("\n[Exploratory Data Analysis on Generated Keywords]")
print(f"Number of Topics: {len(topics_keywords)}")
print(f"Total Number of Keywords: {num_keywords}")
print(f"Number of Unique Keywords: {len(unique_keywords)}")

# Visual EDA - horizontal bar chart of how many topics list each keyword
keyword_freq = {}
for _, keywords, _ in topics_keywords:
    for keyword in keywords:
        # First sighting starts at 0, then every occurrence adds one.
        keyword_freq[keyword] = keyword_freq.get(keyword, 0) + 1

plt.figure(figsize=(10, 6))
sns.barplot(
    x=list(keyword_freq.values()),
    y=list(keyword_freq),
    palette="viridis",
)
plt.title("Keyword Frequency Distribution")
plt.xlabel("Frequency")
plt.ylabel("Keywords")
plt.tight_layout()
plt.show()

# Render a word cloud from every topic keyword (repeats across topics included)
keywords_text = " ".join(
    keyword
    for _, keywords, _ in topics_keywords
    for keyword in keywords
)
cloud_options = {"width": 800, "height": 400, "background_color": "white"}
wordcloud = WordCloud(**cloud_options).generate(keywords_text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word Cloud of Keywords")
plt.axis("off")  # the image needs no ticks or frame
plt.show()

# Write one CSV row per topic: its id, its keywords and their scores
csv_file = "lda_results.csv"
header = ["Topic ID", "Keywords", "Probability Scores"]
with open(csv_file, mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for topic_idx, keywords, prob_scores in topics_keywords:
        # Keywords and scores are each flattened into one comma-separated cell.
        keyword_cell = ", ".join(keywords)
        score_cell = ", ".join(str(score) for score in prob_scores)
        writer.writerow([topic_idx, keyword_cell, score_cell])

print("\nResults saved to lda_results.csv")
