In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset from the Excel workbook. The path is a raw string so the
# Windows backslashes stay literal: "\M", "\A" and "\c" are not valid escape
# sequences and trigger a warning in a normal string literal.
books_df = pd.read_excel(r"E:\MGM UDICT\Analytica\c.xlsx")

# Rows without text cannot be vectorized, so drop them before fitting.
books_df = books_df.dropna(subset=['Tokenized Text'])

# Step 1: Initialize the TF-IDF vectorizer (default settings).
vectorizer = TfidfVectorizer()

# Step 2: Learn the vocabulary and build one TF-IDF row per book.
tfidf_matrix = vectorizer.fit_transform(books_df['Tokenized Text'])

# Step 3: Dense, human-readable view of the matrix, indexed by book title.
# `.values` is used so titles are assigned by position, not by the (now
# gappy) index left behind by dropna.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df['Title'] = books_df['Title'].values
tfidf_df = tfidf_df.set_index('Title')

def identify_book(prompt):
    """Find the book whose TF-IDF vector is most similar to ``prompt``.

    The prompt is handed to the already-fitted module-level ``vectorizer``
    as-is; no additional preprocessing is performed here, so any cleaning
    applied to the book texts must be applied to the prompt by the caller.

    Args:
        prompt: Query text to match against the books.

    Returns:
        A ``(title, score)`` tuple: the title of the best-matching row of
        ``books_df`` and its cosine similarity to the prompt. When every
        score is equal (for example all zero), ``argmax`` selects the first
        book.
    """
    # Project the query into the same TF-IDF space as the books.
    query_vector = vectorizer.transform([prompt])

    # One row of scores: similarity of the query to each book.
    scores = cosine_similarity(query_vector, tfidf_matrix)

    # Position of the largest score (flattened index of the single row).
    best_position = scores.argmax()

    titles = books_df['Title'].values
    return titles[best_position], scores[0][best_position]

# Example usage: find the best-matching book for a sample prompt.
# `book_title` is reused by the question-generation code further down.
prompt = "God"  # Sample query (an English word)
book_title, similarity_score = identify_book(prompt)
print(f"Most Relevant Book: {book_title}, Similarity Score: {similarity_score:.4f}")


Most Relevant Book: truth_is_god, Similarity Score: 0.6063


In [10]:
import random # STEP 3 
import re
from nltk.tokenize import sent_tokenize, word_tokenize

def find_context_sentences(sentences, search_word, window_size=1):
    """
    Find sentences containing the search word and include surrounding context.

    The match is case-insensitive and anchored on word boundaries, so
    searching for "cat" does not match "cats". Each sentence appears at most
    once per position in the result, even when the context windows of
    neighbouring matches overlap.

    Args:
    - sentences: List of all sentences from the text.
    - search_word: Word to search for (treated literally, not as a regex).
    - window_size: How many surrounding sentences to include before and after the target sentence.
    Returns:
    - List of matching sentences plus their context, in document order.
    """
    # The pattern is the same for every sentence, so compile it once.
    pattern = re.compile(rf'\b{re.escape(search_word)}\b', re.I)

    total = len(sentences)
    selected_positions = set()

    # Collect positions rather than sentences so overlapping windows do not
    # add the same sentence twice.
    for i, sentence in enumerate(sentences):
        if pattern.search(sentence):
            start_idx = max(0, i - window_size)
            end_idx = min(total, i + window_size + 1)
            selected_positions.update(range(start_idx, end_idx))

    # Emit the selected sentences in their original order.
    return [sentences[pos] for pos in sorted(selected_positions)]

def truncate_sentence(sentence, max_words=20):
    """Cap a sentence at ``max_words`` tokens.

    Sentences within the limit are returned unchanged. Longer ones are
    rebuilt from their first ``max_words`` tokens joined by single spaces,
    followed by '...'.
    """
    tokens = word_tokenize(sentence)
    if len(tokens) <= max_words:
        # Already short enough: hand back the original text untouched.
        return sentence
    return ' '.join(tokens[:max_words]) + '...'

def generate_unique_questions(sentences, num_questions=10, max_words=20):
    """
    Generate unique questions from a list of sentences.

    Each question is a randomly chosen starter phrase followed by a sentence
    truncated to ``max_words`` tokens and a trailing "?". Sentences are
    visited round-robin, and a starter is never reused for the same sentence,
    so every returned question is distinct.

    Args:
    - sentences: Sentences to turn into questions.
    - num_questions: Maximum number of questions to return.
    - max_words: Token limit passed to ``truncate_sentence``.
    Returns:
    - List of up to ``num_questions`` unique questions. Fewer are returned
      when the sentences cannot yield that many distinct questions; an empty
      list is returned when there are no sentences.
    """
    question_starters = [
        "What is the significance of",
        "Who is involved in",
        "When did",
        "Why does",
        "How does",
        "Where did",
        "What are the details of",
        "Explain the role of",
        "Describe the impact of",
        "What can you tell me about"
    ]

    # Nothing to ask for, or nothing to build questions from. Without this
    # guard an empty sentence list could never satisfy the requested count.
    if num_questions <= 0 or not sentences:
        return []

    # Truncate each sentence once and drop repeats (order preserved), since
    # identical fragments can only ever produce identical questions.
    fragments = list(dict.fromkeys(
        truncate_sentence(sentence, max_words=max_words).strip()
        for sentence in sentences
    ))

    # Give every fragment its own shuffled stack of starters. Popping from it
    # keeps the choice random while guaranteeing no (starter, fragment) pair
    # is produced twice.
    unused_starters = {
        fragment: random.sample(question_starters, len(question_starters))
        for fragment in fragments
    }

    questions = []
    seen = set()
    while len(questions) < num_questions:
        made_progress = False
        for fragment in fragments:
            if len(questions) >= num_questions:
                break
            starters = unused_starters[fragment]
            if not starters:
                continue  # every starter already used for this fragment
            question = f"{starters.pop()} {fragment}?"
            made_progress = True
            if question not in seen:
                seen.add(question)
                questions.append(question)
        if not made_progress:
            # All combinations are exhausted; return what we have instead of
            # looping forever.
            break

    return questions

def generate_questions_v2(tokenized_text, search_word=None, num_questions=10, max_words=20, context_window=1):
    """Build questions from a text, optionally focused on one search word.

    The text is split into sentences with nltk's ``sent_tokenize``. When
    ``search_word`` is truthy, only sentences containing it (plus
    ``context_window`` neighbours on each side) are kept. The remaining
    sentences are passed to ``generate_unique_questions``.

    Args:
        tokenized_text: Full text of the book as a single string.
        search_word: Word to focus on; falsy values use every sentence.
        num_questions: Number of questions requested.
        max_words: Per-question token limit for the sentence part.
        context_window: Neighbouring sentences to keep around each match.

    Returns:
        The list produced by ``generate_unique_questions``.
    """
    candidate_sentences = sent_tokenize(tokenized_text)

    if search_word:
        # Narrow the pool to the matches and their surrounding context.
        candidate_sentences = find_context_sentences(
            candidate_sentences, search_word, window_size=context_window
        )

    return generate_unique_questions(
        candidate_sentences, num_questions=num_questions, max_words=max_words
    )


# Generate questions for the book selected earlier (`book_title`).
if book_title not in books_df['Title'].values:
    # Unknown title: nothing to look up.
    print(f"Book title '{book_title}' not found in the dataset.")
else:
    # All 'Tokenized Text' entries whose title matches.
    tokenized_text = books_df.loc[books_df['Title'] == book_title, 'Tokenized Text'].values

    if len(tokenized_text) == 0:
        print(f"No tokenized text found for the book '{book_title}'.")
    else:
        tokenized_text = tokenized_text[0]  # Use the first matching text

        # Ask which word the questions should be built around.
        search_word = input("Enter a word to find related questions: ")

        # Five questions, each capped at 10 tokens, with one sentence of
        # context on either side of every match.
        generated_questions = generate_questions_v2(
            tokenized_text,
            search_word=search_word,
            num_questions=5,
            max_words=10,
            context_window=1,
        )

        # Show the numbered questions.
        print("\nGenerated Questions:")
        for i, question in enumerate(generated_questions, 1):
            print(f"Q{i}: {question}")


NameError: name 'book_title' is not defined