In [1]:
!pip install transformers sentence-transformers PyPDF2




In [None]:
import PyPDF2
import logging
import os
import nltk
import torch
import numpy as np
import random
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')

logging.basicConfig(filename="/content/support_bot_log.txt", level=logging.INFO,
                    format="%(asctime)s - %(levelname)s - %(message)s", force=True)

logging.info("Logging initialized successfully.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
class SupportBotAgent:
    def __init__(self, document_path):
        """Initialize the bot, load the document, process it, and set up NLP models."""
        logging.info("Initializing SupportBotAgent...")
        self.document_path = document_path
        self.document_text = self.load_document()

        logging.info("Loading sentence embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        logging.info("Loading question-answering model google/flan-t5-base")
        self.nlp_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", tokenizer="google/flan-t5-base")

        logging.info("Processing document into structured text chunks...")
        self.paragraphs = self.process_document(self.document_text)
        logging.info(f"Document processed into {len(self.paragraphs)} chunks.")

        logging.info("Computing TF-IDF vectors for keyword-based retrieval...")
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.paragraphs)

        logging.info("Computing sentence embeddings for semantic search...")
        self.embeddings = self.embedding_model.encode(self.paragraphs, convert_to_tensor=True)

        logging.info("SupportBotAgent successfully initialized.")

    def load_document(self):
        """Load and preprocess the document dynamically based on its format."""
        logging.info(f"Attempting to load document: {self.document_path}")
        file_extension = os.path.splitext(self.document_path)[1].lower()

        try:
            if file_extension == ".pdf":
                with open(self.document_path, "rb") as file:
                    reader = PyPDF2.PdfReader(file)
                    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
            elif file_extension == ".txt":
                with open(self.document_path, "r", encoding="utf-8") as file:
                    text = file.read()
            else:
                raise ValueError("Unsupported file format. Use PDF or TXT.")

            logging.info("Document successfully loaded and processed.")
            return text

        except Exception as e:
            logging.error(f"Error loading document: {e}")
            raise e

    def process_document(self, text):
        """Chunk the document based on semantic similarity and structure."""
        logging.info("Starting document chunking process...")
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        max_chunk_length = 400
        sentence_embeddings = self.embedding_model.encode(sentences, convert_to_tensor=True)

        for i, sentence in enumerate(sentences):
            if len(" ".join(current_chunk)) + len(sentence) < max_chunk_length:
                if current_chunk:
                    sim_score = util.pytorch_cos_sim(sentence_embeddings[i - 1], sentence_embeddings[i])[0].item()
                    if sim_score < 0.3:
                        chunks.append(" ".join(current_chunk))
                        current_chunk = [sentence]
                    else:
                        current_chunk.append(sentence)
                else:
                    current_chunk.append(sentence)
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        logging.info(f"Document successfully chunked into {len(chunks)} sections.")
        return chunks

    def retrieve_relevant_section(self, query, top_n=3):
        """Retrieve the top relevant sections using a hybrid approach (TF-IDF + embeddings)."""
        logging.info(f"Retrieving relevant section for query: {query}")
        query_tfidf = self.tfidf_vectorizer.transform([query])
        tfidf_scores = np.dot(self.tfidf_matrix, query_tfidf.T).toarray().flatten()
        top_indices = np.argsort(tfidf_scores)[-top_n:][::-1]

        query_embedding = self.embedding_model.encode(query, convert_to_tensor=True)
        best_matches = []

        for idx in top_indices:
            similarity_score = util.pytorch_cos_sim(query_embedding, self.embeddings[idx])[0].item()
            best_matches.append((self.paragraphs[idx], similarity_score))

        best_matches = sorted(best_matches, key=lambda x: x[1], reverse=True)

        if best_matches and best_matches[0][1] >= 0.5:
            logging.info(f"Top relevant section found with similarity score: {best_matches[0][1]:.2f}")
            return best_matches[0][0]
        else:
            logging.warning("No relevant section found, returning fallback response.")
            return "I'm sorry, I couldn't find relevant details. Please contact support@example.com."

    def answer_query(self, query):
        """Generate a response using the retrieved context."""
        logging.info(f"Processing query: {query}")
        iterations = 0
        refined_answer = None

        while iterations < 2:
            relevant_section = self.retrieve_relevant_section(query)
            if "I'm sorry" in relevant_section:
                logging.info(f"No relevant context found for query: {query}")
                return f"Query: {query}\n\n{relevant_section}"

            prompt = f"Based on the following context, answer the question:\n\nContext: {relevant_section}\n\nQuestion: {query}"
            logging.info("Generating response using NLP model...")
            answer_data = self.nlp_pipeline(prompt, max_length=100, truncation=True)
            extracted_answer = answer_data[0]["generated_text"]

            feedback = self.simulate_feedback()
            logging.info(f"Generated answer: {extracted_answer} | Feedback: {feedback}")

            refined_answer = extracted_answer

            if feedback == "good":
                logging.info("Answer accepted based on feedback.")
                break
            elif feedback == "too vague":
                prompt += " Provide a more detailed answer."
                logging.info("Reattempting answer generation with more detail.")
            elif feedback == "not helpful":
                prompt += " Rephrase the answer clearly."
                logging.info("Reattempting answer generation with clearer phrasing.")

            iterations += 1

        if not refined_answer:
            refined_answer = "I'm sorry, I couldn't generate a satisfactory answer."
            logging.warning("Final response could not be generated successfully.")

        logging.info(f"Final response for query: {query} | Answer: {refined_answer}")
        return f"Query: {query}\nBot: {refined_answer}\n\n"

    def simulate_feedback(self):
        """Simulate user feedback randomly."""
        feedback = random.choice(["not helpful", "too vague", "good"])
        logging.info(f"Simulated feedback: {feedback}")
        return feedback


In [23]:
if __name__ == "__main__":
    with open("/content/support_bot_log.txt", "a") as log_file:
      log_file.write("\n\n")
    logging.info("Starting SupportBotAgent...")
    bot = SupportBotAgent(document_path="/content/Assignment.txt")

    sample_queries = [
        "How do I reset my password?",
        "What’s the refund policy?",
        "How do I fly to the moon?"
    ]

    for query in sample_queries:
        bot.answer_query(query)

    logging.info("SupportBotAgent execution completed.")

Device set to use cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]