In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import requests
import fitz 
import os

# Torch device that the model and the tokenized inputs are moved to; fixed to the CPU.
device = torch.device("cpu")
# Presumably the intended cap on prompt length, in tokens.
# NOTE(review): nothing in the visible code reads this name — confirm it is still needed.
max_input_tokens = 2048

class ModelManager:
    """Class-level holder for one seq2seq model and its tokenizer."""

    # Populated on the first successful `load` call and reused afterwards.
    _model = None
    _tokenizer = None

    @classmethod
    def load(cls, model_name, cache_dir="./cache", save_dir="./saved_model"):
        """
        Returns the shared model and tokenizer, loading them on first use.

        Once both objects are cached on the class, they are returned as-is and
        all arguments are ignored. Otherwise they are read from `save_dir` when
        that path exists, or fetched by `model_name` and then written to
        `save_dir`.

        Args:
            model_name (str): The name of the model to fetch when nothing is saved locally.
            cache_dir (str, optional): Directory for caching fetched files. Defaults to "./cache".
            save_dir (str, optional): Directory holding (or receiving) the saved model. Defaults to "./saved_model".

        Returns:
            tuple: The model and tokenizer.
        """
        # Fast path: both halves of the pair are already in memory.
        if cls._model is not None and cls._tokenizer is not None:
            return cls._model, cls._tokenizer

        # A previous run left a saved copy on disk: reuse it.
        if os.path.exists(save_dir):
            print("Loading model and tokenizer from saved files...")
            cls._tokenizer = AutoTokenizer.from_pretrained(save_dir)
            cls._model = AutoModelForSeq2SeqLM.from_pretrained(save_dir).to(device)
            return cls._model, cls._tokenizer

        # Nothing saved yet: fetch by name, then persist for the next run.
        print("Loading model and tokenizer from Hugging Face...")
        cls._tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        cls._model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            cache_dir=cache_dir,
            torch_dtype=torch.float32,
        ).to(device)
        cls._model.save_pretrained(save_dir)
        cls._tokenizer.save_pretrained(save_dir)
        return cls._model, cls._tokenizer

# PDFExtractor class to handle PDF text extraction
class PDFExtractor:
    """Static helpers that read text out of a PDF and tidy its whitespace."""

    @staticmethod
    def extract_text_from_pdf(pdf_source: str, timeout: float = 30.0) -> str:
        """
        Extracts text from a PDF document, either from a URL or local file.

        Args:
            pdf_source (str): The URL or local path of the PDF. Anything that
                starts with "http://" or "https://" is downloaded; everything
                else is opened as a local path.
            timeout (float, optional): Seconds to wait for the HTTP server
                before giving up. Defaults to 30.0.

        Returns:
            str: The text of all pages joined by single spaces, stripped of
            leading and trailing whitespace.

        Raises:
            Exception: If the URL does not point to a PDF, the download fails,
                or the PDF cannot be processed.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urlparse

        print("Starting to extract text from the PDF...")

        # Match the scheme explicitly so a local file such as "http_notes.pdf"
        # is not mistaken for a URL.
        is_url = pdf_source.lower().startswith(("http://", "https://"))

        # Look only at the path component, case-insensitively, so that
        # "…/file.PDF" and "…/file.pdf?raw=1" are accepted.
        if is_url and not urlparse(pdf_source).path.lower().endswith(".pdf"):
            raise Exception("Error processing the PDF: The URL does not point to a PDF file.")

        try:
            if is_url:
                # Without a timeout a stalled server would block forever.
                response = requests.get(pdf_source, timeout=timeout)
                response.raise_for_status()  # Raise an exception for HTTP errors
                doc = fitz.open(stream=response.content, filetype="pdf")
            else:
                doc = fitz.open(pdf_source)

            try:
                text = " ".join(page.get_text("text") for page in doc)
            finally:
                # Release the document even when a page fails to extract.
                doc.close()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to fetch the PDF from URL: {e}") from e
        except Exception as e:
            raise Exception(f"Error processing the PDF: {e}") from e

        print("Text extraction from PDF completed.")
        return text.strip()

    @staticmethod
    def preprocess_text(text: str) -> str:
        """
        Preprocesses the extracted text by normalizing the whitespace.

        Args:
            text (str): The raw extracted text.

        Returns:
            str: The text with every run of whitespace collapsed to a single
            space and no leading or trailing whitespace.
        """
        print("Starting text preprocessing...")
        text = " ".join(text.split())  # Normalize whitespace to single spaces
        print("Text preprocessing completed.")
        return text

# QAGenerator class to handle question-answer generation
class QAGenerator:
    """
    Answers questions about a document with a seq2seq model and tokenizer.

    The prompt places the document before the question, and the tokenizer
    truncates from the end. An over-long document is therefore shortened
    before tokenization so the question and the "Answer:" cue stay in the
    model input.
    """

    # Canonical reply stored when the document does not contain the answer.
    NOT_AVAILABLE = "Information not available in the document."
    # Tokens held in reserve when shortening the document, because the token
    # counts of separately tokenized pieces do not add up exactly.
    _TOKEN_MARGIN = 16

    def __init__(self, model, tokenizer, max_input_tokens: int = 2048):
        """
        Initializes the QAGenerator with the model and tokenizer.

        Args:
            model: The preloaded model.
            tokenizer: The preloaded tokenizer.
            max_input_tokens (int, optional): Maximum prompt length in tokens.
                Defaults to 2048.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_input_tokens = max_input_tokens

    @staticmethod
    def _build_prompt(document_text: str, question: str) -> str:
        """Fills the fixed instruction template with the document and question."""
        return f"""You are an expert assistant trained to answer questions based on real estate documents. 
            Your task is to provide the most accurate and specific answer, strictly based on the content of the provided document. 

            Please follow these rules:
            1. **Provide direct quotes** from the document. Do not paraphrase or modify the content in any way.
            2. **Avoid speculation**. Only answer based on the exact information available in the document. Do not infer or invent any details.
            3. **Maintain precision**. Ensure that numbers, facts, and all details are quoted exactly as they appear.
            4. If the information required to answer the question is missing from the document, **say "Information not available in the document"**.

            Document:
            \"\"\"{document_text}\"\"\" 

            Question:
            {question}

            Answer:
            """

    def _count_tokens(self, text: str) -> int:
        """Returns how many token ids the tokenizer produces for `text`."""
        return len(self.tokenizer(text)["input_ids"])

    def _fit_prompt(self, document_text: str, question: str) -> str:
        """Builds the prompt, shortening the document only if the prompt is too long."""
        prompt = self._build_prompt(document_text, question)
        if self._count_tokens(prompt) <= self.max_input_tokens:
            # Fits as-is: the prompt is exactly the filled-in template.
            return prompt

        # Budget for the document = limit - everything else in the prompt.
        overhead = self._count_tokens(self._build_prompt("", question))
        budget = max(self.max_input_tokens - overhead - self._TOKEN_MARGIN, 0)
        doc_ids = self.tokenizer(document_text, add_special_tokens=False)["input_ids"]
        shortened = self.tokenizer.decode(doc_ids[:budget], skip_special_tokens=True)
        return self._build_prompt(shortened, question)

    @classmethod
    def _normalize_answer(cls, answer: str) -> str:
        """Maps empty output and "not available" variants to `NOT_AVAILABLE`."""
        canonical = cls.NOT_AVAILABLE.rstrip(".").casefold()
        if not answer or answer.rstrip(". ").casefold() == canonical:
            return cls.NOT_AVAILABLE
        return answer

    def answer_question_from_document(self, document_text: str, question: str) -> str:
        """
        Generates an answer to a question based on the provided document text.

        Args:
            document_text (str): The text extracted from the document.
            question (str): The question to be answered.

        Returns:
            str: The generated answer, stripped of surrounding whitespace.
        """
        prompt = self._fit_prompt(document_text, question)

        # Truncation stays enabled as a last safety net; the document was
        # already shortened above so the question is not what gets cut.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_input_tokens,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Sampling is enabled together with beam search, so answers can differ
        # between runs for the same input.
        outputs = self.model.generate(
            **inputs,
            max_length=100,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.5,
            no_repeat_ngram_size=5,
            length_penalty=1.0,
            num_beams=5,
            early_stopping=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Only the first returned sequence is decoded.
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    def generate_qa_dictionary(self, document_text: str, questions: list) -> dict:
        """
        Generates answers for a list of questions based on the document text.

        Args:
            document_text (str): The text extracted from the document.
            questions (list): A list of questions to be answered.

        Returns:
            dict: A dictionary where keys are questions and values are answers.
            An empty answer, or any spelling of the "not available" reply, is
            stored as `NOT_AVAILABLE`. A repeated question keeps the answer
            from its last occurrence.
        """
        print("Generating answers for all questions...")
        qa_dict = {}
        for question in questions:
            answer = self.answer_question_from_document(document_text, question).strip()
            qa_dict[question] = self._normalize_answer(answer)
        print("Answers generated successfully.")
        return qa_dict


In [2]:
# Main flow
if __name__ == "__main__":
    # Script entry point: obtain the model and tokenizer, pull and clean the
    # text of one PDF, answer a fixed list of questions about it, and print
    # each question with its answer.
    print("Starting the main flow...")

    # Shared model/tokenizer pair
    model, tokenizer = ModelManager.load("google/flan-t5-large")

    # Direct link to the PDF that is queried
    github_pdf_url = "https://raw.githubusercontent.com/SAN1713911S/million-luxury-docs-qa/main/Million%20Luxury%20Document.pdf"

    # Raw page text first, then the whitespace-normalized version
    document = PDFExtractor.extract_text_from_pdf(github_pdf_url)
    document_text = PDFExtractor.preprocess_text(document)

    # Question answerer bound to the loaded model
    qa_generator = QAGenerator(model, tokenizer)

    # Questions grouped by the project they refer to; insertion order is the
    # order in which they are asked.
    questions_by_project = {
        "UNA CLUB": [
            "What is the delivery date for the South Tower of UNA CLUB?",
            "What is the delivery date for the North Tower of UNA CLUB?",
            "Who is the developer of UNA CLUB?",
            "Where is the UNA CLUB project located?",
            "What is the starting price of UNA CLUB residences?",
        ],
        "BRICKELL HOME LUXURY": [
            "What is the price range for residences in Brickell Home Luxury?",
            "How many stories does the Brickell Home Luxury building have?",
            "How many residences are there in Brickell Home Luxury?",
            "What is the size range of the residences in Brickell Home Luxury?",
            "Who is responsible for the architecture of Brickell Home Luxury?",
        ],
        "BRICKELL HOME": [
            "What is the price range for residences in Brickell Home?",
            "How many stories does the Brickell Home building have?",
            "How many residences are there in Brickell Home?",
            "What is the size range of the residences in Brickell Home?",
            "Who is the developer of Brickell Home?",
        ],
        "BAYFRONT RESIDENCES": [
            "What is the price range for residences in Bayfront Residences?",
            "How many stories does the Bayfront Residences building have?",
            "How many residences are there in Bayfront Residences?",
            "What is the size range of the residences in Bayfront Residences?",
            "What amenities are available at Bayfront Residences?",
        ],
    }
    questions = [
        item
        for project_questions in questions_by_project.values()
        for item in project_questions
    ]

    # One answer per question
    qa_dictionary = qa_generator.generate_qa_dictionary(document_text, questions)

    # Report every question/answer pair
    for q, a in qa_dictionary.items():
        print(f"Q: {q}\nA: {a}\n")

    print("Main flow completed.")

Starting the main flow...
Loading model and tokenizer from saved files...
Starting to extract text from the PDF...
Text extraction from PDF completed.
Starting text preprocessing...
Text preprocessing completed.
Generating answers for all questions...
Answers generated successfully.
Q: What is the delivery date for the South Tower of UNA CLUB?
A: 2027

Q: What is the delivery date for the North Tower of UNA CLUB?
A: 2029

Q: Who is the developer of UNA CLUB?
A: Fortune International and Château Group

Q: Where is the UNA CLUB project located?
A: Sunny Isles Beach

Q: What is the starting price of UNA CLUB residences?
A: $4,500,000

Q: What is the price range for residences in Brickell Home Luxury?
A: Starting at $8,000,000

Q: How many stories does the Brickell Home Luxury building have?
A: 25

Q: How many residences are there in Brickell Home Luxury?
A: 56

Q: What is the size range of the residences in Brickell Home Luxury?
A: 3 to 8 bedrooms

Q: Who is responsible for the architectu