In [None]:
# Dependencies: install the required packages (run once per environment)
pip install tensorflow transformers PyPDF2


In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering, pipeline
import PyPDF2
import re

def initialize_bert_model(
    model_name="bert-large-uncased-whole-word-masking-finetuned-squad",
):
    """Load a pre-trained BERT question-answering model and its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the model and the tokenizer. Defaults to the SQuAD
            fine-tuned BERT-large checkpoint this script has always used.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Load both pieces from the same checkpoint name so the tokenizer's
    # vocabulary cannot drift out of sync with the model weights.
    model = TFBertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    return model, tokenizer

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction must finish while the file is still open. A page that
        # yields no text (a falsy result such as None or "") contributes an
        # empty string instead of breaking the concatenation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def chunk_text(text, max_chunk_size=512):
    """Split ``text`` into consecutive chunks of at most ``max_chunk_size`` characters.

    The size is measured in characters, not tokens, and chunks are cut at
    fixed offsets, so a word may be split across two chunks.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in original order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``max_chunk_size`` is not a positive number.
    """
    # A negative step would make range() empty and silently drop the whole
    # text, and a zero step fails with an unrelated-looking message, so
    # reject both explicitly.
    if max_chunk_size <= 0:
        raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size}")

    return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

def answer_question(document, question, model, tokenizer):
    """Extract the most likely answer span for ``question`` from ``document``.

    Args:
        document: Context text to search for the answer.
        question: The question to answer.
        model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded answer text, or ``"Answer not found"`` when the predicted
        span is inverted, starts outside the document part of the input, or
        decodes to nothing.
    """
    # Encode question and document as one pair, truncated to 512 tokens.
    inputs = tokenizer(question, document, return_tensors="tf", max_length=512, truncation=True)

    outputs = model(inputs)

    # Highest-scoring start and end token positions for the single example.
    start_idx = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    end_idx = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0])

    token_ids = inputs["input_ids"].numpy()[0].tolist()

    # argmax is always within the sequence, so a plain bounds check can never
    # fail. What does need checking is that the span lies in the document
    # part of the input, i.e. after the first separator token.
    # NOTE(review): assumes the pair is encoded as
    # [CLS] question [SEP] document [SEP] -- confirm for non-BERT tokenizers.
    try:
        context_start = token_ids.index(tokenizer.sep_token_id) + 1
    except ValueError:
        # No separator present: only skip the leading classification token.
        context_start = 1

    if start_idx < context_start or end_idx < start_idx:
        return "Answer not found"

    # Drop special tokens (e.g. a trailing [SEP]) so they do not leak into
    # the answer text.
    tokens = tokenizer.convert_ids_to_tokens(
        token_ids[start_idx:end_idx + 1], skip_special_tokens=True
    )
    answer = tokenizer.convert_tokens_to_string(tokens).strip()

    return answer if answer else "Answer not found"

def summarize_text(text):
    """Summarize ``text`` and return the summary with one sentence per line.

    Args:
        text: The text to summarize.

    Returns:
        The summary with sentences separated by newlines;
        ``"Unable to generate a summary."`` when there is nothing to
        summarize or the pipeline returns no summary; or a string starting
        with ``"Error during summarization:"`` if the pipeline raises.
    """
    # Nothing to summarize: skip loading the model altogether.
    if not text or not text.strip():
        return "Unable to generate a summary."

    try:
        summarizer = pipeline("summarization")

        # truncation=True keeps inputs longer than the model's maximum length
        # from failing; the excess is cut off instead.
        summary = summarizer(
            text,
            max_length=150,
            min_length=50,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            truncation=True,
        )

        # Check if a valid summary is obtained
        if summary and 'summary_text' in summary[0]:
            # Split after '.' or '?' followed by whitespace, except where the
            # lookbehinds match (patterns such as "e.g." or "Mr.").
            sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', summary[0]['summary_text'])

            # Join sentences with newline characters
            return '\n'.join(sentences)

        return "Unable to generate a summary."
    except Exception as e:
        # Best-effort boundary: report the failure as text rather than
        # crashing the caller.
        return f"Error during summarization: {str(e)}"


def main():
    """Answer a user question about a PDF and print a summarized answer.

    Extracts the PDF text, splits it into chunks, runs the question-answering
    model on every chunk, then summarizes the answers that were found.
    """
    # The PDF path is hard-coded; edit it to point at another document.
    pdf_path = "/content/fundamentals_supplychain-1-4.pdf"

    # Extract text from the PDF
    document = extract_text_from_pdf(pdf_path)

    # Chunk the document into smaller parts
    document_chunks = chunk_text(document)

    # Get question from user input
    question = input("Ask a question: ")

    # Initialize BERT model and tokenizer
    model, tokenizer = initialize_bert_model()

    # Accumulate answers from each chunk
    all_answers = []
    total_chunks = len(document_chunks)

    for chunk_number, chunk in enumerate(document_chunks, start=1):
        print(f"Processing chunk {chunk_number} of {total_chunks}")
        answer = answer_question(chunk, question, model, tokenizer)
        # Skip chunks that produced no usable answer so blank lines and
        # "Answer not found" placeholders do not end up in the summary input.
        if answer and answer.strip() and answer != "Answer not found":
            all_answers.append(answer)

    if not all_answers:
        print("No answer found in the document.")
        return

    # Combine answers from different chunks with new lines between them
    final_answer = '\n'.join(all_answers)

    # Summarize the final answer
    summarized_answer = summarize_text(final_answer)
    print(f"Summarized Answer:\n{summarized_answer}")


if __name__ == "__main__":
    main()
