In [None]:
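# Dependencies (note: T5Tokenizer also needs the `sentencepiece` package and T5ForConditionalGeneration needs PyTorch; neither is installed by the command below)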
pip install tensorflow transformers PyPDF2

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering, T5ForConditionalGeneration, T5Tokenizer
import PyPDF2
import re

def initialize_bert_model():
    """Load the SQuAD-finetuned BERT question-answering model and tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple: a ``TFBertForQuestionAnswering``
        instance and a ``BertTokenizer``, both loaded from the same
        pretrained checkpoint name.
    """
    # Model and tokenizer must come from the same checkpoint so the
    # vocabulary matches the weights.
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    qa_model = TFBertForQuestionAnswering.from_pretrained(checkpoint)
    qa_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    return qa_model, qa_tokenizer

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, joined in page order with no
        separator inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Collect the per-page text while the file is still open.
        # `or ""` is a defensive guard: a page for which extraction yields
        # nothing must not break the join below.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join once instead of growing a string with `+=` page by page.
    return "".join(page_texts)

def chunk_text(text, max_chunk_size=512):
    """Split *text* into consecutive, non-overlapping fixed-size chunks.

    Sizes are counted in characters (string slicing), not in tokens, and
    chunk boundaries may fall in the middle of a word.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk; must be
            positive.

    Returns:
        A list of substrings in their original order. Every chunk has
        exactly ``max_chunk_size`` characters except possibly the last one.
        An empty *text* yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all text;
    # a zero step raises an unrelated-looking range() error. Fail clearly.
    if max_chunk_size <= 0:
        raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size}")
    return [
        text[start:start + max_chunk_size]
        for start in range(0, len(text), max_chunk_size)
    ]

def answer_question(document, question, model, tokenizer):
    """Extract the most likely answer span for *question* from *document*.

    The question and document are encoded together (truncated to 512
    tokens), the model is run once, and the tokens between the highest
    scoring start position and the highest scoring end position are decoded
    back into a string.

    Args:
        document: Context text to search for the answer.
        question: The question to answer.
        model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer matching *model*.

    Returns:
        The decoded answer text, or ``"Answer not found"`` when the predicted
        end position precedes the start position or the decoded span is
        empty.
    """
    inputs = tokenizer(question, document, return_tensors="tf", max_length=512, truncation=True)
    outputs = model(inputs)

    # Highest-scoring start/end token positions of the first (only) sequence.
    start_idx = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    end_idx = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0])

    # argmax always lands inside the sequence, so a range check can never
    # fail; the span is only unusable when the end comes before the start.
    if end_idx < start_idx:
        return "Answer not found"

    # Convert the tensor once instead of once per use.
    input_ids = inputs["input_ids"].numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx + 1]).strip()
    return answer if answer else "Answer not found"

def summarize_with_t5(text):
    """Summarize *text* with the pretrained ``t5-small`` model.

    The tokenizer and model are loaded on every call. The input is prefixed
    with ``"summarize: "`` and truncated to 512 tokens before generation.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    checkpoint = "t5-small"
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)

    # T5 selects its task from a textual prefix on the input.
    prompt = "summarize: " + text
    encoded = t5_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = {
        "max_length": 150,
        "min_length": 50,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = t5_model.generate(encoded["input_ids"], **generation_options)

    # Only one input sequence was supplied, so only the first output is used.
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

def main():
    """Answer a user question about a PDF and print a T5 summary of the answers.

    Steps: extract the PDF text, split it into chunks, prompt for a question,
    run the BERT question-answering model on every chunk, then summarize the
    collected answers with T5 and print the result.

    If the PDF yields no text, or no chunk produces a usable answer, a short
    message is printed and summarization is skipped.
    """
    pdf_path = "/content/fundamentals_supplychain-1-4.pdf"
    document = extract_text_from_pdf(pdf_path)
    document_chunks = chunk_text(document)
    if not document_chunks:
        # Nothing to search (e.g. no extractable text); avoid loading the
        # models and summarizing an empty string.
        print(f"No text could be extracted from {pdf_path}")
        return

    question = input("Ask a question: ")

    model, tokenizer = initialize_bert_model()
    all_answers = []
    total_chunks = len(document_chunks)

    for chunk_number, chunk in enumerate(document_chunks, start=1):
        print(f"Processing chunk {chunk_number} of {total_chunks}")
        answer = answer_question(chunk, question, model, tokenizer)
        # Keep only real answers: blank spans and the "not found" placeholder
        # would otherwise pollute the text handed to the summarizer.
        if answer.strip() and answer != "Answer not found":
            all_answers.append(answer)

    if not all_answers:
        print("Answer not found")
        return

    final_answer = '\n'.join(all_answers)
    t5_summary = summarize_with_t5(final_answer)
    print(f"T5 Summarized Answer:\n{t5_summary}")

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
