In [None]:
!pip install transformers
!pip install pdfplumber

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.7 MB/s[0m eta [36m0:00:0

In [None]:
## DistilBERT extractive question answering

import pdfplumber
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

# Load the DistilBERT tokenizer and model from the SAME checkpoint, so the
# vocabulary/casing used to tokenize the input is the one that ships with the
# SQuAD fine-tuned question-answering weights.
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The text of all pages, in page order, joined with newlines.
        A page that yields no text contributes an empty string, and a PDF
        with no pages gives ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Depending on the pdfplumber version, `extract_text()` can return
        # None for a page without a text layer (e.g. a scanned image);
        # `or ""` keeps that from raising TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return "\n".join(page_texts)

# Function to answer questions
def answer_question(question, text):
    """Extract the answer to ``question`` from ``text`` with the QA model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        question: The question to answer.
        text: The document text to search for the answer.

    Returns:
        str: The decoded token span between the highest-scoring start and end
        positions, or ``""`` when the predicted end precedes the predicted
        start (no valid span).
    """
    # Tokenize the question/document pair. `truncation=True` cuts the pair
    # down to the tokenizer's maximum length, so only the beginning of a long
    # document is actually seen by the model.
    inputs = tokenizer(question, text, return_tensors="pt", padding=True, truncation=True)

    # One forward pass is enough: the output object exposes both logit
    # tensors as attributes. Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    start_idx = int(torch.argmax(outputs.start_logits))
    end_idx = int(torch.argmax(outputs.end_logits))

    # The two argmaxes are independent, so the end can land before the start;
    # that is not a usable span.
    if end_idx < start_idx:
        return ""

    # Decode the (inclusive) answer span back into text.
    return tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx + 1])

# Inputs for the demo run
pdf_path = "example.pdf"  # Replace with the path to your PDF document
question = "What is the main topic discussed in this document?"

# Read the PDF text, then ask the model about it
document_text = extract_text_from_pdf(pdf_path=pdf_path)
answer = answer_question(question=question, text=document_text)

# Show the extracted answer
print("Answer:", answer)


In [None]:
## BERT question-answering pipeline

import pdfplumber
from transformers import BertTokenizer, BertForQuestionAnswering, pipeline

# One checkpoint name drives both the tokenizer and the QA model
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The text of all pages, in page order, joined with newlines.
        A page that yields no text contributes an empty string, and a PDF
        with no pages gives ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Depending on the pdfplumber version, `extract_text()` can return
        # None for a page without a text layer (e.g. a scanned image);
        # `or ""` keeps that from raising TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return "\n".join(page_texts)

# Function to answer questions
def answer_question(question, text):
    """Answer ``question`` from ``text`` via a question-answering pipeline.

    A pipeline is built on every call from the module-level ``model`` and
    ``tokenizer``; only the ``"answer"`` field of its result is returned.
    """
    reader = pipeline("question-answering", model=model, tokenizer=tokenizer)
    return reader(question=question, context=text)["answer"]

# Inputs for the demo run
pdf_path = "Rent_agreement.pdf"  # Replace with the path to your PDF document
question = "what is this document regarding"

# Read the PDF text, then ask the model about it
document_text = extract_text_from_pdf(pdf_path=pdf_path)
answer = answer_question(question=question, text=document_text)

# Show the extracted answer
print("Answer:", answer)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: complying with the terms and conditions of the agreement.


In [None]:
# Follow-up question against the same PDF (`pdf_path` comes from the cell above)
question = "what is the amount tenant has to pay every month"

# Re-read the PDF text, then ask the model about it
document_text = extract_text_from_pdf(pdf_path=pdf_path)
answer = answer_question(question=question, text=document_text)

# Show the extracted answer
print("Answer:", answer)

Answer: Rs. 13655


In [None]:
### T5 implementation

import pdfplumber
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
import os

# Never commit an access token to a notebook: read it from the environment.
# `None` (variable unset) means anonymous access, which is sufficient for
# public checkpoints.
# NOTE(review): a token was previously hard-coded in this cell; treat it as
# compromised and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

# Hub checkpoints other than the legacy canonical ones need their namespace;
# the bare id "t5-small-qa-qg" does not resolve and from_pretrained raises
# OSError.
# NOTE(review): assumed to be the intended QA/QG checkpoint - confirm.
model_name = "valhalla/t5-small-qa-qg-hl"
model = T5ForConditionalGeneration.from_pretrained(model_name, token=token)
tokenizer = T5Tokenizer.from_pretrained(model_name, token=token)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The text of all pages, in page order, joined with newlines.
        A page that yields no text contributes an empty string, and a PDF
        with no pages gives ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Depending on the pdfplumber version, `extract_text()` can return
        # None for a page without a text layer (e.g. a scanned image);
        # `or ""` keeps that from raising TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return "\n".join(page_texts)

# Function to generate an answer to a question
def generate_answer(question, text):
    """Generate an answer to ``question`` given ``text`` with the T5 model.

    Relies on the module-level ``tokenizer`` and ``model``. The prompt is
    encoded with truncation at 512 tokens and the first generated sequence
    is decoded with special tokens stripped.
    """
    prompt = f"question: {question} context: {text}"

    # Encode the prompt, then let the model generate a short answer
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    generated = model.generate(
        encoded_prompt,
        max_length=50,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )

    # Only one sequence is requested, so decode the first (and only) one
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Inputs for the demo run
pdf_path = "/content/Rent_agreement.pdf"  # Replace with the path to your PDF document
question = "What is the main topic discussed in this document?"

# Read the PDF text, then have the model generate an answer
document_text = extract_text_from_pdf(pdf_path=pdf_path)
answer = generate_answer(question=question, text=document_text)

# Show the generated answer
print("Answer:", answer)


OSError: ignored