Packages required

In [None]:
!pip install openai
!pip install faiss-cpu
!pip install PyPDF2
!pip install ipywidgets
!pip install sentence-transformers
!pip install pdfminer.six
!pip install Torch
!pip install openai==0.28

Import Libraries

In [9]:
import html
import io
import re
import warnings

import ipywidgets as widgets
import torch
from IPython.display import display, HTML
from pdfminer.high_level import extract_text
from transformers import BertTokenizer, BertForQuestionAnswering

The `DocumentProcessor` class provides the methods used to upload a document, clean its text, process a question, and generate an answer.

In [10]:

warnings.filterwarnings("ignore", category=UserWarning, module='transformers.tokenization_utils_base')

class DocumentProcessor:
    """Extractive question answering over an uploaded PDF or TXT document.

    The uploaded file is converted to plain text, cleaned, and split into
    word-based chunks that fit the model's input window. Each question is
    then run against every chunk with a SQuAD-fine-tuned BERT model and the
    non-empty answer spans are joined into a single string.
    """

    # Checkpoint shared by the tokenizer and the question-answering model.
    MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    # Maximum sequence length handed to the tokenizer when building model input.
    MAX_TOKENS = 512
    # Tokens held back from every chunk so that the question and the separator
    # tokens fit into MAX_TOKENS alongside the chunk in generate_answer().
    QUESTION_TOKEN_BUDGET = 64

    def __init__(self):
        """Load the tokenizer and model; no document is loaded yet."""
        self.chunks = None
        self.tokenizer = BertTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = BertForQuestionAnswering.from_pretrained(self.MODEL_NAME)

    @staticmethod
    def _iter_uploads(new_value):
        """Yield ``(filename, content_bytes)`` pairs from a FileUpload value.

        Accepts both payload shapes: a dict mapping filename to a dict with a
        ``'content'`` entry, and a sequence of dicts carrying ``'name'`` and
        ``'content'`` entries. Content is normalised to ``bytes`` so that a
        ``memoryview`` payload can be decoded as well.
        """
        if isinstance(new_value, dict):
            for name, file_info in new_value.items():
                yield name, bytes(file_info['content'])
        else:
            for file_info in new_value:
                yield file_info['name'], bytes(file_info['content'])

    @staticmethod
    def _report(filename, message):
        """Display a status paragraph for ``filename``.

        The filename comes from the user's machine, so it is HTML-escaped
        before being embedded in markup. ``message`` is trusted markup.
        """
        display(HTML(f"<p>File name: {html.escape(filename)} <br> {message}</p>"))

    def handle_upload(self, change):
        """Observer callback for the upload widget's ``value`` trait.

        For every uploaded file: extract its text, clean it, split it into
        chunks stored on ``self.chunks``, and enable the question widgets.
        Unsupported or empty files are reported and leave any previously
        loaded document untouched.

        Args:
            change: Trait-change dict; ``change['new']`` holds the uploads.
        """
        for uploaded_filename, content in self._iter_uploads(change['new']):
            lowered_name = uploaded_filename.lower()  # accept .PDF / .TXT too

            if lowered_name.endswith('.pdf'):
                text = self.extract_text_from_pdf(content)
            elif lowered_name.endswith('.txt'):
                text = self.extract_text_from_txt(content)
            else:
                # Do not index the error message as if it were the document.
                self._report(
                    uploaded_filename,
                    "Status: not processed<br>Unsupported file format. Please upload a PDF or TXT file.",
                )
                continue

            chunks = self.split_into_chunks(self.clean_text(text))
            if not chunks:
                self._report(
                    uploaded_filename,
                    "Status: not processed<br>No text could be extracted from this file.",
                )
                continue

            self.chunks = chunks
            self.enable_question_input()
            self._report(
                uploaded_filename,
                "Status: processed<br>Type your queries in the question box and wait for the answer",
            )

    #Enable question input and submit button (The user cannot type the question until the file is processed)
    def enable_question_input(self):
        """Enable the module-level ``question_input`` and ``submit_button`` widgets.

        Both names are looked up as globals at call time, so they must exist
        before the first upload is handled.
        """
        question_input.disabled = False
        submit_button.disabled = False

    def extract_text_from_pdf(self, content):
        """Return the text of a PDF given its raw bytes."""
        return extract_text(io.BytesIO(content))

    def extract_text_from_txt(self, content):
        """Decode raw file bytes as UTF-8.

        Undecodable bytes become U+FFFD instead of raising, so a file in a
        different encoding still loads (with some characters replaced).
        """
        return bytes(content).decode('utf-8', errors='replace')

    #text preprocessing
    def clean_text(self, text):
        """Collapse whitespace runs to a single space and drop punctuation.

        Every character that is neither a word character nor whitespace is
        removed; this includes decimal points and hyphens.
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text

    def _count_tokens(self, text):
        """Return how many token ids the tokenizer produces for ``text`` (untruncated)."""
        return len(self.tokenizer(text)['input_ids'])

    def split_into_chunks(self, text, chunk_size=500, max_tokens=None):
        """Split ``text`` into word-based chunks that fit the token limit.

        The text is first cut into groups of ``chunk_size`` words. A group
        whose untruncated token count exceeds ``max_tokens`` is halved on a
        word boundary until every piece fits, so no text is silently lost to
        truncation later. Document order is preserved.

        Args:
            text: Cleaned document text.
            chunk_size: Maximum number of words per initial group.
            max_tokens: Token limit per chunk. Defaults to
                ``MAX_TOKENS - QUESTION_TOKEN_BUDGET``.

        Returns:
            List of chunk strings; empty when ``text`` contains no words.
        """
        if max_tokens is None:
            max_tokens = self.MAX_TOKENS - self.QUESTION_TOKEN_BUDGET

        words = text.split()
        chunks = []
        for start in range(0, len(words), chunk_size):
            # Stack of word lists still to be checked, leftmost piece on top.
            pending = [words[start:start + chunk_size]]
            while pending:
                piece = pending.pop()
                candidate = ' '.join(piece)
                # A single word cannot be split further on word boundaries.
                if len(piece) <= 1 or self._count_tokens(candidate) <= max_tokens:
                    chunks.append(candidate)
                else:
                    middle = len(piece) // 2
                    # Push the right half first so the left half is handled next.
                    pending.append(piece[middle:])
                    pending.append(piece[:middle])
        return chunks

    #Process Question
    def process_question(self, question):
        """Return the chunks to search for ``question``.

        No relevance ranking is performed: every stored chunk is returned
        (as a new list). Returns an empty list when no document is loaded.
        """
        if not self.chunks:
            return []
        return list(self.chunks)

    #Generate answers utilising BERT (Hugging Face)
    def generate_answer(self, question, relevant_chunks):
        """Run the QA model on each chunk and join the extracted answers.

        Args:
            question: The user's question.
            relevant_chunks: Iterable of chunk strings to search.

        Returns:
            The non-empty answer spans joined by single spaces, or an empty
            string when no chunk yields an answer.
        """
        answers = []
        for chunk in relevant_chunks:
            inputs = self.tokenizer(
                question,
                chunk,
                return_tensors='pt',
                add_special_tokens=True,
                truncation='only_second',  # Truncate only the chunk text
                max_length=self.MAX_TOKENS
            )
            with torch.no_grad():
                outputs = self.model(**inputs)

            start_index = int(torch.argmax(outputs.start_logits))
            end_index = int(torch.argmax(outputs.end_logits)) + 1
            if end_index <= start_index:
                # The predicted end precedes the start: no usable span here.
                continue

            answer_tokens = inputs['input_ids'][0][start_index:end_index]
            answer = self.tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
            if answer:
                answers.append(answer)

        return " ".join(answers)

# Build the processor first; its constructor loads the tokenizer and the model.
doc_processor = DocumentProcessor()

#Creates simple UI using widgets
uploader = widgets.FileUpload(multiple=False, accept='.pdf,.txt')
# Run the processor's upload handler whenever the widget's value changes.
uploader.observe(doc_processor.handle_upload, names='value')
display(uploader)

# The question box and the submit button start disabled.
question_input = widgets.Text(
    description='Question:',
    placeholder='Type your question here',
    value='',
    layout=widgets.Layout(height='100px', width='100%'),
    disabled=True,
)

submit_button = widgets.Button(
    button_style='primary',
    description='Submit',
    disabled=True,
)

# Output area for the generated answer; created disabled.
answer_display = widgets.Textarea(
    description='Answer:',
    placeholder='The answer will appear here...',
    value='',
    layout=widgets.Layout(height='200px', width='100%'),
    disabled=True,
)

#Submit button handler
def handle_submit(_):
    """Answer the question currently typed into ``question_input``.

    Reads the module-level widgets and ``doc_processor``. The result, or a
    short hint when there is nothing to answer, is written to
    ``answer_display``.

    Args:
        _: The clicked button (unused).
    """
    question = question_input.value.strip()

    if not question:
        # Avoid a full model pass over every chunk for an empty question.
        answer_display.value = 'Please type a question first.'
        return

    if doc_processor.chunks is None:
        answer_display.value = 'Please upload a PDF or TXT file first.'
        return

    relevant_chunks = doc_processor.process_question(question)
    answer = doc_processor.generate_answer(question, relevant_chunks)
    # Tell the user explicitly when no answer span was produced.
    answer_display.value = answer if answer.strip() else 'No answer found in the document.'

# Hook the click handler up, then render the remaining widgets top to bottom.
submit_button.on_click(handle_submit)

display(question_input, submit_button, answer_display)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FileUpload(value={}, accept='.pdf,.txt', description='Upload')

Text(value='', description='Question:', disabled=True, layout=Layout(height='100px', width='100%'), placeholde…

Button(button_style='primary', description='Submit', disabled=True, style=ButtonStyle())

Textarea(value='', description='Answer:', disabled=True, layout=Layout(height='200px', width='100%'), placehol…