<a href="https://colab.research.google.com/github/Siddharth-R512/Knowledge-Based-QA-using-DistilBERT/blob/main/Knowledge_Based_QA_using_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class KnowledgeBasedQASystem:
  """Extractive question answering over a caller-supplied context passage.

  The question and the context are both normalised by ``preprocess_text``
  and then fed as a pair to a DistilBERT question-answering model. The
  answer is the decoded token span between the highest-scoring start and
  end positions.
  """

  def __init__(self, model_name='distilbert-base-uncased-distilled-squad'):
    """Load the tokenizer, the QA model and the English stop-word set.

    Args:
      model_name: Identifier passed to ``from_pretrained`` for both the
        tokenizer and the model.
    """
    self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    self.model = DistilBertForQuestionAnswering.from_pretrained(model_name)
    self.stop_words = set(stopwords.words('english'))

  def preprocess_text(self, text):
    """Return ``text`` lower-cased with stop words and non-alphanumerics removed.

    The text is split with ``word_tokenize``; a token is kept only when it
    is entirely alphanumeric and its lower-cased form is not in
    ``self.stop_words``. Kept tokens are joined with single spaces.

    Args:
      text: Raw input string.

    Returns:
      The normalised string (possibly empty).
    """
    kept = []
    for token in word_tokenize(text):
      lowered = token.lower()
      if token.isalnum() and lowered not in self.stop_words:
        kept.append(lowered)
    return ' '.join(kept)

  @staticmethod
  def _select_span(start_logits, end_logits):
    """Pick the answer span from the model's start/end scores.

    Args:
      start_logits: Tensor whose first row holds one start score per token.
      end_logits: Tensor whose first row holds one end score per token.

    Returns:
      ``(start, end)`` as plain ints (inclusive indices), or ``None`` when
      the best end position lies before the best start position, i.e. the
      two independent argmaxes do not describe a valid span.
    """
    start = int(torch.argmax(start_logits[0]))
    end = int(torch.argmax(end_logits[0]))
    if end < start:
      return None
    return start, end

  def answer_question(self, question, context):
    """Answer ``question`` using only the information in ``context``.

    Args:
      question: The question text.
      context: The passage in which to look for the answer.

    Returns:
      The decoded answer string, or ``''`` when the context is empty after
      preprocessing or the model does not produce a valid span.
    """
    preprocessed_question = self.preprocess_text(question)
    preprocessed_context = self.preprocess_text(context)
    if not preprocessed_context:
      # Nothing to extract an answer from; skip the model call entirely.
      return ''

    inputs = self.tokenizer(
        preprocessed_question,
        preprocessed_context,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    # Pure inference: gradients are never used, so do not track them.
    with torch.no_grad():
      outputs = self.model(**inputs)

    span = self._select_span(outputs.start_logits, outputs.end_logits)
    if span is None:
      return ''
    start_index, end_index = span
    answer_tokens = inputs['input_ids'][0][start_index:end_index + 1]
    # Drop special tokens so a prediction landing on them yields '' rather
    # than a literal marker such as '[CLS]' or '[SEP]'.
    return self.tokenizer.decode(answer_tokens, skip_special_tokens=True)


if __name__ == "__main__":
  # Interactive session: load the model once, then repeatedly read a
  # question and a context from stdin and print the extracted answer.
  qa_system = KnowledgeBasedQASystem()
  while True:
    try:
      question = input("Enter your question: ")
      context = input("Enter the context: ")
    except (EOFError, KeyboardInterrupt):
      # End of input or Ctrl-C is the only way out of the loop; leave
      # cleanly instead of surfacing a traceback.
      print()
      break
    answer = qa_system.answer_question(question, context)
    print("Answer:", answer)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Enter your question: What is the capital of France
Enter the context: Paris is the capital city of France, located in the north-central part of the country on the Seine River
Answer: paris
Enter your question: What is the chemical symbol for water
Enter the context: Water is a chemical substance composed of two hydrogen atoms bonded to one oxygen atom. Its chemical formula is H2O
Answer: h2o
Enter your question: Who is known as the father of modern physics
Enter the context: Often referred to as the father of modern physics, Sir Isaac Newton was an English mathematician, physicist, and astronomer. He made significant contributions to the fields of mathematics, optics, and mechanics
Answer: sir isaac newton
