In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
Col

**1. Hugging Face Transformers (Pipeline):**

In [2]:
from transformers import pipeline

In [3]:
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [4]:
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""

In [5]:
result = question_answerer(question="What is the context is about?",     context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'the task of extracting an answer from a text given a question', score: 0.1016, start: 34, end: 95


**2. Using PyTorch:**

In [6]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad')

In [8]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

In [9]:
inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[ 1.1810, -0.4073,  0.9986,  ..., -0.7445,  0.0380, -0.5510],
         [ 1.6172, -0.6785,  1.6932,  ..., -0.8216, -0.2387, -0.6187],
         [ 2.0840, -0.5496,  1.3313,  ..., -0.7791,  0.1698, -0.3950],
         ...,
         [ 0.2879, -0.1813,  1.2631,  ..., -0.2022,  0.4699,  0.5535],
         [ 0.6069, -0.1943,  0.7584,  ..., -0.5106, -0.4027, -0.4910],
         [ 1.0183, -0.8215,  0.9088,  ..., -0.8094,  0.8372, -0.2027]]]), hidden_states=None, attentions=None)


**3. Using TensorFlow:**

In [10]:
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf

In [11]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [12]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

In [13]:
inputs = tokenizer(question, text, return_tensors="tf")
outputs = model(**inputs)

In [14]:
answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

In [15]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'a nice puppet'

**4. From URL**

In [16]:
import requests
from bs4 import BeautifulSoup
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf

# Define the URL you want to scrape
url = "https://www.bigcommerce.com/ecommerce-answers/what-about-us-page/"  # Replace with your desired URL

# Send an HTTP GET request to the URL and fetch the content
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract text content from the HTML (modify this based on the webpage structure)
    text_content = " ".join([p.get_text() for p in soup.find_all("p")])

    # Tokenize the text using DistilBERT's tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
    inputs = tokenizer("What is the purpose of an about us page?", text_content, return_tensors="tf")

    # Load a pre-trained DistilBERT model
    model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

    # Perform inference to answer a question (modify the question as needed)
    outputs = model(**inputs)
    answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
    answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
    predict_answer_tokens = inputs.input_ids[0, answer_start_index:answer_end_index + 1]
    answer = tokenizer.decode(predict_answer_tokens)

    # Print the answer
    print(f"Answer: {answer}")

else:
    print(f"Failed to retrieve content from the URL. Status code: {response.status_code}")


All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


Answer: to inform the reader about the company and its operations
