# Install necessary packages


In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Col

# Importing Libraries

In [2]:
import torch
from transformers import (
    BertForQuestionAnswering,
    BertTokenizerFast,
)

from scipy.special import softmax

import plotly.express as px
import pandas as pd
import numpy as np

In [3]:
# Checkpoint id passed to from_pretrained; going by its name, a BERT-base uncased
# model fine-tuned on SQuAD 2.0.
model_name = 'deepset/bert-base-uncased-squad2'

# Load the question-answering model and its fast tokenizer from the same checkpoint.
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizerFast.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/302 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Context source: https://en.wikipedia.org/wiki/Giraffe

In [4]:
context = """
The giraffe is a large African hoofed mammal belonging to the genus Giraffa. It is the tallest living terrestrial animal and the largest ruminant on Earth. Traditionally, giraffes were thought to be one species, Giraffa camelopardalis, with nine subspecies. Most recently, researchers proposed dividing them into up to eight extant species due to new research into their mitochondrial and nuclear DNA, as well as morphological measurements. Seven other extinct species of Giraffa are known from the fossil record.
"""

In [5]:
question = "How many giraffe species are there?"

## Tokenization

In [6]:
# Encode the (question, context) pair as PyTorch tensors; this is what the model consumes.
inputs = tokenizer(question, context, return_tensors="pt")
# Display how the context alone is split into tokens (inspection only).
tokenizer.tokenize(context)

['the',
 'gi',
 '##raf',
 '##fe',
 'is',
 'a',
 'large',
 'african',
 'ho',
 '##of',
 '##ed',
 'mammal',
 'belonging',
 'to',
 'the',
 'genus',
 'gi',
 '##raf',
 '##fa',
 '.',
 'it',
 'is',
 'the',
 'tallest',
 'living',
 'terrestrial',
 'animal',
 'and',
 'the',
 'largest',
 'rum',
 '##ina',
 '##nt',
 'on',
 'earth',
 '.',
 'traditionally',
 ',',
 'gi',
 '##raf',
 '##fe',
 '##s',
 'were',
 'thought',
 'to',
 'be',
 'one',
 'species',
 ',',
 'gi',
 '##raf',
 '##fa',
 'camel',
 '##opa',
 '##rda',
 '##lis',
 ',',
 'with',
 'nine',
 'subspecies',
 '.',
 'most',
 'recently',
 ',',
 'researchers',
 'proposed',
 'dividing',
 'them',
 'into',
 'up',
 'to',
 'eight',
 'extant',
 'species',
 'due',
 'to',
 'new',
 'research',
 'into',
 'their',
 'mitochondrial',
 'and',
 'nuclear',
 'dna',
 ',',
 'as',
 'well',
 'as',
 'morphological',
 'measurements',
 '.',
 'seven',
 'other',
 'extinct',
 'species',
 'of',
 'gi',
 '##raf',
 '##fa',
 'are',
 'known',
 'from',
 'the',
 'fossil',
 'record',
 '.'

In [7]:
# Forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
  outputs = model(**inputs)

# Display the raw model output; it carries the start_logits and end_logits used below.
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 8.6147, -3.8342, -5.3155, -4.9616, -5.7517, -6.3353, -5.9955, -5.6306,
         -5.9607, -6.5582, -1.0037, -0.8722,  0.3918, -3.3915, -2.7076, -4.1584,
         -3.0436, -1.0836,  0.0858, -1.6589, -3.5880, -4.5092, -1.2625, -3.1903,
         -4.1797, -1.9904, -1.9859, -0.5005, -4.0035, -2.9158, -2.6802, -2.7219,
         -4.4752, -2.6741, -1.5990, -2.5716, -2.2904, -3.7346, -4.7141, -3.0863,
         -2.1613, -2.4703, -4.3582, -4.9624, -4.9226, -2.6644, -2.6418,  0.8500,
         -2.4713,  1.1136, -3.2708, -3.7608, -3.3522, -2.2513,  0.1003, -2.6170,
         -2.4569,  3.2704, -1.7601, -3.2791, -0.2696, -3.8203, -3.4298, -1.2619,
         -3.5974, -3.7142, -2.8176, -2.0486,  0.6114,  5.7514, -0.3363, -1.3607,
         -2.5663, -3.8741, -4.3766, -1.5292, -2.3659, -0.6962, -2.6823, -1.7130,
          3.5878, -1.3226,  5.0001, -2.5506, -1.6767, -4.3303, -4.9587, -3.3776,
         -2.9455, -4.8823, -3.7814, -1.6568, -4.9745, -2

In [8]:
# Turn the logits into probabilities over token positions.
# axis=-1 makes the per-sequence normalisation explicit: scipy's default (axis=None)
# normalises over the whole array, which only matches here because the batch holds
# a single sequence.
start_scores = softmax(outputs.start_logits, axis=-1)[0]
end_scores = softmax(outputs.end_logits, axis=-1)[0]
start_scores, end_scores

(array([8.98658931e-01, 3.52455049e-06, 8.01266424e-07, 1.14140721e-06,
        5.17987644e-07, 2.88986229e-07, 4.05934685e-07, 5.84654117e-07,
        4.20314905e-07, 2.31241444e-07, 5.97519611e-05, 6.81484817e-05,
        2.41223330e-04, 5.48737717e-06, 1.08737459e-05, 2.54855468e-06,
        7.77055266e-06, 5.51613775e-05, 1.77639929e-04, 3.10331889e-05,
        4.50805510e-06, 1.79451297e-06, 4.61276650e-05, 6.71027192e-06,
        2.49491791e-06, 2.22768431e-05, 2.23761708e-05, 9.88294778e-05,
        2.97547649e-06, 8.82958466e-06, 1.11755817e-05, 1.07192209e-05,
        1.85645422e-06, 1.12439902e-05, 3.29460017e-05, 1.24576354e-05,
        1.65029778e-05, 3.89369825e-06, 1.46204707e-06, 7.44584850e-06,
        1.87772039e-05, 1.37855595e-05, 2.08692700e-06, 1.14057809e-06,
        1.18683488e-06, 1.13529359e-05, 1.16125248e-05, 3.81409802e-04,
        1.37710131e-05, 4.96477587e-04, 6.19126104e-06, 3.79276116e-06,
        5.70721568e-06, 1.71610209e-05, 1.80217918e-04, 1.190482

In [9]:
# Most probable start and end token positions, chosen independently of each other.
start_idx = np.argmax(start_scores)
end_idx = np.argmax(end_scores)
start_idx, end_idx

(0, 0)

In [10]:
# Confidence heuristic: mean of the selected start and end probabilities.
confidence_score = (start_scores[start_idx] + end_scores[end_idx]) / 2
confidence_score

0.8802082538604736

In [11]:
# Token ids of the predicted span (end_idx inclusive) taken from the first sequence in the batch.
answer_ids = inputs.input_ids[0][start_idx: end_idx + 1]
answer_ids

tensor([101])

In [12]:
# Map the span's ids back to their token strings.
answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
answer_tokens

['[CLS]']

In [13]:
# Join the span's tokens back into a single string.
answer = tokenizer.convert_tokens_to_string(answer_tokens)
answer

'[CLS]'

In [14]:
# An answer equal to the CLS token is reported as None; the score is printed either way.
shown_answer = None if answer == tokenizer.cls_token else answer
print(shown_answer, confidence_score)

None 0.8802082538604736


In [15]:
# Long-format table: one row per (token position, score type) so the bars can be grouped.
n_start, n_end = len(start_scores), len(end_scores)
scores_df = pd.DataFrame({
    "Token Position": 2 * list(range(n_start)),
    "Score": [*start_scores, *end_scores],
    "Score Type": n_start * ["Start"] + n_end * ["End"],
})

# Keep the figure as the cell's last expression so it is rendered.
px.bar(
    scores_df,
    x="Token Position",
    y="Score",
    color="Score Type",
    barmode="group",
    title="Start and End Scores for Tokens",
)

## Functions

**The problem with BERT is that it cannot take long contexts: its input is limited to 512 tokens (question and context combined), so anything beyond that is truncated.**


In [16]:
def predict_answer(context, question):
  """Extract the answer to `question` from `context` with the loaded QA model.

  The (question, context) pair is tokenized and truncated to 512 tokens, run
  through the model without gradients, and the most probable start and end
  token positions are picked independently of each other.

  Args:
    context: Passage in which to look for the answer.
    question: Question to answer.

  Returns:
    A tuple (answer, confidence_score). `answer` is the decoded span, or None
    when the predicted span is not a usable answer: the end precedes the
    start, or the span touches a token outside the context (the [CLS]
    position, the question, or a [SEP]). `confidence_score` is the mean of the
    selected start and end probabilities and is returned in both cases.
  """
  inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
  with torch.no_grad():
    outputs = model(**inputs)

  # axis=-1 normalises per sequence; the default (axis=None) would normalise
  # over the whole array.
  start_scores = softmax(outputs.start_logits, axis=-1)[0]
  end_scores = softmax(outputs.end_logits, axis=-1)[0]
  start_idx = int(np.argmax(start_scores))
  end_idx = int(np.argmax(end_scores))

  confidence_score = (start_scores[start_idx] + end_scores[end_idx]) / 2

  # sequence_ids() labels each token: None for special tokens, 0 for the
  # question, 1 for the context. The slice is empty when end_idx < start_idx.
  span_sequence_ids = inputs.sequence_ids(0)[start_idx: end_idx + 1]
  if not span_sequence_ids or any(seq_id != 1 for seq_id in span_sequence_ids):
    return None, confidence_score

  answer_ids = inputs.input_ids[0][start_idx: end_idx + 1]
  answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
  answer = tokenizer.convert_tokens_to_string(answer_tokens)
  # Normalise an empty decode to None so callers get a single "no answer" value.
  return (answer or None), confidence_score

245

In [52]:
context = """Coffee is a beverage prepared from roasted coffee beans. Darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content. It has the highest sales in the world market for hot drinks.[2]
The seeds of the Coffea plant's fruits are separated to produce unroasted green coffee beans. The beans are roasted and then ground into fine particles typically steeped in hot water before being filtered out, producing a cup of coffee. It is usually served hot, although chilled or iced coffee is common. Coffee can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte, or already-brewed canned coffee). Sugar, sugar substitutes, milk, and cream are often added to mask the bitter taste or enhance the flavor.
Though coffee is now a global commodity, it has a long history tied closely to food traditions around the Red Sea. The earliest credible evidence of coffee drinking as the modern beverage appears in modern-day Yemen in southern Arabia in the middle of the 15th century in Sufi shrines, where coffee seeds were first roasted and brewed in a manner similar to how it is now prepared for drinking.[3] The coffee beans were procured by the Yemenis from the Ethiopian Highlands via coastal Somali intermediaries, and cultivated in Yemen. By the 16th century, the drink had reached the rest of the Middle East and North Africa, later spreading to Europe.
The two most commonly grown coffee bean types are C. arabica and C. robusta.[4] Coffee plants are cultivated in over 70 countries, primarily in the equatorial regions of the Americas, Southeast Asia, the Indian subcontinent, and Africa. As of 2018, Brazil was the leading grower of coffee beans, producing 35% of the world's total. Green, unroasted coffee is traded as an agricultural commodity. Despite sales of coffee reaching billions of dollars worldwide, farmers producing coffee beans disproportionately live in poverty. Critics of the coffee industry have also pointed to its negative impact on the environment and the clearing of land for coffee-growing and water use.
Meanwhile, coffee had been introduced to Brazil in 1727, although its cultivation did not gather momentum until independence in 1822.[34] After this time, massive tracts of rainforest were cleared for coffee plantations, first in the vicinity of Rio de Janeiro and later São Paulo.[35] Brazil went from having essentially no coffee exports in 1800 to being a significant regional producer in 1830, to being the largest producer in the world by 1852. In 1910–1920, Brazil exported around 70% of the world's coffee, Colombia, Guatemala, and Venezuela exported half of the remaining 30%, and Old World production accounted for less than 5% of world exports.[36]
Many countries in Central America took up cultivation in the latter half of the 19th century, and almost all were involved in the large-scale displacement and exploitation of the indigenous people. Harsh conditions led to many uprisings, coups, and bloody suppression of peasants.[37] The notable exception was Costa Rica, where lack of ready labor prevented the formation of large farms. Smaller farms and more egalitarian conditions ameliorated unrest over the 19th and 20th centuries.[38]
Rapid growth in coffee production in South America during the second half of the 19th century was matched by an increase in consumption in developed countries, though nowhere has this growth been as pronounced as in the United States, where a high rate of population growth was compounded by doubling of per capita consumption between 1860 and 1920. Though the United States was not the heaviest coffee-drinking nation at the time (Nordic countries, Belgium, and the Netherlands all had comparable or higher levels of per capita consumption), due to its sheer size, it was already the largest consumer of coffee in the world by 1860, and, by 1920, around half of all coffee produced worldwide was consumed in the US.[36]
Coffee has become a vital cash crop for many developing countries. Over one hundred million people in developing countries have become dependent on coffee as their primary source of income. It has become the primary export and backbone for African countries like Uganda, Burundi, Rwanda, and Ethiopia,[39] as well as many Central American countries."""

len(tokenizer.tokenize(context))

879

In [37]:
len(context)

4345

In [38]:
predict_answer(context, "What is coffee?")

('a beverage prepared from roasted coffee beans', 0.8857728242874146)

In [39]:
predict_answer(context, "What are the most common coffee beans")

('c. arabica and c. robusta', 0.7419992685317993)

In [40]:
predict_answer(context, "How can i make ice coffee?")

(None, 0.9987176656723022)

In [41]:
predict_answer(context, "How many people are dependent on coffee for their income?")

(None, 0.9929671883583069)

In [42]:
predict_answer(context[4000:], "How many people are dependent on coffee for their income?")

('over one hundred million', 0.8220916986465454)

In [43]:
predict_answer(context, "Which crop had highest sales")

('coffee is a beverage prepared from roasted coffee beans. darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content. it has the highest sales in the world market for hot drinks',
 0.44648605585098267)

### Chunking sentences

In [29]:
def chunk_sentences(sentences, chunk_size, stride):
  """Split `sentences` into consecutive, overlapping chunks.

  Args:
    sentences: Sequence of items (e.g. a list of strings) to split.
    chunk_size: Maximum number of items per chunk; must be positive.
    stride: Number of items shared by two consecutive chunks; must satisfy
      0 <= stride < chunk_size.

  Returns:
    A list of slices of `sentences`. Each chunk starts `chunk_size - stride`
    items after the previous one. Chunking stops at the first chunk that
    reaches the end of `sentences`, so no trailing chunk made only of items
    already present in the previous chunk is produced. Empty input gives [].

  Raises:
    ValueError: If `chunk_size` is not positive, or `stride` is negative or
      not smaller than `chunk_size` (the window would not advance, or items
      would be skipped).
  """
  if chunk_size <= 0:
    raise ValueError(f"chunk_size must be positive, got {chunk_size}")
  if not 0 <= stride < chunk_size:
    raise ValueError(
        f"stride must satisfy 0 <= stride < chunk_size, got stride={stride}, chunk_size={chunk_size}"
    )

  chunks = []
  num_sentences = len(sentences)
  step = chunk_size - stride
  for start in range(0, num_sentences, step):
    chunks.append(sentences[start: start + chunk_size])
    # This chunk already covers the tail; a further chunk would only repeat overlap.
    if start + chunk_size >= num_sentences:
      break
  return chunks

## Dummy example

In [30]:
sentences = [
    "Sentence 1.",
    "Sentence 2.",
    "Sentence 3.",
    "Sentence 4.",
    "Sentence 5.",
    "Sentence 6.",
    "Sentence 7.",
    "Sentence 8.",
    "Sentence 9.",
    "Sentence 10.",

]

chunked_sentences = chunk_sentences(sentences, chunk_size=3, stride=1)
chunked_sentences

[['Sentence 1.', 'Sentence 2.', 'Sentence 3.'],
 ['Sentence 3.', 'Sentence 4.', 'Sentence 5.'],
 ['Sentence 5.', 'Sentence 6.', 'Sentence 7.'],
 ['Sentence 7.', 'Sentence 8.', 'Sentence 9.'],
 ['Sentence 9.', 'Sentence 10.']]

In [53]:
# Split on newlines: each element is one paragraph of the context (several sentences),
# despite the variable name.
sentences = context.split('\n')
sentences, len(sentences)

(['Coffee is a beverage prepared from roasted coffee beans. Darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content. It has the highest sales in the world market for hot drinks.[2]',
  "The seeds of the Coffea plant's fruits are separated to produce unroasted green coffee beans. The beans are roasted and then ground into fine particles typically steeped in hot water before being filtered out, producing a cup of coffee. It is usually served hot, although chilled or iced coffee is common. Coffee can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte, or already-brewed canned coffee). Sugar, sugar substitutes, milk, and cream are often added to mask the bitter taste or enhance the flavor.",
  'Though coffee is now a global commodity, it has a long history tied closely to food traditions around the Red Sea. The earliest credible evidence of coffee drinking as the modern beverage app

In [55]:
chunked_sentences = chunk_sentences(sentences, chunk_size=3, stride=1)
chunked_sentences, len(chunked_sentences)

([['Coffee is a beverage prepared from roasted coffee beans. Darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content. It has the highest sales in the world market for hot drinks.[2]',
   "The seeds of the Coffea plant's fruits are separated to produce unroasted green coffee beans. The beans are roasted and then ground into fine particles typically steeped in hot water before being filtered out, producing a cup of coffee. It is usually served hot, although chilled or iced coffee is common. Coffee can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte, or already-brewed canned coffee). Sugar, sugar substitutes, milk, and cream are often added to mask the bitter taste or enhance the flavor.",
   'Though coffee is now a global commodity, it has a long history tied closely to food traditions around the Red Sea. The earliest credible evidence of coffee drinking as the modern beverage 

In [56]:
questions = [
    "What is coffee?",
    "What are the most common coffee beans?",
    "How can I make ice coffee",
    "How many people are dependent on coffee for their income?",
]

# Best (answer, score) found so far for each question, across all chunks.
# Questions for which no chunk yields an answer are left out of the dict.
answers = {}

for chunk in chunked_sentences:
  # Loop-local names on purpose: reusing `context` / `question` / `answer` here would
  # overwrite the notebook-level variables, so re-running the earlier
  # `context.split('\n')` cell would then operate on the last chunk only.
  chunk_context = "\n".join(chunk)
  for chunk_question in questions:
    chunk_answer, chunk_score = predict_answer(chunk_context, chunk_question)
    if not chunk_answer:
      continue
    # Keep only the highest-scoring answer per question.
    if chunk_question not in answers or chunk_score > answers[chunk_question][1]:
      answers[chunk_question] = (chunk_answer, chunk_score)

In [57]:
answers

{'What is coffee?': ('a beverage prepared from roasted coffee beans',
  0.8822323083877563),
 'What are the most common coffee beans?': ('c. arabica and c. robusta',
  0.9763001203536987),
 'How many people are dependent on coffee for their income?': ('over one hundred million',
  0.7991697788238525)}

In [None]:
s