In [None]:
from datasets import load_dataset

# open the train split of the Wikipedia snippets dataset in streaming mode
wiki_data = load_dataset(
    'vblagoje/wikipedia_snippets_streamed', split='train', streaming=True
)
# shuffle the stream with a fixed seed so repeated runs see the same order
wiki_data = wiki_data.shuffle(seed=960)

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.3-py3-none-a

In [None]:
def _is_business_section(d):
    """Return True when the document's section title starts with 'Business'."""
    return d['section_title'].startswith('Business')


# keep only documents whose section_title starts with "Business"
# (the variable is still called `history` because later cells read it by that name)
history = wiki_data.filter(_is_business_section)

In [None]:
!pip install tqdm



In [None]:
from itertools import islice

from tqdm.auto import tqdm  # progress bar

total_doc_count = 500

# Pull exactly `total_doc_count` documents from the filtered stream.
# islice stops the iteration at the limit; the earlier hand-rolled counter was
# checked only after the append and therefore collected one document too many.
docs = [
    # keep only the fields we need from each streamed record
    {
        "article_title": d["article_title"],
        "section_title": d["section_title"],
        "passage_text": d["passage_text"],
    }
    for d in tqdm(islice(history, total_doc_count), total=total_doc_count)
]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
import pandas as pd

# the two title columns are dropped right after the dataframe is built from
# the extracted documents, leaving only the passage text
columns = ['article_title', 'section_title']
df = pd.DataFrame(docs).drop(columns=columns)
df.head()

Unnamed: 0,passage_text
0,Thomas Sturge the elder Business career Sturge...
1,Department of Economics. The new school now ha...
2,which is when the project was taken over by No...
3,"Design Studio. His business model considers ""h..."
4,"manufacturer to Phillip Morris, Smirnoff and F..."


In [None]:
!pip install -qU datasets pinecone-client sentence-transformers torch

In [None]:
!pip3 install pinecone-client
import pinecone

api_key = "356c9c7b-cd52-4cbb-84a8-75193e726d90"
# connect to pinecone environment
pinecone.init(
    api_key=api_key,
    environment="gcp-starter"  # find next to API key in console
)



In [None]:
index_name = "abstractive-question-answering"

# Create the index on first use; an index that already exists is reused as-is.
if index_name not in pinecone.list_indexes():
    try:
        pinecone.create_index(
            index_name,
            dimension=768,  # must equal the width of the vectors upserted later
            metric="cosine"
        )
        print(f"Index '{index_name}' created successfully.")
    except pinecone.PineconeException as e:
        print(f"Failed to create the index: {e}")
        # every later cell needs the index, so stop here instead of continuing
        raise

# Connect in both cases. Previously `index` was only assigned when the index
# already existed, so a first run failed later with a NameError.
index = pinecone.Index(index_name)
print(f"Connected to existing index: {index_name}")

Connected to existing index: abstractive-question-answering


In [None]:

import torch
from sentence_transformers import SentenceTransformer

# run on the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# fetch the sentence-embedding retriever from the huggingface model hub
retriever = SentenceTransformer(
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    device=device,
)
retriever


Downloading (…)e933c/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)cbe6ee933c/README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

Downloading (…)e6ee933c/config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)33c/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)e933c/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)933c/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)cbe6ee933c/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)6ee933c/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:

from tqdm.auto import tqdm

# passages are embedded and sent to the index 16 rows at a time
batch_size = 16
for i in tqdm(range(0, len(df), batch_size)):
    # the last batch may be shorter, so clamp its end to the dataframe length
    i_end = min(len(df), i + batch_size)
    batch = df.iloc[i:i_end]
    passages = batch["passage_text"].tolist()
    # each record is an (id, embedding, metadata) triple
    to_upsert = list(zip(
        [str(idx) for idx in range(i, i_end)],  # row position doubles as the ID
        retriever.encode(passages).tolist(),    # one embedding per passage
        batch.to_dict(orient="records"),        # the row itself as metadata
    ))
    # upsert/insert the batch into pinecone
    _ = index.upsert(vectors=to_upsert)

# show the index statistics to check the vector count
index.describe_index_stats()

  0%|          | 0/32 [00:00<?, ?it/s]

{'dimension': 768,
 'index_fullness': 0.1376,
 'namespaces': {'': {'vector_count': 13760}},
 'total_vector_count': 13760}

In [None]:
!pip install transformers



In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# tokenizer and generator are both loaded from the same huggingface checkpoint
tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')
# place the generator on the same device that was selected for the retriever
generator = generator.to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
def query_pinecone(query, top_k):
    """Embed ``query`` with the retriever and search the Pinecone index.

    Args:
        query: The question text to embed.
        top_k: How many matches to request from the index.

    Returns:
        The response of ``index.query``, with metadata included for each match.
    """
    # the retriever is given a one-element list holding the query
    query_vector = retriever.encode([query]).tolist()
    # look up the context passages closest to the query embedding
    return index.query(query_vector, top_k=top_k, include_metadata=True)

In [None]:
def format_query(query, context):
    """Build the generator prompt from a question and retrieved matches.

    Args:
        query: The question text.
        context: Iterable of matches, each exposing
            ``match['metadata']['passage_text']``.

    Returns:
        A string of the form ``"question: <query> context: <P> p1 <P> p2 ..."``.
        With no matches the context part is empty.
    """
    # prefix every passage with the <P> tag; the tag had been lost from the
    # literal, leaving passages separated by bare spaces only
    passages = [f"<P> {m['metadata']['passage_text']}" for m in context]
    # concatenate all context passages
    joined = " ".join(passages)
    # concatenate the query and the context passages
    return f"question: {query} context: {joined}"


In [None]:
# preview the first few extracted documents rather than dumping the whole list
docs[:3]

[{'article_title': 'Thomas Sturge the elder',
  'section_title': 'Business career & Reformer and philanthropist',
  'passage_text': 'Thomas Sturge the elder Business career Sturge was born into a farming family at Olveston, Gloucestershire, in 1749. He was an apprentice at Poole, Dorset, by 1766, and afterwards began work as an oil-leather dresser. He seems to have been in London by 1782, where he worked as a tallow chandler and oil merchant. By 1785 he was at Walworth and then at Newington. He is also named as a spermaceti refiner there by 1791. Reformer and philanthropist Sturge was a devout Quaker and an elder of the society in London. Like other Quakers, he took an interest in social'},
 {'article_title': 'University of Leicester',
  'section_title': 'Business & English',
  'passage_text': 'Department of Economics. The new school now has approximately 150 academic staff, 50 from Economics and 100 from Management. In 2010 the former School of Management was ranked 2nd after Oxford U

In [None]:
# ask a sample question and retrieve only the single closest passage
query = "Tell me about National Boulevard Bank"
result = query_pinecone(query=query, top_k=1)
result

{'matches': [{'id': '2469',
              'metadata': {'article_title': 'Ulster Savings Bank',
                           'passage_text': '1916, the bank had $5.135 million '
                                           'in deposits.\n'
                                           'In 1896, Alton B. Parker was named '
                                           'president of the bank and served '
                                           'until his resignation in 1904.\n'
                                           'In September 2011, Lisa Marie '
                                           'Cathie was named president and '
                                           'chief executive officer of the '
                                           'bank, succeeding Marjorie '
                                           'Rovereto.\n'
                                           'In 2015, Glenn B. Sutherland '
                                           'replaced Lisa Marie Cathie as '
                 

In [None]:

from pprint import pprint

In [None]:

# combine the question and the retrieved matches into the single input string
# the generator expects, then show it
query = format_query(query=query, context=result["matches"])
pprint(query)

('question: Tell me about National Boulevard Bank context:  1916, the bank had '
 '$5.135 million in deposits.\n'
 'In 1896, Alton B. Parker was named president of the bank and served until '
 'his resignation in 1904.\n'
 'In September 2011, Lisa Marie Cathie was named president and chief executive '
 'officer of the bank, succeeding Marjorie Rovereto.\n'
 'In 2015, Glenn B. Sutherland replaced Lisa Marie Cathie as president and '
 'chief executive officer.\n'
 'In January 2016, the bank opened a branch in Newburgh, New York.\n'
 'In June 2016, William C. Calderara was named president and chief executive '
 'officer of the bank.\n'
 'In November 2016, a lawsuit alleged that the bank discriminated against '
 'African-American borrowers by offering them less favorable')


In [None]:

def generate_answer(query):
    """Generate an answer for an already formatted query and pretty-print it.

    Args:
        query: The prompt string to feed to the generator.

    Returns:
        None. The decoded answer is printed with ``pprint``, whose return
        value is passed through.
    """
    # tokenize the query to get input_ids; truncation=True makes the
    # 1024-token cap explicit instead of relying on the implicit fallback that
    # triggers the "Truncation was not explicitly activated" warning
    inputs = tokenizer(
        [query], max_length=1024, truncation=True, return_tensors="pt"
    ).to(device)
    # use generator to predict output ids
    ids = generator.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=40)
    # decode the output ids; only one query was passed, so take the first result
    answer = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return pprint(answer)


In [None]:

# run the generator on the formatted query; generate_answer pretty-prints the answer
generate_answer(query)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


('National Boulevard Bank was founded in 1896 by Alton Parker. The bank was '
 'founded in Newburgh, New York. In 1916, the bank had $5.135 million in '
 'deposits. The')


In [None]:
# full pipeline for one question: retrieve the best passage, build the prompt,
# then print the generated answer
query = "What is game currency?"
context = query_pinecone(query=query, top_k=1)
query = format_query(query=query, context=context["matches"])
generate_answer(query)


("It's a way to buy things in the game. For example, if you want to buy a new "
 'weapon, you can buy it with real money. If you want to buy a new')


In [None]:
# same pipeline, this time giving the generator the ten closest passages
query = "who is Aban Pestonjee?"
context = query_pinecone(query=query, top_k=10)
query = format_query(query=query, context=context["matches"])
generate_answer(query)



('Aban Pestonjee is a British businessman who has been involved in a number of '
 'business ventures in India. He is the founder of the Abans Group, which is a '
 'company that sells')
