# Loading dataset

In [1]:
from datasets import Dataset, load_dataset, concatenate_datasets

In [2]:
dataset = load_dataset("rajpurkar/squad_v2")
dataset

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [3]:
dataset['train'][0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [4]:
dataset['validation'][0]

{'id': '56ddde6b9a695914005b9628',
 'title': 'Normans',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'In what country is Normandy located?',
 'answers': {'text': ['France', 'France', 'France', 'France'],
  'answer_start': [159, 159, 159, 159]}}

In [5]:
len(dataset["train"].filter(lambda row: row["question"] is None))

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

0

In [6]:
len(dataset["validation"].filter(lambda row: row["question"] is None))

Filter:   0%|          | 0/11873 [00:00<?, ? examples/s]

0

## Exploring Dataset

In [7]:
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# top 10 unique titles in train

train_df = pd.DataFrame(dataset['train'])

value_counts = train_df['title'].value_counts()
print(f"Total count of unique titles: {len(value_counts)}\nTop 10: {value_counts.head(10)}")

Total count of unique titles: 442
Top 10: title
Queen_Victoria             883
New_York_City              817
American_Idol              790
Beyoncé                    753
Frédéric_Chopin            697
Buddhism                   610
Pharmaceutical_industry    586
New_Haven,_Connecticut     582
Premier_League             551
Hunting                    531
Name: count, dtype: int64


In [9]:
# top 10 unique titles in validation

valid_df = pd.DataFrame(dataset['validation'])

value_counts_2 = valid_df['title'].value_counts()
print(f"Total count of unique titles: {len(value_counts_2)}\nTop 10: {value_counts_2.head(10)}")

Total count of unique titles: 35
Top 10: title
Economic_inequality                515
Rhine                              498
Warsaw                             486
Immune_system                      458
Yuan_dynasty                       445
Steam_engine                       444
Huguenot                           424
European_Union_law                 421
Computational_complexity_theory    418
Oxygen                             415
Name: count, dtype: int64


In [10]:
# any intersecting titles?
vc_train = set(value_counts.index)
vc_valid = set(value_counts_2.index)

vc_train.intersection(vc_valid)

set()

# Obtaining embeddings of all validation examples

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch
import numpy as np
import os
import time

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

## Loading model and tokenizer

In [12]:
# setting env vars
set_seed(1234)
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# setting notebook vars
model_name = 'openai-community/gpt2'
# model_name = "google/long-t5-tglobal-xl"

In [13]:
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             torch_dtype=torch.bfloat16, 
                                             output_hidden_states=True,
                                             return_dict_in_generate = True, # neccessary for output hidden states
                                             device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
# adding pad token (to gpt models only)
num_added_toks = tokenizer.add_special_tokens({"pad_token":"<pad>"})
print("We have added", num_added_toks, "tokens")
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = 0

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


We have added 1 tokens


## Generating embeddings

In [15]:
import math

In [16]:
embed_filename = "./train_embeddings_gpt2.pt"

In [17]:
def get_embeddings(texts: list[str], tokenizer, model, batch_size: int =10, log_progress: bool =True) -> torch.tensor:
    # determine number of buckets
    num_buckets = math.ceil(len(texts) / batch_size)
    if log_progress:
        print(f"Processing in batches of {batch_size}. Total number of batches: {num_buckets}")

    # split data into buckets and generate embeddings
    vectors = []
    counter = 0
    for bucket in np.array_split(texts, num_buckets):
        if len(bucket) == 0:  # Skip empty buckets
            continue
        
        tokens = tokenizer(bucket.tolist(), padding = True, truncation=True, return_tensors="pt").to('cuda')
        with torch.no_grad():
            embeddings = model(**tokens).hidden_states[-1].mean(dim=1).detach().cpu()
            # embeddings = model(**tokens).encoder_last_hidden_state.mean(dim=1).detach().cpu()
            # outputs = model.encoder(input_ids=tokens["input_ids"], attention_mask=tokens["attention_mask"])
            # embeddings = outputs.last_hidden_state.mean(dim=1).detach().cpu()
        
        vectors.append(embeddings)
        
        if log_progress and counter % 100 == 0:
            print(f"Finished processing batch #{counter}")
        
        counter += 1
    
    return torch.concatenate(vectors)

In [18]:
dataset['train']['question'][104006]

'Are there any other areas of America Venezuelans settled in?'

Proceed to next section unless you want to regenerate the embeddings

In [19]:
start_time = time.time()
train_embeddings = get_embeddings(dataset['train']['question'], tokenizer, model, 50)
print(f"Embedding {len(train_embeddings)} records took {(time.time() - start_time)} seconds")

Processing in batches of 50. Total number of batches: 2607
Finished processing batch #0
Finished processing batch #100
Finished processing batch #200
Finished processing batch #300
Finished processing batch #400
Finished processing batch #500
Finished processing batch #600
Finished processing batch #700
Finished processing batch #800
Finished processing batch #900
Finished processing batch #1000
Finished processing batch #1100
Finished processing batch #1200
Finished processing batch #1300
Finished processing batch #1400
Finished processing batch #1500
Finished processing batch #1600
Finished processing batch #1700
Finished processing batch #1800
Finished processing batch #1900
Finished processing batch #2000
Finished processing batch #2100
Finished processing batch #2200
Finished processing batch #2300
Finished processing batch #2400
Finished processing batch #2500
Finished processing batch #2600
Embedding 130319 records took 313.0769011974335 seconds


Embedding 130319 records took 313.0769011974335 seconds

In [20]:
# print(f"Length of embedding is: {len(validation_embeddings[0])}\nSample of single embedding: {validation_embeddings[0]}")
print(f"Length of embedding is: {len(train_embeddings[0])}")

Length of embedding is: 768


In [21]:
# checking if any embedding contains Nan
print(torch.isnan(train_embeddings).any())

tensor(False)


In [22]:
torch.any(torch.isnan(train_embeddings), dim=1).nonzero(as_tuple=True)[0]

tensor([], dtype=torch.int64)

In [23]:
torch.save(train_embeddings, embed_filename)

# Obtain few shot examples for each test example

In [24]:
# retrieving embeddings
train_embeddings = torch.load(embed_filename)
len(train_embeddings)

  train_embeddings = torch.load(embed_filename)


130319

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # For progress bars

In [26]:
def obtain_few_shot_examples(batch, topn=2, embed_batch_size: int=7):
    # obtain embedding for example
    batch_embeddings = get_embeddings(batch['question'], tokenizer, model, embed_batch_size, log_progress=False)
    # compute pairwise cossim with all corpus examples
    similarities = cosine_similarity(batch_embeddings.float().numpy(), train_embeddings.float().numpy())

    few_shot_topn = []
    
    # only keep topn examples
    for sim_i in range(len(similarities)):
        topn_indices = np.argsort(similarities[sim_i])[-topn:][::-1]
        few_shot_topn.append([dataset['train'][int(topn_i)] for topn_i in topn_indices])
        
    batch = batch.add_column('few_shot', few_shot_topn)

    return batch

In [27]:
def obtain_few_shot_examples_in_batches(data, batch_size: int=5, topn: int=2, log_progress: bool =True):
    # determine number of buckets
    num_buckets = math.ceil(len(data) / batch_size)
    if log_progress:
        print(f"Processing in batches of {batch_size}. Total number of batches: {num_buckets}")

    all_buckets = []

    # split data into buckets
    counter = 0
    for i in range(num_buckets):
        start = i * batch_size
        end = (i + 1) * batch_size if i < num_buckets - 1 else len(data)
        
        # Create a Dataset subset
        bucket = data.select(range(start, end))

        if len(bucket) == 0:  # Skip empty buckets
            continue
        
        all_buckets.append(obtain_few_shot_examples(bucket, topn))
        
        if log_progress and counter % 50 == 0:
            print(f"Finished processing batch #{counter}")
        
        counter += 1
    
    return concatenate_datasets(all_buckets)

In [28]:
dataset_w_fewshot = obtain_few_shot_examples_in_batches(dataset['validation'], 50, 2)
dataset_w_fewshot

Processing in batches of 50. Total number of batches: 238
Finished processing batch #0
Finished processing batch #50
Finished processing batch #100
Finished processing batch #150
Finished processing batch #200


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'few_shot'],
    num_rows: 11873
})

In [29]:
print(f"Length of test dataset is: {len(dataset_w_fewshot)}\nSample of single example: {dataset_w_fewshot[0]}")

Length of test dataset is: 11873
Sample of single example: {'id': '56ddde6b9a695914005b9628', 'title': 'Normans', 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.', 'question': 'In what country is Normandy located?', 'answers': {'text': ['France', 'France', 'France', 'France'], 'answer_start':

In [30]:
dataset_w_fewshot.to_parquet("squad_2_with_few_shot_gpt2.parquet")

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

33477879

In [31]:
dataset_w_fewshot.select(range(5000)).to_parquet("squad_2_with_few_shot_gpt2_5000.parquet")

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

14028484