In [1]:
import json
import pandas as pd
import re

In [2]:
GLOVE_FILEPATH = 'glove.6B/glove.6B.50d.txt'
EMBEDDING_DIM = 50
THRESHOLD_CONTEXT_LENGTH = 256
THRESHOLD_QUESTION_LENGTH = 256

In [3]:

def squad_json_to_dataframe(input_file_path):
    # Load the JSON file
    with open(input_file_path, 'r') as f:
        squad_data = json.load(f)

    # Initialize empty lists to store rows
    contexts = []
    questions = []
    answers_text = []
    answers_start = []
    ids = []

    # Iterate through the entries in the JSON file
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                qid = qa['id']
                # Each question might have multiple answers
                # SQuAD v1.1 has one answer per question, but handling multiple answers per question is straightforward
                for answer in qa['answers']:
                    answers_text.append(answer['text'])
                    answers_start.append(answer['answer_start'])
                    questions.append(question)
                    contexts.append(context)
                    ids.append(qid)

    # Create a DataFrame
    df = pd.DataFrame({
        'id': ids,
        'context': contexts,
        'question': questions,
        'answer_text': answers_text,
        'answer_start': answers_start
    })

    return df

In [4]:
def tokenize_corpus(corpus):
    
    # Regex pattern for word tokenization, keeping punctuations separate
    pattern = r'\w+|[^\w\s]'
    
    # Find all matches of the pattern
    tokens = re.findall(pattern, corpus)
    
    return tokens

In [5]:
train_squad_df = squad_json_to_dataframe('extras/train-v1.1.json')
dev_squad_df = squad_json_to_dataframe('extras/dev-v1.1.json')

# Data Pre-processing

In [6]:
train_squad_df['context_tokens'] = train_squad_df['context'].map(tokenize_corpus)
train_squad_df['question_tokens'] = train_squad_df['question'].map(tokenize_corpus)

dev_squad_df['context_tokens'] = dev_squad_df['context'].map(tokenize_corpus)
dev_squad_df['question_tokens'] = dev_squad_df['question'].map(tokenize_corpus)

In [7]:
train_squad_df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,context_tokens,question_tokens
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,"[Architecturally, ,, the, school, has, a, Cath...","[To, whom, did, the, Virgin, Mary, allegedly, ..."
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,"[Architecturally, ,, the, school, has, a, Cath...","[What, is, in, front, of, the, Notre, Dame, Ma..."
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,"[Architecturally, ,, the, school, has, a, Cath...","[The, Basilica, of, the, Sacred, heart, at, No..."
3,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,"[Architecturally, ,, the, school, has, a, Cath...","[What, is, the, Grotto, at, Notre, Dame, ?]"
4,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,"[Architecturally, ,, the, school, has, a, Cath...","[What, sits, on, top, of, the, Main, Building,..."


In [8]:
dev_squad_df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,context_tokens,question_tokens
0,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,177,"[Super, Bowl, 50, was, an, American, football,...","[Which, NFL, team, represented, the, AFC, at, ..."
1,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,177,"[Super, Bowl, 50, was, an, American, football,...","[Which, NFL, team, represented, the, AFC, at, ..."
2,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,177,"[Super, Bowl, 50, was, an, American, football,...","[Which, NFL, team, represented, the, AFC, at, ..."
3,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,Carolina Panthers,249,"[Super, Bowl, 50, was, an, American, football,...","[Which, NFL, team, represented, the, NFC, at, ..."
4,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,Carolina Panthers,249,"[Super, Bowl, 50, was, an, American, football,...","[Which, NFL, team, represented, the, NFC, at, ..."


In [9]:
train_squad_df = train_squad_df[train_squad_df['context_tokens'].apply(lambda x: len(x) <= THRESHOLD_CONTEXT_LENGTH )]
dev_squad_df = dev_squad_df[dev_squad_df['context_tokens'].apply(lambda x: len(x) <= THRESHOLD_CONTEXT_LENGTH )]

print(f"Size of training dataset after getting rid of data with context > {THRESHOLD_CONTEXT_LENGTH}: {len(train_squad_df)}")
print(f"Size of dev dataset after getting rid of data with context > {THRESHOLD_CONTEXT_LENGTH}: {len(dev_squad_df)}")

Size of training dataset after getting rid of data with context > 256: 83385
Size of dev dataset after getting rid of data with context > 256: 32874


In [10]:
train_squad_df = train_squad_df[train_squad_df['question'].apply(lambda x: len(x) <= THRESHOLD_QUESTION_LENGTH )]
dev_squad_df = dev_squad_df[dev_squad_df['question'].apply(lambda x: len(x) <= THRESHOLD_QUESTION_LENGTH )]

print(f"Size of training dataset after getting rid of data with question length > {THRESHOLD_QUESTION_LENGTH}: {len(train_squad_df)}")
print(f"Size of dev dataset after getting rid of data with question length > {THRESHOLD_QUESTION_LENGTH}: {len(dev_squad_df)}")

Size of training dataset after getting rid of data with question length > 256: 83383
Size of dev dataset after getting rid of data with question length > 256: 32874


In [11]:
# Find the answer start based on tokens
def char_to_token_index(context, answer_start_char_index):
    # Split the context into tokens using the same regex pattern
    tokens = re.findall(r'\w+|[^\w\s]', context)
    
    current_char_index = 0
    for i, token in enumerate(tokens):
        token_length = len(token)
        # Check if the answer_start_char_index falls within the range of the current token
        if current_char_index <= answer_start_char_index < current_char_index + token_length:
            return i
        # Update the current_char_index to the next token's start index
        current_char_index += token_length
        # If there is a space after the current token, increment the index
        if context[current_char_index:current_char_index + 1] == ' ':
            current_char_index += 1
    
    return -1  

In [12]:
train_squad_df['answer_start_token_index'] = train_squad_df.apply(lambda row: char_to_token_index(row['context'], row['answer_start']), axis=1)
dev_squad_df['answer_start_token_index'] = dev_squad_df.apply(lambda row: char_to_token_index(row['context'], row['answer_start']), axis=1)

In [13]:
train_squad_df.drop(columns=['context_tokens','question_tokens'],inplace=True)
dev_squad_df.drop(columns=['context_tokens','question_tokens'],inplace=True)

In [14]:
train_squad_df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,answer_start_token_index
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,103
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,38
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,58
3,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,77
4,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,18


In [15]:
dev_squad_df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,answer_start_token_index
0,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,177,33
1,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,177,33
2,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,177,33
3,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,Carolina Panthers,249,44
4,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,Carolina Panthers,249,44


# Tokenization

In [16]:
corpus_series_train = (train_squad_df['context'] + ' ' + train_squad_df['question'] + ' ' + train_squad_df['answer_text']).str.lower()
single_corpus_train = ' '.join(corpus_series_train.tolist())
single_corpus_train = ' '.join(single_corpus_train.split())

corpus_series_dev = (dev_squad_df['context'] + ' ' + dev_squad_df['question'] + ' ' + dev_squad_df['answer_text']).str.lower()
single_corpus_dev = ' '.join(corpus_series_dev.tolist())
single_corpus_dev = ' '.join(single_corpus_dev.split())

single_corpus = single_corpus_dev+single_corpus_train

print(single_corpus[:1000])
print(f'Size of entire corpus: {len(single_corpus)}')

super bowl 50 was an american football game to determine the champion of the national football league (nfl) for the 2015 season. the american football conference (afc) champion denver broncos defeated the national football conference (nfc) champion carolina panthers 24–10 to earn their third super bowl title. the game was played on february 7, 2016, at levi's stadium in the san francisco bay area at santa clara, california. as this was the 50th super bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each super bowl game with roman numerals (under which the game would have been known as "super bowl l"), so that the logo could prominently feature the arabic numerals 50. which nfl team represented the afc at super bowl 50? denver broncos super bowl 50 was an american football game to determine the champion of the national football league (nfl) for the 2015 season. the american football confe

In [17]:
def tokenize_corpus(corpus):
    
    # Regex pattern for word tokenization, keeping punctuations separate
    pattern = r'\w+|[^\w\s]'
    
    # Find all matches of the pattern
    tokens = re.findall(pattern, corpus)
    
    return tokens

tokens = tokenize_corpus(single_corpus)

In [18]:
# Convert tokens to set.
setTokens = set(tokens)
setTokens = list(setTokens)
len(setTokens)

83297

In [19]:
# Remove all the tokens not in glove embeddings

# Store all glove words.
glove_words = []
# Iterate through file storing glove words.
f = open(GLOVE_FILEPATH, encoding="utf-8")
for line in f:
  values = line.strip().split(' ')
  word = values[0]
  glove_words.append(word)
f.close()

# Iterate over setTokens and remove tokens not in glove.
for token in setTokens:
  if token not in glove_words:
    setTokens.remove(token)

# We assign index 0 for padding token and 1 for unknown token.
setTokens[0] = '<PAD>'
setTokens[1] = '<UNK>'

print(f'Vocab size after non-glove tokens are removed and unknown and padding token are added: {len(setTokens)}')

Vocab size after non-glove tokens are removed and unknown and padding token are added: 69980


In [20]:
# Create tokentoidx and idxtotoken dictionary 
token2idx = {}
for idx,word in enumerate(setTokens):
    token2idx.update({word:idx}) 

idx2token = {v:k for k,v in token2idx.items()}

len(token2idx), len(idx2token)

(69980, 69980)

# Create Embedding Matrix

In [21]:
import torch
import torch.nn as nn
import numpy as np
import pickle

In [22]:
# Create Embedding matrix
embedding = nn.Embedding(len(token2idx),EMBEDDING_DIM)
for params in embedding.parameters():
    params.requires_grad = False

In [23]:
# Set padding and unknown embeddings.

# Set 0'th index as padding
embedding.weight[0] = torch.zeros((EMBEDDING_DIM))
# Set 1st index and unknown weight of glove
with open(GLOVE_FILEPATH) as f:
    for line in f:
        values = line.strip().split(' ')
unknown_vec = np.asarray(values[1:], dtype='float32')
unknown_vec = torch.from_numpy(unknown_vec)
embedding.weight[1] = unknown_vec

In [24]:
# Create final embedding matrix.
with open(GLOVE_FILEPATH) as f:
    count = 0
    for line in f:
        values = line.strip().split(' ')
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        vec = torch.from_numpy(vec)
        if word in token2idx:
            idx = token2idx[word]
            embedding.weight[idx] = vec
            count+=1

In [25]:
torch.save(embedding.state_dict(),f'squad-assets/squad.glove.6B.{EMBEDDING_DIM}d.pt')
# Save idxtotoken and tokentoidx
with open('squad-assets/token2idx.pkl', 'wb') as file:
    pickle.dump(token2idx, file)
with open('squad-assets/idx2token.pkl', 'wb') as file:
    pickle.dump(idx2token, file)

In [26]:
train_squad_df.to_csv('squad-assets/train_data.csv')
dev_squad_df.to_csv('squad-assets/dev_data.csv')