In [None]:
# Mount Google Drive inside the Colab VM; Drive content becomes reachable
# under the mount point '/content/gdrive'.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# List the SQUAD project folder under the mounted Drive to confirm it is visible.
!ls /content/gdrive/'My Drive/Colab Notebooks/SQUAD'

ls: cannot access '/content/gdrive/My Drive/Colab Notebooks/SQUAD': No such file or directory


In [None]:
import numpy as np 
import pandas as pd
import json
import re
import nltk
nltk.download('punkt')
from tqdm import tqdm
from textblob import TextBlob


In [None]:
# Google Drive is mounted at '/content/gdrive' by the mount cell, so the data
# files have to be read from under that mount point ('/content/drive' is never
# mounted in a fresh runtime). The directory is kept in one place so it only
# needs to be changed once.
SQUAD_DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks/SQUAD/data'

# Raw SQuAD v1.1 files, loaded as-is into DataFrames.
dev = pd.read_json(SQUAD_DATA_DIR + '/dev-v1.1.json')
train = pd.read_json(SQUAD_DATA_DIR + '/train-v1.1.json')

print(train.shape)
print(dev.shape)

In [None]:
# Inspect the first raw record of the `data` column (the per-article entry that
# the flattening cell reads 'title' and 'paragraphs' from).
train.data [0]

In [None]:
def _flatten_squad(raw):
    """
    Flatten a raw SQuAD frame into parallel, per-question lists.

    `raw` is the DataFrame produced by `pd.read_json` on a SQuAD file; its
    first column holds one dict per article with the keys 'title' and
    'paragraphs'. Every paragraph carries a 'context' string and a list of
    'qas'; every qa has a 'question' and a list of 'answers'.

    Returns a tuple of five equally long lists:
    (contexts, questions, answers_text, answers_start, titles).
    Only the first answer of each question is kept; questions without any
    answer get None for both the answer text and the answer start.
    """
    contexts, questions, answers_text, answers_start, titles = [], [], [], [], []
    for article in raw.iloc[:, 0]:
        article_title = article['title']
        for paragraph in article['paragraphs']:
            for q_a in paragraph['qas']:
                questions.append(q_a['question'])
                if len(q_a['answers']) > 0:
                    first_answer = q_a['answers'][0]
                    answers_start.append(first_answer['answer_start'])
                    answers_text.append(first_answer['text'])
                else:
                    answers_start.append(None)
                    answers_text.append(None)
                contexts.append(paragraph['context'])
                titles.append(article_title)
    return contexts, questions, answers_text, answers_start, titles


# NOTE: `train` and `dev` must still be the raw frames loaded with
# `pd.read_json` here; the names are rebound to the flattened frames in the
# cells that follow, so this cell cannot be re-run after them.

# training data
contexts, questions, answers_text, answers_start, title = _flatten_squad(train)

# test data
(test_contexts, test_questions, test_answers_text,
 test_answers_start, test_title) = _flatten_squad(dev)

In [None]:
# Sanity check: show the context paragraph of the first flattened dev example.
test_contexts[0]

'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'

In [None]:

# Rebind `train` to the flattened view: one row per question, with its
# context paragraph, first answer (text + start offset) and article title.
train = pd.DataFrame(
    {
        "context": contexts,
        "question": questions,
        "answer_start": answers_start,
        "text": answers_text,
        'title': title,
    }
)

# Rows with any missing value (e.g. questions that had no answer) are removed.
train.dropna(inplace=True)

train.info()

In [None]:
# Rebind `dev` to the flattened view: one row per question, with its
# context paragraph, first answer (text + start offset) and article title.
dev = pd.DataFrame(
    {
        "context": test_contexts,
        "question": test_questions,
        "answer_start": test_answers_start,
        "text": test_answers_text,
        'title': test_title,
    }
)

# Rows with any missing value (e.g. questions that had no answer) are removed.
dev.dropna(inplace=True)

dev.info()

In [None]:
# Preview the first rows of the flattened training frame.
train.head()

In [None]:
# Preview the first rows of the flattened dev frame.
dev.head()

Data Preprocessing

In [None]:
def decontracted(phrase):
    """
    Expand English contractions and normalise a sentence.

    Steps, in order:
      1. the irregular contractions "won't" / "can't" are expanded
         (case-insensitively, so sentence-initial "Won't" / "Can't" work too);
      2. generic apostrophe suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm) are
         expanded, case-sensitively and without regard to word boundaries --
         note that a possessive "'s" therefore also becomes " is";
      3. literal backslash sequences \\r, \\" and \\n (backslash followed by
         the character, not the control characters) are replaced by a space;
      4. the text is lower-cased and every run of characters other than
         ASCII letters and digits is collapsed into a single space.

    Parameters
    ----------
    phrase : str
        Raw sentence or paragraph.

    Returns
    -------
    str
        The normalised text. It is not stripped, so it can start or end with
        a single space.
    """
    # specific: irregular stems that the generic "n't" rule would mangle
    # ("Won't" -> "Wo not"), hence matched regardless of case
    phrase = re.sub(r"won't", "will not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can\'t", "can not", phrase, flags=re.IGNORECASE)

    # general: order matters ("n't" has to be handled before the bare "'t")
    general_expansions = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for suffix, expansion in general_expansions:
        phrase = re.sub(suffix, expansion, phrase)

    # string operation: drop escape sequences that appear literally in the text
    for escaped in ('\\r', '\\"', '\\n'):
        phrase = phrase.replace(escaped, ' ')

    # keep only lower-case letters and digits, separated by single spaces
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase.lower())

    return phrase

In [None]:
def tokenize(sentence):
    """
    Split `sentence` into word tokens with NLTK's `word_tokenize`
    and return the resulting token list.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens


In [None]:
def answer_span(context,ans):
    """
    Locate the answer inside the context at token level.

    Both strings are tokenised with `tokenize`. The result is the inclusive
    (start, end) token indices of the first position at which the complete
    answer token sequence occurs contiguously in the context tokens.

    Parameters
    ----------
    context : str
        Text to search in.
    ans : str
        Answer text to look for.

    Returns
    -------
    tuple of (int, int)
        (start, end) token indices, with end = start + number of answer
        tokens - 1; (-1, -1) when the answer has no tokens or never occurs
        as a contiguous token run in the context.
    """
    ans_token = tokenize(ans)
    con_token = tokenize(context)
    ans_len = len(ans_token)

    if ans_len == 0:
        return -1,-1

    first_token = ans_token[0]
    # Only start positions where the whole answer still fits are candidates;
    # every candidate is verified against the full answer so that a span is
    # never reported for a mere first-token match.
    for start in range(len(con_token) - ans_len + 1):
        if con_token[start] == first_token and con_token[start:start+ans_len] == ans_token:
            return start, start + ans_len - 1

    return -1,-1

Context

In [None]:
# Training data: normalise every context paragraph with `decontracted` and
# trim surrounding whitespace (tqdm renders the progress bar).
preprocessed_context = [
    decontracted(paragraph).strip()
    for paragraph in tqdm(train["context"].values)
]
train["clean_context"] = preprocessed_context

# Dev data: identical normalisation.
dev_preprocessed_context = [
    decontracted(paragraph).strip()
    for paragraph in tqdm(dev["context"].values)
]
dev["clean_context"] = dev_preprocessed_context

100%|██████████| 87599/87599 [00:04<00:00, 17605.66it/s]
100%|██████████| 10570/10570 [00:00<00:00, 17200.47it/s]


Questions

In [None]:
# Training data: normalise every question with `decontracted` and trim
# surrounding whitespace (tqdm renders the progress bar).
preprocessed_question = [
    decontracted(question).strip()
    for question in tqdm(train["question"].values)
]
train["clean_question"] = preprocessed_question

# Dev data: identical normalisation.
dev_preprocessed_question = [
    decontracted(question).strip()
    for question in tqdm(dev["question"].values)
]
dev["clean_question"] = dev_preprocessed_question

100%|██████████| 87599/87599 [00:01<00:00, 74129.63it/s]
100%|██████████| 10570/10570 [00:00<00:00, 68576.04it/s]


Answer

In [None]:
# Training data: normalise every answer text with `decontracted` and trim
# surrounding whitespace (tqdm renders the progress bar).
preprocessed_answer = [
    decontracted(answer).strip()
    for answer in tqdm(train["text"].values)
]
train["clean_answer"] = preprocessed_answer

# Dev data: identical normalisation.
dev_preprocessed_answer = [
    decontracted(answer).strip()
    for answer in tqdm(dev["text"].values)
]
dev["clean_answer"] = dev_preprocessed_answer

100%|██████████| 87599/87599 [00:00<00:00, 88700.87it/s]
100%|██████████| 10570/10570 [00:00<00:00, 89173.38it/s]


In [None]:
# Preview the training frame, now including the clean_context,
# clean_question and clean_answer columns.
train.head()

In [None]:
# Preview the dev frame, now including the clean_context,
# clean_question and clean_answer columns.
dev.head()