In [20]:
import requests
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

## Initial Analysis of the SQuAD Data Format

In [2]:
#importing the dataset into the notebook
!wget 'https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/train-v1.1.json'
with open("train-v1.1.json") as f:
  squad = json.load(f)

--2022-12-07 01:53:15--  https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/train-v1.1.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [text/plain]
Saving to: ‘train-v1.1.json’


2022-12-07 01:53:19 (79.6 MB/s) - ‘train-v1.1.json’ saved [30288272/30288272]



In [3]:
# Inspect which keys the first entry under 'data' exposes.
squad['data'][0].keys()

dict_keys(['title', 'paragraphs'])

As we can see, each entry in the dataset has two keys: `title` and `paragraphs`.

Each title contains many paragraphs, and each paragraph has many questions with their corresponding answers.

So there are 35 rows in this dataset, each with `title` and `paragraphs` keys (the count can be confirmed with `len(squad['data'])`).

## Preprocessing

In [4]:
import json

In [5]:
# Mount Google Drive at /content/drive; google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# The merged dataset lives on the mounted Google Drive.
# Binary mode hands raw bytes to json.load.
with open('/content/drive/MyDrive/merged_file.json', 'rb') as f:
  val_data = json.load(f)

# Report how many top-level entries sit under the 'data' key.
n_entries = len(val_data['data'])
print(f'Size of the complete dataset {n_entries}')

Size of the complete dataset 151


In [7]:
import numpy as np
import pandas as pd

In [8]:
def squad_to_df(path):
    """Flatten a SQuAD-style JSON file into one row per question.

    Parameters
    ----------
    path : str or path-like
        Location of a JSON file shaped like
        ``{"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}``
        where every ``qas`` item carries ``id``, ``question`` and ``answers``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``question``, ``context``, ``answers`` and
        ``context_id``. ``context_id`` is an integer code assigned to each
        distinct context string in order of first appearance, so questions
        that share a passage share a code.
    """
    # Binary mode lets json.load detect the UTF-8/16/32 encoding itself rather
    # than relying on the locale default; the context manager closes the handle.
    with open(path, 'rb') as f:
        file = json.load(f)

    # One row per question/answer entry, and one row per paragraph.
    qas = pd.json_normalize(file, ['data', 'paragraphs', 'qas'])
    context = pd.json_normalize(file, ['data', 'paragraphs'])

    # Repeat each paragraph's text once per question it owns so the repeated
    # array lines up row-for-row with `qas` (both follow document order).
    contexts = np.repeat(context['context'].values, context.qas.str.len())
    qas['context'] = contexts

    # Round-tripping through the index yields a fresh frame with `id` as the
    # first column and a clean 0..n-1 index.
    data = qas[['id', 'question', 'context', 'answers']].set_index('id').reset_index()
    data['context_id'] = data['context'].factorize()[0]

    return data

In [9]:
# Flatten the merged file into one row per question and preview the first rows.
data = squad_to_df('/content/drive/MyDrive/merged_file.json')
data.head()

Unnamed: 0,id,question,context,answers,context_id
0,597553,In which state is Los Angeles?,Los Angeles is a sprawling Southern California...,"[{'answer_id': 647225, 'document_id': 1202512,...",0
1,597555,What more can be told about the city?,Los Angeles is a sprawling Southern California...,"[{'answer_id': 647227, 'document_id': 1202512,...",0
2,597559,What are some iconic thing about the city?,Los Angeles is a sprawling Southern California...,"[{'answer_id': 647232, 'document_id': 1202512,...",0
3,597559,What are some iconic thing about the city?,New York City comprises 5 boroughs sitting whe...,"[{'answer_id': 647231, 'document_id': 1202511,...",1
4,598541,What could be said the geography of the city?,New York City comprises 5 boroughs sitting whe...,"[{'answer_id': 650227, 'document_id': 1202511,...",1


## Get Unique Documents

In [10]:
# Show the distinct context passages. The result is only displayed here,
# not assigned, so later cells still work on the per-question `data` frame.
data[['context']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,context
0,Los Angeles is a sprawling Southern California...
1,New York City comprises 5 boroughs sitting whe...
2,"Philadelphia, Pennsylvania’s largest city, is ..."
3,"Chicago, on Lake Michigan in Illinois, is amon..."
4,"Houston is a large metropolis in Texas, extend..."
...,...
146,"Rochester is a city on Lake Ontario, in New Yo..."
147,Des Moines is the capital city of Iowa. The go...
148,"Moreno Valley is a city in Riverside County, C..."
149,"Oxnard is a seaside city west of Los Angeles, ..."


In [38]:
# Keyword arguments forwarded to the TF-IDF vectorizer.
vectorizer_cnf = dict(
    lowercase=True,
    stop_words='english',
    analyzer='word',
    binary=True,
)

# Keyword arguments forwarded to the neighbour search.
# 'euclidean' is the alternative metric that was tried instead of 'cosine'.
nn_cnf = dict(n_neighbors=4, metric='cosine')

# Build both estimators from their configuration; neither is fitted yet.
embedding = TfidfVectorizer(**vectorizer_cnf)
nearest_neighbor = NearestNeighbors(**nn_cnf)

In [None]:
# List, in sorted order, the entries registered under the 'brute' key of
# scikit-learn's VALID_METRICS table (candidate values for nn_cnf['metric']).
import sklearn
sorted(sklearn.neighbors.VALID_METRICS['brute'])

In [39]:
# Learn the TF-IDF vocabulary and vectorize every row of data['context'].
# NOTE(review): `data` holds one row per question, so a passage shared by
# several questions is vectorized (and indexed) once per question. Confirm
# that this is intended rather than fitting on the distinct passages.
X = embedding.fit_transform(data['context'])
# NOTE(review): NearestNeighbors is unsupervised; per the scikit-learn docs the
# second argument to fit() is not used. Verify before relying on context_id here.
nearest_neighbor.fit(X, data['context_id'])

NearestNeighbors(metric='cosine', n_neighbors=4)

In [45]:
# Example question used to probe the retriever.
text = 'What are the most popular things in Houston?'

# Vectorize the question, then map the vector back to vocabulary terms
# to see which of its words the fitted vectorizer actually kept.
vector = embedding.inverse_transform(embedding.transform([text]))

vector

[array(['popular', 'houston'], dtype='<U16')]

In [48]:
# Vectorize the question and ask for the row positions of its nearest neighbours.
query_matrix = embedding.transform([text])
value = nearest_neighbor.kneighbors(query_matrix, return_distance=False)

# First query, first neighbour: look up that row's passage in `data`.
best_row = value[0][0]
selected = data['context'].iloc[best_row]
selected

'Houston is a large metropolis in Texas, extending to Galveston Bay. It’s closely linked with the Space Center Houston, the coastal visitor center at NASA’s astronaut training and flight control complex. The city’s relatively compact Downtown includes the Theater District, home to the renowned Houston Grand Opera, and the Historic District, with 19th-century architecture and upscale restaurants.'