In [None]:
import numpy as np
import csv
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
# Sentence-embedding model shared by every encode() call in this notebook.
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(MODEL_NAME)

In [None]:
def json_to_df(json_file):
    """Flatten a nested question-answering structure into one row per answer.

    Parameters
    ----------
    json_file : mapping or DataFrame
        Object whose ``"data"`` entry iterates over subjects. Each subject has
        ``"paragraphs"``; each paragraph has a ``"context"`` string and a
        ``"qas"`` list; each qa has ``"question"``, ``"is_impossible"`` and
        ``"answers"`` (a list of dicts with a ``"text"`` string).

    Returns
    -------
    pandas.DataFrame
        Columns ``question``, ``answer``, ``context`` (always present, even
        when no record qualifies). Questions of two characters or fewer and
        questions flagged ``is_impossible`` are skipped.

    Notes
    -----
    Text clean-up is driven by the last ``'}'`` in each string:

    * context: everything up to and including the last ``'}'`` is dropped;
    * answer starting with ``'!'``: same rule as the context;
    * any other answer: the last ``'}'`` and everything after it is dropped.
      An answer without any ``'}'`` is kept unchanged (the previous slice
      ``text[:rfind('}')]`` cut off its final character in that case).
    """
    records = []
    # The subject "title" field is not part of the output, so it is not read.
    for current_subject in json_file["data"]:
        for current_context in current_subject["paragraphs"]:
            context = current_context["context"]
            # rfind() returns -1 when no brace exists, so +1 keeps the full text.
            context = context[context.rfind('}') + 1:]
            for current_question in current_context["qas"]:
                question = current_question["question"]
                if len(question) <= 2:
                    continue
                if current_question["is_impossible"]:
                    continue
                for answer in current_question["answers"]:
                    answer_text = answer["text"]
                    # startswith() is safe on an empty answer, unlike [0].
                    if answer_text.startswith('!'):
                        answer_text = answer_text[answer_text.rfind('}') + 1:]
                    else:
                        cut = answer_text.rfind('}')
                        # Only truncate when a brace was actually found.
                        if cut != -1:
                            answer_text = answer_text[:cut]
                    records.append({
                        "question": question,
                        "answer": answer_text,
                        "context": context,
                    })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=["question", "answer", "context"])


# Parse both training JSON files, then stack the resulting frames row-wise.
mashqa_parts = [
    json_to_df(pd.read_json(json_path))
    for json_path in ('./datasets/train.json', './datasets/train2.json')
]
train_df, train2_df = mashqa_parts
mashqa = pd.concat(mashqa_parts, axis=0)

In [2]:
# Read the medquad table from its CSV file.
MEDQUAD_CSV = './csv_datasets/medquad.csv'
medquad = pd.read_csv(MEDQUAD_CSV)

In [3]:
# Give medquad the same question/answer/context column names that mashqa uses,
# so the two frames line up when concatenated.
medquad_column_map = {'Questions': 'question', 'Answers': 'answer', 'Focus': 'context'}
medquad = medquad.rename(columns=medquad_column_map)

In [4]:
# Stack the mashqa rows on top of the medquad rows.
combined_sources = [mashqa, medquad]
final_df = pd.concat(combined_sources, axis=0)

In [5]:
# Replace the index inherited from the source frames with a fresh 0..n-1 range,
# discarding the old labels.
final_df = final_df.reset_index(drop=True)

In [7]:
# Encode every question and every context with the shared sentence model.
# encode() is documented to take a string or a list of strings, so hand it
# plain lists. A pandas Series is indexed by label rather than by position,
# which would make the result depend on final_df having a clean 0..n-1 index.
# NOTE(review): missing (NaN) cells are passed through unchanged - confirm the
# source data has none in these two columns.
print("Start\n")
question_embeddings = model.encode(final_df['question'].tolist())
print("Finished Part 1\n")
context_embeddings = model.encode(final_df['context'].tolist())
print("Finished Part 2\n")

Start

Finished Part 1



In [9]:
# Build the cosine-distance index used for retrieval (5 neighbours per query).
# fit() replaces whatever the estimator was fitted on before, so the earlier
# back-to-back fit on question_embeddings and then context_embeddings left only
# the context embeddings searchable. Fit once, on the data actually queried.
# TODO(review): if queries are meant to match stored questions instead, fit on
# question_embeddings (or keep a second NearestNeighbors instance for them).
nn = NearestNeighbors(n_neighbors=5, metric='cosine')
nn.fit(context_embeddings)

In [10]:
# Embed the query text and look up its nearest neighbours in the fitted index.
query = "<query>"
query_embedding = model.encode([query])
# Slice instead of re-wrapping: keeps the single query as a one-row 2-D input.
distances, indices = nn.kneighbors(query_embedding[:1])

In [12]:
# indices holds one row of neighbour positions per query; there is one query.
neighbour_positions = indices[0]
answer_column = final_df['answer']
results = answer_column.iloc[neighbour_positions]
print(results)

60269    Variations in over 30 genes, occurring in diff...
60268    Vitiligo is a common disorder, affecting betwe...
60270    Vitiligo sometimes runs in families, but the i...
60271    These resources address the diagnosis or manag...
60267    Vitiligo is a condition that causes patchy los...
Name: answer, dtype: object


In [14]:
# Persist both embedding matrices in NumPy's binary .npy format.
npy_exports = {
    'question_embeddings.npy': question_embeddings,
    'context_embeddings.npy': context_embeddings,
}
for npy_path, embedding_matrix in npy_exports.items():
    np.save(npy_path, embedding_matrix)

In [16]:
# Write each embedding matrix to its own CSV file, one embedding per row.
csv_exports = (
    ('question_embeddings.csv', question_embeddings),
    ('context_embeddings.csv', context_embeddings),
)
for csv_path, embedding_matrix in csv_exports:
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='') as f:
        csv.writer(f).writerows(embedding_matrix)

In [17]:
# Export the combined question/answer/context frame to CSV, without the index column.
FINAL_DF_CSV = 'final_df.csv'
final_df.to_csv(FINAL_DF_CSV, index=False)