In [42]:
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
#en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [43]:
train = pd.read_csv("data.csv")

In [44]:
train.shape

(363, 4)

Loading Embedding dictionary

In [45]:
with open("/content/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)

In [46]:
with open("/content/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)

In [47]:
dict_emb = dict(d1)
dict_emb.update(d2)

In [48]:
len(dict_emb)

464

In [49]:
del d1, d2

Data Processing

In [53]:
def get_target(x):
    idx = -1
    for i in range(len(x["sentences"])):
        if x["answer"] in x["sentences"][i]: idx = i
    return idx

In [50]:
train.head(3)

Unnamed: 0,text,question,answer,answer_start
0,Los Angeles is a sprawling Southern California...,In which state is Los Angeles?,California,36
1,Los Angeles is a sprawling Southern California...,What more can be told about the city?,TCL Chinese Theatre displays celebrities’ hand...,266
2,Los Angeles is a sprawling Southern California...,What are some iconic thing about the city?,"Paramount Pictures, Universal and Warner Brothers",161


In [51]:
train.dropna(inplace=True)

In [52]:
train.shape


(363, 4)

In [54]:
def process_data(train):
    
    print("step 1")
    train['sentences'] = train['text'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])
    
    print("step 2")
    train["target"] = train.apply(get_target, axis = 1)
    
    print("step 3")
    train['sent_emb'] = train['sentences'].apply(lambda x: [dict_emb[item][0] if item in\
                                                           dict_emb else np.zeros(4096) for item in x])
    print("step 4")
    train['quest_emb'] = train['question'].apply(lambda x: dict_emb[x] if x in dict_emb else np.zeros(4096) )
        
    return train   

In [55]:
train = process_data(train)

step 1
step 2
step 3
step 4


In [56]:
train.head()

Unnamed: 0,text,question,answer,answer_start,sentences,target,sent_emb,quest_emb
0,Los Angeles is a sprawling Southern California...,In which state is Los Angeles?,California,36,[Los Angeles is a sprawling Southern Californi...,0,"[[0.12785082, 0.23619145, 0.12586468, -0.00088...","[[0.06884657, 0.087404236, 0.12076794, -0.0216..."
1,Los Angeles is a sprawling Southern California...,What more can be told about the city?,TCL Chinese Theatre displays celebrities’ hand...,266,[Los Angeles is a sprawling Southern Californi...,2,"[[0.12785082, 0.23619145, 0.12586468, -0.00088...","[[0.0794738, 0.07783784, 0.073144525, 0.018132..."
2,Los Angeles is a sprawling Southern California...,What are some iconic thing about the city?,"Paramount Pictures, Universal and Warner Brothers",161,[Los Angeles is a sprawling Southern Californi...,1,"[[0.12785082, 0.23619145, 0.12586468, -0.00088...","[[0.08044811, 0.06524622, 0.121971734, 0.00339..."
3,New York City comprises 5 boroughs sitting whe...,What are some iconic thing about the city?,Empire State Building and sprawling Central Park,269,[New York City comprises 5 boroughs sitting wh...,2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.08044811, 0.06524622, 0.121971734, 0.00339..."
4,New York City comprises 5 boroughs sitting whe...,What could be said the geography of the city?,comprises 5 boroughs sitting where the Hudson ...,14,[New York City comprises 5 boroughs sitting wh...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.037272844, 0.08949425, 0.048594367, 0.0261..."


Predicted Cosine & Euclidean Index

In [57]:
def cosine_sim(x):
    li = []
    for item in x["sent_emb"]:
        li.append(spatial.distance.cosine(item,x["quest_emb"][0]))
    return li   

In [58]:
def pred_idx(distances):
    return np.argmin(distances)  

In [59]:
def predictions(train):
    
    train["cosine_sim"] = train.apply(cosine_sim, axis = 1)
    train["diff"] = (train["quest_emb"] - train["sent_emb"])**2
    train["euclidean_dis"] = train["diff"].apply(lambda x: list(np.sum(x, axis = 1)))
    del train["diff"]
    
    print("cosine start")
    
    train["pred_idx_cos"] = train["cosine_sim"].apply(lambda x: pred_idx(x))
    train["pred_idx_euc"] = train["euclidean_dis"].apply(lambda x: pred_idx(x))
    
    return train

In [60]:
predicted = predictions(train)

cosine start


In [61]:
predicted.head(3)

Unnamed: 0,text,question,answer,answer_start,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,Los Angeles is a sprawling Southern California...,In which state is Los Angeles?,California,36,[Los Angeles is a sprawling Southern Californi...,0,"[[0.12785082, 0.23619145, 0.12586468, -0.00088...","[[0.06884657, 0.087404236, 0.12076794, -0.0216...","[0.35271120071411133, 0.4922119379043579, 0]","[14.929879565052445, 19.573356675737674, 12.83...",2,2
1,Los Angeles is a sprawling Southern California...,What more can be told about the city?,TCL Chinese Theatre displays celebrities’ hand...,266,[Los Angeles is a sprawling Southern Californi...,2,"[[0.12785082, 0.23619145, 0.12586468, -0.00088...","[[0.0794738, 0.07783784, 0.073144525, 0.018132...","[0.48899638652801514, 0.5336130857467651, 0]","[19.393929070613257, 20.445223101884604, 10.96...",2,2
2,Los Angeles is a sprawling Southern California...,What are some iconic thing about the city?,"Paramount Pictures, Universal and Warner Brothers",161,[Los Angeles is a sprawling Southern Californi...,1,"[[0.12785082, 0.23619145, 0.12586468, -0.00088...","[[0.08044811, 0.06524622, 0.121971734, 0.00339...","[0.40747690200805664, 0.4056297540664673, 0]","[16.765251823931877, 16.32315254212644, 11.834...",2,2


In [62]:
predicted["cosine_sim"][0]

[0.35271120071411133, 0.4922119379043579, 0]

In [63]:
predicted["euclidean_dis"][0]

[14.929879565052445, 19.573356675737674, 12.833425665780116]

Accuracy

In [64]:
def accuracy(target, predicted):
    
    acc = (target==predicted).sum()/len(target)
    
    return acc

Accuracy for euclidean Distance

In [65]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.3278236914600551


Accuracy for Cosine Similarity

In [66]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.3774104683195592
