In [1]:
# Rule-based answer extractors from the project-local `utils` module (not shown here).
from utils import alsoknownas, birthdate, deathday, nameofperson
import numpy as np, pandas as pd
# NOTE(review): json, ast, nltk, torch, Tree and the Tfidf* classes are not referenced
# by any visible cell — confirm before pruning.
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
# Silences every warning for the rest of the session, including deprecation notices.
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
# spaCy pipeline 'en_core_web_sm'; `en_nlp` is not used by the visible cells.
en_nlp = spacy.load('en_core_web_sm')
from nltk.stem.lancaster import LancasterStemmer
# Lancaster stemmer instance; `st` is not used by the visible cells.
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [2]:
#Load the training & the querying dataset
# `train` holds one row per article; the displayed columns are date, title, category,
# link, abstract and paragraphs, plus an "Unnamed: 0" column read from the CSV.
train = pd.read_csv("trainingDataset.csv")
# `question` provides the 'question' text and the gold 'Actual Abstract' used for evaluation.
question = pd.read_csv("questions.csv", encoding='utf8')

In [3]:
train.shape

(1605, 6)

### Loading Embedding dictionary

In [4]:
#Load the dictionary d1
# First part of the precomputed embedding lookup; later cells look it up by raw
# sentence / question text.
# SECURITY: pickle.load can execute arbitrary code — only load files from a trusted source.
with open("data/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)

In [5]:
#Load the dictionary d2
# Second part of the precomputed embedding lookup; later cells look it up by raw
# sentence / question text.
# SECURITY: pickle.load can execute arbitrary code — only load files from a trusted source.
with open("data/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)

In [6]:
#Merge both embedding dictionaries into a single lookup table.
#d1 is applied first and d2 second, so d2's value wins for any key present in both.
dict_emb = {}
dict_emb.update(d1)
dict_emb.update(d2)

In [7]:
#Length of the final dictionary
len(dict_emb)

1953

In [8]:
del d1, d2

## Data Processing

In [10]:
train.head(3)

Unnamed: 0,date,title,category,link,abstract,paragraphs
0,21.04.2020,Barack Obama,,https://en.wikipedia.org/wiki/Barack_Obama,Barack Hussein Obama II ( (listen); born Augus...,['Barack Hussein Obama II ( (listen); born Aug...
1,21.04.2020,Joe Biden,,https://en.wikipedia.org/wiki/Joe_Biden,Joseph Robinette Biden Jr. (; born November 20...,['Joseph Robinette Biden Jr. (; born November ...
2,21.04.2020,George W. Bush,,https://en.wikipedia.org/wiki/George_W._Bush,"George Walker Bush (born July 6, 1946) is an A...","['George Walker Bush (born July 6, 1946) is an..."


In [11]:
train.shape

(1605, 6)

In [12]:
#Preprocess the training and question data: split abstracts into sentences and look up their embedding vectors
def preprocess_data(train, questions=None, embeddings=None, emb_dim=4096):
    """Attach sentence lists and embedding lookups to the training and question frames.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain an 'abstract' column. The columns 'sentences' and 'sent_emb'
        are added to this frame in place.
    questions : pandas.DataFrame, optional
        Must contain a 'question' column; a 'quest_emb' column is added in place.
        Defaults to the notebook-level ``question`` frame, which is what the
        original one-argument call relied on implicitly.
    embeddings : mapping, optional
        Lookup keyed by the raw sentence / question text. Defaults to the
        notebook-level ``dict_emb``.
    emb_dim : int, optional
        Length of the zero vector substituted when a text has no entry in
        ``embeddings``.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object that was passed in.

    Notes
    -----
    Sentence hits keep only element ``[0]`` of the stored entry, whereas question
    hits keep the stored entry unchanged; misses are a flat ``np.zeros(emb_dim)``
    in both cases. NOTE(review): hits and misses for questions may therefore
    differ in shape — confirm what the downstream distance computation expects.
    """
    # Fall back to the notebook globals so `preprocess_data(train)` keeps working.
    if questions is None:
        questions = question
    if embeddings is None:
        embeddings = dict_emb

    print("step 1")
    # A missing abstract is read by pandas as a float NaN, which TextBlob rejects;
    # treat anything that is not a string as "no sentences".
    train['sentences'] = train['abstract'].apply(
        lambda text: [sent.raw for sent in TextBlob(text).sentences]
        if isinstance(text, str) else []
    )

    print("step 2")
    train['sent_emb'] = train['sentences'].apply(
        lambda sents: [embeddings[sent][0] if sent in embeddings else np.zeros(emb_dim)
                       for sent in sents]
    )

    print("step 3")
    questions['quest_emb'] = questions['question'].apply(
        lambda text: embeddings[text] if text in embeddings else np.zeros(emb_dim)
    )

    return train

In [13]:
train = preprocess_data(train)

step 1
step 2
step 3


In [14]:
train.shape

(1605, 8)

In [15]:
train

Unnamed: 0,date,title,category,link,abstract,paragraphs,sentences,sent_emb
0,21.04.2020,Barack Obama,,https://en.wikipedia.org/wiki/Barack_Obama,Barack Hussein Obama II ( (listen); born Augus...,['Barack Hussein Obama II ( (listen); born Aug...,[Barack Hussein Obama II ( (listen); born Augu...,"[[-0.028743807, 0.049431216, 0.20910737, 0.034..."
1,21.04.2020,Joe Biden,,https://en.wikipedia.org/wiki/Joe_Biden,Joseph Robinette Biden Jr. (; born November 20...,['Joseph Robinette Biden Jr. (; born November ...,[Joseph Robinette Biden Jr. (; born November 2...,"[[0.01917268, 0.051781386, 0.21973753, 0.05867..."
2,21.04.2020,George W. Bush,,https://en.wikipedia.org/wiki/George_W._Bush,"George Walker Bush (born July 6, 1946) is an A...","['George Walker Bush (born July 6, 1946) is an...","[George Walker Bush (born July 6, 1946) is an ...","[[-0.028743807, 0.049431216, 0.16229874, 0.008..."
3,21.04.2020,Donald Trump,,https://en.wikipedia.org/wiki/Donald_Trump,"Donald John Trump (born June 14, 1946) is the ...","['Donald John Trump (born June 14, 1946) is th...","[Donald John Trump (born June 14, 1946) is the...","[[-0.01994209, 0.063180044, 0.1411632, 0.01234..."
4,21.04.2020,George Washington,,https://en.wikipedia.org/wiki/George_Washington,"George Washington (February 22, 1732 – Decembe...","['George Washington (February 22, 1732 – Decem...","[George Washington (February 22, 1732 – Decemb...","[[-0.028743807, 0.049431216, 0.1495678, 0.0356..."
...,...,...,...,...,...,...,...,...
1600,21.04.2020,Pablo Squella,,https://en.wikipedia.org/wiki/Pablo_Squella,"Pablo Squella Serrano (born August 14, 1963) i...","['Pablo Squella Serrano (born August 14, 1963)...","[Pablo Squella Serrano (born August 14, 1963) ...","[[-0.00986898, 0.16163094, 0.19587837, 0.07248..."
1601,21.04.2020,Ulises de la Cruz,,https://en.wikipedia.org/wiki/Ulises_de_la_Cruz,Ulises Hernán de la Cruz Bernardo (born 8 Febr...,"[""Ulises Hernán de la Cruz Bernardo (born 8 Fe...",[Ulises Hernán de la Cruz Bernardo (born 8 Feb...,"[[-0.010844637, 0.2535295, 0.18173045, 0.03870..."
1602,21.04.2020,Cecilia Tait,,https://en.wikipedia.org/wiki/Cecilia_Tait,"Cecilia Roxana Tait Villacorta (born May 2, 19...","['Cecilia Roxana Tait Villacorta (born May 2, ...","[Cecilia Roxana Tait Villacorta (born May 2, 1...","[[0.0023999258, 0.07554371, 0.14318627, 0.0211..."
1603,21.04.2020,Cenaida Uribe,,https://en.wikipedia.org/wiki/Cenaida_Uribe,Cenaida Cebastiana Uribe Medina (born December...,['Cenaida Cebastiana Uribe Medina (born Decemb...,[Cenaida Cebastiana Uribe Medina (born Decembe...,"[[-0.007458802, 0.11935111, 0.14588766, 0.0222..."


In [16]:
question.head()

Unnamed: 0,question,Actual Abstract,quest_emb
0,Name an American actor who was the president o...,John Gavin (born Juan Vincent Apablasa Jr.; Ap...,"[[-0.028743807, 0.054637525, 0.14788046, 0.025..."
1,"What was Louise Bours, a Member of the Europea...","Louise Bours (born 23 December 1968), also kno...","[[0.006152355, 0.101373315, 0.08296338, 0.0738..."
2,"Name a British politician, who was a Member of...",Elizabeth Lynne (born 22 January 1948) is a Br...,"[[0.016666643, 0.10489679, 0.107962236, 0.1114..."
3,Name a British character actor who appeared in...,James Robertson Justice (born James Norval Har...,"[[0.016666643, 0.09101592, 0.13560656, 0.11484..."
4,What was Muhammad Yusuf Khan professionally kn...,"Muhammad Yusuf Khan (born 11 December 1922), k...","[[-0.018900674, 0.049431216, 0.10987092, -0.01..."


## Predicted Cosine Similarity

In [17]:
#Retrieve, for every question, the abstract with the smallest cosine distance.
# NOTE(review): scipy documents `spatial.distance.cosine` for 1-D inputs, while each
# `sent_emb` entry is a list of per-sentence vectors — confirm the shapes reaching it.
predicted_abstracts = []
for quest_vec in question['quest_emb']:
    distances = np.array(
        [spatial.distance.cosine(quest_vec, sent_vecs) for sent_vecs in train['sent_emb']],
        dtype=float,
    )
    # An all-zero fallback embedding may produce NaN; map NaN to +inf so it can
    # never be chosen ahead of a real distance (min() over NaN is order-dependent).
    distances[np.isnan(distances)] = np.inf
    # argmin returns the first minimum, the same tie-break as list.index(min(...)).
    best_pos = int(np.argmin(distances))
    # `best_pos` is a position in iteration order, so index positionally (.iloc)
    # instead of by label (.loc), which is only equivalent for a default RangeIndex.
    predicted_abstracts.append(train['abstract'].iloc[best_pos])
question['Predicted Abstract'] = predicted_abstracts

In [18]:
question

Unnamed: 0,question,Actual Abstract,quest_emb,Predicted Abstract
0,Name an American actor who was the president o...,John Gavin (born Juan Vincent Apablasa Jr.; Ap...,"[[-0.028743807, 0.054637525, 0.14788046, 0.025...",John Gavin (born Juan Vincent Apablasa Jr.; Ap...
1,"What was Louise Bours, a Member of the Europea...","Louise Bours (born 23 December 1968), also kno...","[[0.006152355, 0.101373315, 0.08296338, 0.0738...","Louise Bours (born 23 December 1968), also kno..."
2,"Name a British politician, who was a Member of...",Elizabeth Lynne (born 22 January 1948) is a Br...,"[[0.016666643, 0.10489679, 0.107962236, 0.1114...",Elizabeth Lynne (born 22 January 1948) is a Br...
3,Name a British character actor who appeared in...,James Robertson Justice (born James Norval Har...,"[[0.016666643, 0.09101592, 0.13560656, 0.11484...",James Robertson Justice (born James Norval Har...
4,What was Muhammad Yusuf Khan professionally kn...,"Muhammad Yusuf Khan (born 11 December 1922), k...","[[-0.018900674, 0.049431216, 0.10987092, -0.01...","Muhammad Yusuf Khan (born 11 December 1922), k..."
...,...,...,...,...
396,"Who is Harold Peter ""Herb"" Capozzi?","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ...","[[-0.016619641, 0.049431216, 0.012119766, -0.0...","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ..."
397,"What was Charles Griffith Wynne, Liberal Tory...",Charles Griffith Wynne (14 August 1815 – 3 Mar...,"[[-0.018900674, 0.0934277, 0.13245924, 0.04328...",Charles Griffith Wynne (14 August 1815 – 3 Mar...
398,Name an English Christian worship leader and w...,Stuart Townend (born 1963) is an English Chris...,"[[0.011094983, 0.049431216, -0.016316894, 0.06...",Stuart Townend (born 1963) is an English Chris...
399,Who is Michael James Tomlinson-Mynors?,Michael James Tomlinson-Mynors (born 1 October...,"[[-0.020429114, 0.049431216, -0.06760542, 0.00...",Thos.


In [19]:
#Predict the exact answer with the rule-based extractors imported from utils
# (presumably built on spaCy Matcher / EntityRuler — their code is not visible here).
aka = ['known as', 'nicknamed', 'known mononymously as', 'known professionally as']
bday = ['born?']
death = ['die?']
name = ['Name']
who = ['Who']


def _extract_answer(query, answerline):
    """Return the answer from the first rule that matches ``query``.

    Whitespace-separated query words are checked in order against the birth,
    death, who and name triggers; if none fires, the "also known as" phrases are
    searched as substrings of the whole query. Exactly one value is returned,
    so the caller always gets one answer per question.
    """
    for querywords in query.split():
        if querywords in bday:
            return birthdate(answerline)
        if querywords in death:
            return deathday(answerline)
        if querywords in who:
            # "Who ..." questions are answered with the full abstract.
            return answerline
        if querywords in name:
            return nameofperson(answerline)
    for phrase in aka:
        if phrase in query:
            return alsoknownas(answerline)
    # No rule matched: still emit a placeholder so rows stay aligned.
    return 'Answer not found!'


answerlist=[]
for index,row in question.iterrows():
    answerline = row['Predicted Abstract']
    predicted_words = answerline.split()
    actual_words = row['Actual Abstract'].split()
    # Only answer when retrieval hit the right article, judged by the first word
    # of the predicted vs. gold abstract. NOTE(review): this relies on the gold
    # abstract, so it is only usable in an evaluation setting.
    # Empty abstracts are treated as a miss instead of raising IndexError.
    if predicted_words and actual_words and predicted_words[0] == actual_words[0]:
        answerlist.append(_extract_answer(row['question'], answerline))
    else:
        answerlist.append('Answer not found!')

In [20]:
# Column assignment from a list needs exactly one entry per row of `question`;
# any other length makes pandas raise a ValueError.
question['Answer'] = answerlist

In [21]:
question

Unnamed: 0,question,Actual Abstract,quest_emb,Predicted Abstract,Answer
0,Name an American actor who was the president o...,John Gavin (born Juan Vincent Apablasa Jr.; Ap...,"[[-0.028743807, 0.054637525, 0.14788046, 0.025...",John Gavin (born Juan Vincent Apablasa Jr.; Ap...,John Gavin
1,"What was Louise Bours, a Member of the Europea...","Louise Bours (born 23 December 1968), also kno...","[[0.006152355, 0.101373315, 0.08296338, 0.0738...","Louise Bours (born 23 December 1968), also kno...",Louise van de Bours
2,"Name a British politician, who was a Member of...",Elizabeth Lynne (born 22 January 1948) is a Br...,"[[0.016666643, 0.10489679, 0.107962236, 0.1114...",Elizabeth Lynne (born 22 January 1948) is a Br...,Elizabeth Lynne
3,Name a British character actor who appeared in...,James Robertson Justice (born James Norval Har...,"[[0.016666643, 0.09101592, 0.13560656, 0.11484...",James Robertson Justice (born James Norval Har...,James Robertson Justice
4,What was Muhammad Yusuf Khan professionally kn...,"Muhammad Yusuf Khan (born 11 December 1922), k...","[[-0.018900674, 0.049431216, 0.10987092, -0.01...","Muhammad Yusuf Khan (born 11 December 1922), k...",Dilip Kumar
...,...,...,...,...,...
396,"Who is Harold Peter ""Herb"" Capozzi?","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ...","[[-0.016619641, 0.049431216, 0.012119766, -0.0...","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ...","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ..."
397,"What was Charles Griffith Wynne, Liberal Tory...",Charles Griffith Wynne (14 August 1815 – 3 Mar...,"[[-0.018900674, 0.0934277, 0.13245924, 0.04328...",Charles Griffith Wynne (14 August 1815 – 3 Mar...,Charles WynneFinch
398,Name an English Christian worship leader and w...,Stuart Townend (born 1963) is an English Chris...,"[[0.011094983, 0.049431216, -0.016316894, 0.06...",Stuart Townend (born 1963) is an English Chris...,Stuart Townend
399,Who is Michael James Tomlinson-Mynors?,Michael James Tomlinson-Mynors (born 1 October...,"[[-0.020429114, 0.049431216, -0.06760542, 0.00...",Thos.,Answer not found!


In [22]:
question

Unnamed: 0,question,Actual Abstract,quest_emb,Predicted Abstract,Answer
0,Name an American actor who was the president o...,John Gavin (born Juan Vincent Apablasa Jr.; Ap...,"[[-0.028743807, 0.054637525, 0.14788046, 0.025...",John Gavin (born Juan Vincent Apablasa Jr.; Ap...,John Gavin
1,"What was Louise Bours, a Member of the Europea...","Louise Bours (born 23 December 1968), also kno...","[[0.006152355, 0.101373315, 0.08296338, 0.0738...","Louise Bours (born 23 December 1968), also kno...",Louise van de Bours
2,"Name a British politician, who was a Member of...",Elizabeth Lynne (born 22 January 1948) is a Br...,"[[0.016666643, 0.10489679, 0.107962236, 0.1114...",Elizabeth Lynne (born 22 January 1948) is a Br...,Elizabeth Lynne
3,Name a British character actor who appeared in...,James Robertson Justice (born James Norval Har...,"[[0.016666643, 0.09101592, 0.13560656, 0.11484...",James Robertson Justice (born James Norval Har...,James Robertson Justice
4,What was Muhammad Yusuf Khan professionally kn...,"Muhammad Yusuf Khan (born 11 December 1922), k...","[[-0.018900674, 0.049431216, 0.10987092, -0.01...","Muhammad Yusuf Khan (born 11 December 1922), k...",Dilip Kumar
...,...,...,...,...,...
396,"Who is Harold Peter ""Herb"" Capozzi?","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ...","[[-0.016619641, 0.049431216, 0.012119766, -0.0...","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ...","Harold Peter ""Herb"" Capozzi (April 24, 1925 – ..."
397,"What was Charles Griffith Wynne, Liberal Tory...",Charles Griffith Wynne (14 August 1815 – 3 Mar...,"[[-0.018900674, 0.0934277, 0.13245924, 0.04328...",Charles Griffith Wynne (14 August 1815 – 3 Mar...,Charles WynneFinch
398,Name an English Christian worship leader and w...,Stuart Townend (born 1963) is an English Chris...,"[[0.011094983, 0.049431216, -0.016316894, 0.06...",Stuart Townend (born 1963) is an English Chris...,Stuart Townend
399,Who is Michael James Tomlinson-Mynors?,Michael James Tomlinson-Mynors (born 1 October...,"[[-0.020429114, 0.049431216, -0.06760542, 0.00...",Thos.,Answer not found!


In [23]:
# Persist questions together with predictions and answers. With to_csv defaults the
# DataFrame index is written as an unnamed first column, and array-valued cells
# such as `quest_emb` are stored as text.
question.to_csv('qa_unsupervised.csv')

In [24]:
#Calculate the token-overlap cosine similarity between predicted and actual abstracts
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Loop-invariant: build the stop-word lookup once, as a set for O(1) membership tests.
sw = set(stopwords.words('english'))

cos_sim=[]
for index,row in question.iterrows():
    # Unique non-stop-word tokens of each text (binary bag-of-words).
    X_set = {w for w in word_tokenize(row['Predicted Abstract']) if w not in sw}
    Y_set = {w for w in word_tokenize(row['Actual Abstract']) if w not in sw}

    # For 0/1 vectors over the union vocabulary the dot product is |X ∩ Y| and each
    # squared norm is the set size, so cosine = |X ∩ Y| / sqrt(|X| * |Y|).
    norm = float((len(X_set)*len(Y_set))**0.5)
    # If either text has no tokens left the norm is 0; score it 0.0 rather than
    # raising ZeroDivisionError.
    cosine = len(X_set & Y_set) / norm if norm else 0.0
    cos_sim.append(cosine)

In [25]:
#Mean token-overlap cosine similarity between predicted and actual abstracts,
#averaged over all questions (a similarity score, not a classification accuracy)
print(sum(cos_sim)/len(cos_sim))

0.7730022152340004


In [27]:
#Show one worked example: the question, the extracted answer and the retrieved abstract
sample_row = question.iloc[12]
for label, column in (('query', 'question'),
                      ('answer', 'Answer'),
                      ('paragraph', 'Predicted Abstract')):
    print(label + ': \n' + sample_row[column] + '\n')

query: 
When did Frederick Alers Hankey die?

answer: 
15 February 1892

paragraph: 
Frederick Alers Hankey (29 March 1833 – 15 February 1892) was an English banker and Conservative politician who sat in the House of Commons from 1885 to 1892.

