In [114]:
import pandas as pd
import numpy as np
from spacy.en import English
import editdistance

# Notebook display limits: show up to 500 rows and up to 200 characters per cell.
# The fully qualified option names are used on purpose: pandas resolves
# abbreviated names by pattern matching, and a bare 'max_rows' becomes ambiguous
# (OptionError) on pandas versions that also define 'styler.render.max_rows'.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)

In [2]:
# Location of the pickled question/answer table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it. Unpickling can execute arbitrary code,
# so only load files from a trusted source.
DATA_PATH = '/home/rnam/documents/squad/data/data_all.pkl'
df = pd.read_pickle(DATA_PATH)

In [3]:
# List the columns available in the loaded table.
df.columns

Index(['article_id', 'article_title', 'context_id', 'context', 'question_id',
       'question', 'answer_start', 'answer_text'],
      dtype='object')

In [6]:
# Leading whitespace-delimited token of every question (e.g. "What", "How").
# The vectorised .str accessor yields NaN for an empty, whitespace-only or
# missing question instead of raising, and value_counts() ignores NaN by default.
first_word = df.question.str.split().str[0]

In [13]:
# Five most frequent leading question words.
first_word.value_counts().head()

What    37321
How      8084
Who      8080
When     5420
In       4308
Name: question, dtype: int64

In [14]:
# Preview the first rows of the question/answer table.
df.head()

Unnamed: 0,article_id,article_title,context_id,context,question_id,question,answer_start,answer_text
0,0,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous
1,0,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,188,a copper statue of Christ
2,0,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,279,the Main Building
3,0,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",5733be284776f41900661181,What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection
4,0,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",5733be284776f4190066117e,What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary


### Context Table

In [56]:
# Collapse the per-question table to one row per distinct
# (article_id, context_id, context) combination. drop_duplicates keeps the
# first occurrence, so the surviving rows retain their original index labels.
context_columns = ['article_id', 'context_id', 'context']
context = df.loc[:, context_columns].drop_duplicates()

In [57]:
# Preview the de-duplicated context table.
context.head()

Unnamed: 0,article_id,context_id,context
0,0,0,"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper..."
5,0,1,"As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several mag..."
10,0,2,"The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St..."
15,0,3,"The College of Engineering was established in 1920, however, early courses in civil and mechanical engineering were a part of the College of Science since the 1870s. Today the college, housed in t..."
20,0,4,All of Notre Dame's undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies program was est...


### Break the context into sentences

In [31]:
# Build the spaCy English pipeline; the cells below use it for sentence
# boundaries (`.sents`) and token lemmas (`.lemma_`).
# NOTE(review): `spacy.en` looks like the spaCy 1.x import path -- confirm the
# installed version before re-running.
nlp = English()

In [58]:
# Parse every context paragraph once and keep its sentences as a list,
# one list per row of `context`, in row order.
total = [list(nlp(paragraph).sents) for paragraph in context.context.values]

context['context_sentence'] = total

In [60]:
# Spot check: sentence list produced for the last context row.
context.context_sentence.values[-1]

[Kathmandu Metropolitan City (KMC), in order to promote international relations has established an International Relations Secretariat (IRC).,
 KMC's first international relationship was established in 1975 with the city of Eugene, Oregon, United States.,
 This activity has been further enhanced by establishing formal relationships with 8 other cities: Motsumoto City of Japan, Rochester of the USA, Yangon (formerly Rangoon) of Myanmar, Xi'an of the People's Republic of China, Minsk of Belarus, and Pyongyang of the Democratic Republic of Korea.,
 KMC's constant endeavor is to enhance its interaction with SAARC countries, other International agencies and many other major cities of the world to achieve better urban management and developmental programs for Kathmandu.]

### Tokenize and lemmatize the context sentences and questions, then score them using edit distance

In [110]:
# def vocabbuilder(list0, list1):
#     vocab = {}
#     for idx, word in enumerate(list0 + list1):
#         if word in vocab:
#             pass
#         else:
#             vocab[word] = idx
#     return vocab

In [43]:
# Sentence list of the first context row, kept for ad-hoc inspection.
# `test` is not referenced by any other cell shown in this notebook.
test = context.context_sentence.values[0]

In [None]:
# Score every question against each sentence of its own context paragraph and
# remember the closest sentence.
#
# Inputs (defined in earlier cells): `df` (one row per question), `context`
# (one row per paragraph, carrying a `context_sentence` list) and `nlp`.
# Outputs: two new `df` columns --
#   question_context_edist: {sentence index within the context: edit distance}
#   context_min_edist:      the sentence with the smallest distance, or None
#                           when the context produced no sentences
# The distance is computed by editdistance.eval over the two lemma lists.
questions = df.question.values
context_id = df.context_id.values
article_id = df.article_id.values

# Index the sentence lists by (article_id, context_id) once, instead of
# filtering `context` with a boolean mask for every question. setdefault keeps
# the first row per key, which is what the previous `.values[0]` lookup returned.
sentences_by_key = {}
for a_id, c_id, sents in zip(context.article_id.values,
                             context.context_id.values,
                             context.context_sentence.values):
    sentences_by_key.setdefault((a_id, c_id), sents)

# Sentence lemmas depend only on the context, so they are computed once per
# context and reused by every question that shares it.
sentence_lemmas_by_key = {}

total = []
sentence_min_edist = []
for i, question in enumerate(questions):
    key = (article_id[i], context_id[i])
    sentences = sentences_by_key[key]
    if key not in sentence_lemmas_by_key:
        sentence_lemmas_by_key[key] = [
            [j.lemma_ for j in sentence] for sentence in sentences
        ]

    # -- lemmatize the question once; it does not change from sentence to sentence
    question_nlp = nlp(question)
    question_lemma = [k.lemma_ for k in question_nlp]

    # produce edit distance scores; the key is the sentence index within the
    # context (one dictionary per question)
    scores = {}
    for sidx, sentence_lemma in enumerate(sentence_lemmas_by_key[key]):
        scores[sidx] = editdistance.eval(question_lemma, sentence_lemma)
    total.append(scores)

    if scores:
        # On ties min() returns the earliest sentence index.
        min_score_key = min(scores, key=scores.get)
        sentence_min_edist.append(sentences[min_score_key])
    else:
        # A context without sentences has no best match; keep the row aligned.
        sentence_min_edist.append(None)

df['question_context_edist'] = total
df['context_min_edist'] = sentence_min_edist

In [117]:
# Show the per-question score dictionaries built by the scoring loop.
# NOTE(review): the saved output holds a single dictionary, so it presumably
# came from a run over one question only -- confirm before relying on it.
total

[{0: 13, 1: 16, 2: 29, 3: 13, 4: 15, 5: 20, 6: 32}]

In [121]:
# First ten questions alongside their per-sentence scores and best-matching sentence.
preview_columns = ['question_context_edist', 'question', 'context_min_edist']
df[preview_columns].head(10)

Unnamed: 0,question_context_edist,question
0,"{0: 13, 1: 16, 2: 29, 3: 13, 4: 15, 5: 20, 6: 32}",To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
1,"{0: 11, 1: 15, 2: 27, 3: 12, 4: 15, 5: 21, 6: 32}",What is in front of the Notre Dame Main Building?
2,"{0: 14, 1: 15, 2: 27, 3: 14, 4: 14, 5: 21, 6: 30}",The Basilica of the Sacred heart at Notre Dame is beside to which structure?
3,"{0: 8, 1: 15, 2: 28, 3: 11, 4: 13, 5: 20, 6: 33}",What is the Grotto at Notre Dame?
4,"{0: 11, 1: 16, 2: 27, 3: 12, 4: 15, 5: 22, 6: 32}",What sits on top of the Main Building at Notre Dame?
5,"{0: 16, 1: 23, 2: 29, 3: 19, 4: 10, 5: 31, 6: 26, 7: 23, 8: 24, 9: 19, 10: 20}",When did the Scholastic Magazine of Notre dame begin publishing?
6,"{0: 16, 1: 23, 2: 30, 3: 19, 4: 10, 5: 32, 6: 26, 7: 24, 8: 25, 9: 18, 10: 20}",How often is Notre Dame's the Juggler published?
7,"{0: 17, 1: 23, 2: 30, 3: 19, 4: 10, 5: 29, 6: 26, 7: 25, 8: 24, 9: 18, 10: 20}",What is the daily student paper at Notre Dame called?
8,"{0: 17, 1: 22, 2: 31, 3: 19, 4: 10, 5: 31, 6: 26, 7: 25, 8: 24, 9: 19, 10: 20}",How many student news papers are found at Notre Dame?
9,"{0: 18, 1: 23, 2: 29, 3: 19, 4: 14, 5: 31, 6: 26, 7: 23, 8: 23, 9: 19, 10: 19}",In what year did the student paper Common Sense begin publication at Notre Dame?


In [102]:
# Print "lemma surface-form" pairs for one parsed question.
# In the cells shown here `question_nlp` is only assigned inside the scoring
# loop, so this cell depends on that loop having run and inspects whichever
# question it parsed last.
for k in question_nlp:
    print(k.lemma_, k)

to To
whom whom
do did
the the
virgin Virgin
mary Mary
allegedly allegedly
appear appear
in in
1858 1858
in in
lourdes Lourdes
france France
? ?


In [115]:
# Ad-hoc check of the edit distance between two lemma lists.
# In the cells shown here `sentence_lemma` and `question_lemma` are only
# assigned inside the scoring loop, so this uses whatever values that loop
# left bound on its last iteration.
editdistance.eval(sentence_lemma, question_lemma)

32