In [1]:
import pandas as pd
from tqdm import tqdm
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

In [3]:
train = pd.read_json('./data/squad-v1/train-v1.1.json', orient='column')
dev = pd.read_json('./data/squad-v1/dev-v1.1.json', orient='column')

df = pd.concat([train, dev], ignore_index=True)
df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [4]:
currText = df['data'][0]['paragraphs'][0]['context']
currQas = df['data'][0]['paragraphs'][0]['qas']

In [5]:
currQas

[{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}],
  'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
  'id': '5733be284776f41900661182'},
 {'answers': [{'answer_start': 188, 'text': 'a copper statue of Christ'}],
  'question': 'What is in front of the Notre Dame Main Building?',
  'id': '5733be284776f4190066117f'},
 {'answers': [{'answer_start': 279, 'text': 'the Main Building'}],
  'question': 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
  'id': '5733be284776f41900661180'},
 {'answers': [{'answer_start': 381,
    'text': 'a Marian place of prayer and reflection'}],
  'question': 'What is the Grotto at Notre Dame?',
  'id': '5733be284776f41900661181'},
 {'answers': [{'answer_start': 92,
    'text': 'a golden statue of the Virgin Mary'}],
  'question': 'What sits on top of the Main Building at Notre Dame?',
  'id': '5733be284776f4190066117e'}]

In [6]:
currDoc = nlp(currText)
currDoc

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

In [7]:
def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers

In [8]:
currAnswers = extractAnswers(currQas, currDoc)
currAnswers

[{'sentenceId': 1, 'text': 'a golden statue of the Virgin Mary'},
 {'sentenceId': 2, 'text': 'a copper statue of Christ'},
 {'sentenceId': 3, 'text': 'the Main Building'},
 {'sentenceId': 4, 'text': 'a Marian place of prayer and reflection'},
 {'sentenceId': 5, 'text': 'Saint Bernadette Soubirous'}]

In [9]:
def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False

In [10]:
def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

In [11]:
def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1

In [12]:
#Creating the dataframe
wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
wordDf = pd.DataFrame(columns=wordColums)

#Save to pickle

#load df

#Add new words to array
newWord = ['koala', True, 0, 0, 4, 1, None, None, None, None, 'xxxxx']
newWords = []
#newWords.append(newWord)

#Make array to dataframe
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf

#Merge dataframes

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape


In [13]:
def addWordsForParagrapgh(newWords, titleId, paragraphId):
    text = df['data'][titleId]['paragraphs'][paragraphId]['context']
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

    doc = nlp(text)

    answers = extractAnswers(qas, doc)
    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    #index of word in spacy doc text
    i = 0
    
    while (i < len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            #add word
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            tokenIsAnswer(word.text, currentSentence, answers),
                            titleId,
                            paragraphId,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape])
            i = neStarts[i].end - 1
        #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                tokenIsAnswer(word.text, currentSentence, answers),
                                titleId,
                                paragraphId,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_])
        i += 1


In [14]:
newWords

[]

In [15]:
addWordsForParagrapgh(newWords, 0, 0)

In [16]:
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx


In [17]:
newWordsDf[newWordsDf['isAnswer'] == True].head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
21,the Main Building,True,0,0,3.0,3,ORG,,,,xxx Xxxx Xxxxx
39,Saint Bernadette Soubirous,True,0,0,5.0,3,GPE,,,,Xxxxx Xxxxx Xxxxx


In [18]:
words = []

#titlesCount = len(df['data'])
titlesCount = 2

for titleId in tqdm(range(titlesCount)):
    paragraphsCount = len(df['data'][titleId]['paragraphs'])
        
    for paragraphId in range(paragraphsCount):
        addWordsForParagrapgh(words, titleId, paragraphId)
        

100%|██████████| 2/2 [00:03<00:00,  1.54s/it]


In [19]:
wordsDf = pd.DataFrame(words, columns=wordColums)
wordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx


In [20]:
print("Total words for 2 articles:", len(wordsDf))

Total words for 2 articles: 8694


### Generating entire word dataset

In [21]:
wordPickleName = './data/pickles/wordsDf.pkl'

#If the dataframe is already generated, load it.
if (pickleExists(wordPickleName)):
    print("Pickle found. Saved some time.")
    wordsDf = loadPickle(wordPickleName)
else:
    #Extracting words
    words = []

    #titlesCount = len(df['data'])   
    titlesCount = 10   

    for titleId in tqdm(range(titlesCount)):
        paragraphsCount = len(df['data'][titleId]['paragraphs'])

#         printProgress(titleId, titlesCount - 1)

        for paragraphId in range(paragraphsCount):
            addWordsForParagrapgh(words, titleId, paragraphId)
    
    #Create the dataframe
    wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
    wordsDf = pd.DataFrame(words, columns=wordColums)
    
    #Pickle the result
    dumpPickle(wordPickleName, wordsDf)
    print("Result was not pickled. You had to wait.")

100%|██████████| 10/10 [00:10<00:00,  1.01s/it]

Result was not pickled. You had to wait.





In [22]:
print("Total words for all articles:", len(wordsDf))

Total words for all articles: 32191


In [23]:
wordsDf.head(20)

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx
5,the Main Building's,False,0,0,1.0,4,ORG,,,,xxx Xxxx Xxxxx 'x
6,gold,False,0,0,1.0,1,,NOUN,NN,compound,xxxx
7,dome,False,0,0,1.0,1,,NOUN,NN,nsubj,xxxx
8,golden,False,0,0,1.0,1,,ADJ,JJ,amod,xxxx
9,statue,False,0,0,1.0,1,,NOUN,NN,attr,xxxx


In [24]:
totalAnswers = len(wordsDf[wordsDf['isAnswer'] == True])
print(totalAnswers, 'total answers', '{:.2f}%'.format((totalAnswers / len(wordsDf)) * 100), 'of all words are answers.')

1161 total answers 3.61% of all words are answers.
