In [1]:
import pandas as pd
from IPython.display import Markdown, display, clear_output
from nltk import tokenize
from scipy import stats
from IPython.core.debugger import set_trace
from pathlib import Path
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pratik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def printBold(string):
    """Render `string` as bold Markdown in the notebook output."""
    boldMarkup = '**' + string + '**'
    display(Markdown(boldMarkup))

### Pickling

In [3]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    """Pickle `content` into the file at `fileName`.

    The file is opened in binary write mode and is closed even if pickling
    raises (for example on an unpicklable object).
    """
    with open(fileName, 'wb') as pickleFile:
        # Protocol -1 selects the highest pickle protocol available.
        cPickle.dump(content, pickleFile, -1)

def loadPickle(fileName):
    """Return the object stored in the pickle file at `fileName`.

    The file is opened in binary read mode and is closed even if
    unpickling raises.
    """
    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook (or another trusted source) produced.
    with open(fileName, 'rb') as pickleFile:
        return cPickle.load(pickleFile)
    
def pickleExists(fileName):
    """Return True when `fileName` points to an existing regular file, else False."""
    return Path(fileName).is_file()

### Reading datasets
Here we combine the train and dev datasets to figure out how questions and answers are formed.

In [4]:
# Load the SQuAD v1.1 train and dev files. Each row's 'data' cell holds one
# article dict (title + paragraphs).
# 'columns' is the documented spelling of this orient value; the previous
# 'column' is not a documented option.
train = pd.read_json('./data/squad-v1/train-v1.1.json', orient='columns')
dev = pd.read_json('./data/squad-v1/dev-v1.1.json', orient='columns')

In [5]:
df = pd.concat([train, dev], ignore_index=True)

In [6]:
df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [7]:
def showQuestion(titleId, paragraphId, questionId):
    """Display one question together with its title, paragraph and first answer.

    Prints, each under a bold heading: the article title, the paragraph
    text, the question, and the first listed answer (its start offset in
    the paragraph followed by its text).
    """
    article = df['data'][titleId]
    paragraphEntry = article['paragraphs'][paragraphId]
    qa = paragraphEntry['qas'][questionId]
    firstAnswer = qa['answers'][0]

    # (heading, values printed under that heading), in display order.
    sections = (
        ('Title', (article['title'],)),
        ('Paragraph', (paragraphEntry['context'],)),
        ('Question', (qa['question'],)),
        ('Answer', (firstAnswer['answer_start'], firstAnswer['text'])),
    )

    for heading, values in sections:
        printBold(heading)
        for value in values:
            print(value)

In [8]:
titleId = 0
paragraphId = 0 
questionId = 0

showQuestion(titleId, paragraphId, questionId)

**Title**

University_of_Notre_Dame


**Paragraph**

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


**Question**

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


**Answer**

515
Saint Bernadette Soubirous


### Dataset Size

In [14]:
# Count articles, then paragraphs per article, then questions per paragraph.
titlesCount = len(df['data'])

totalParagraphsCount = sum(
    len(df['data'][titleId]['paragraphs'])
    for titleId in range(titlesCount)
)

totalQuestionsCount = sum(
    len(paragraphEntry['qas'])
    for titleId in range(titlesCount)
    for paragraphEntry in df['data'][titleId]['paragraphs']
)

for label, count in (('Titles', titlesCount),
                     ('Paragraphs', totalParagraphsCount),
                     ('Questions', totalQuestionsCount)):
    print(label, count)

Titles 490
Paragraphs 20963
Questions 98169


### Titles

In [15]:
# Collect every article title, in dataset order.
titles = [df['data'][titleId]['title'] for titleId in range(len(df['data']))]

# Preview the first 20 titles.
i = 0
while i < 20:
    print(titles[i])
    i += 1

University_of_Notre_Dame
Beyoncé
Montana
Genocide
Antibiotics
Frédéric_Chopin
Sino-Tibetan_relations_during_the_Ming_dynasty
IPod
The_Legend_of_Zelda:_Twilight_Princess
Spectre_(2015_film)
2008_Sichuan_earthquake
New_York_City
To_Kill_a_Mockingbird
Solar_energy
Tajikistan
Anthropology
Portugal
Kanye_West
Buddhism
American_Idol


### Questions
We thought of removing the answer from a sentence and converting that sentence into a question.

In [16]:
titleId = 0
paragraphId = 0 
questionId = 0

showQuestion(titleId, paragraphId, questionId)

**Title**

University_of_Notre_Dame


**Paragraph**

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


**Question**

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


**Answer**

515
Saint Bernadette Soubirous


In [18]:
def extractSentence(paragraph, answerStart):
    """Return the sentence of `paragraph` that contains character offset `answerStart`.

    The paragraph is split with NLTK's sentence tokenizer. Each sentence is
    then located in the original paragraph text, so the offsets stay correct
    when sentences are separated by more (or fewer) than one character.

    Returns the first sentence whose end offset lies beyond `answerStart`.
    If `answerStart` is past the end of every sentence, the last sentence is
    returned; if the paragraph yields no sentences, '' is returned.
    """
    sentences = tokenize.sent_tokenize(paragraph)
    searchFrom = 0

    for sentence in sentences:
        # Find where this sentence really starts instead of assuming a
        # single separator character after the previous one.
        sentenceStart = paragraph.find(sentence, searchFrom)
        if sentenceStart < 0:
            # Not found verbatim: fall back to the running offset.
            sentenceStart = searchFrom
        sentenceEnd = sentenceStart + len(sentence)

        if answerStart < sentenceEnd:
            return sentence

        searchFrom = sentenceEnd

    # No sentence covers the offset: return a string rather than None so
    # callers that tokenize the result keep working.
    return sentences[-1] if sentences else ''

In [19]:
paragraph = df['data'][0]['paragraphs'][0]['context']
answerStart = df['data'][0]['paragraphs'][0]['qas'][0]['answers'][0]['answer_start']

sentence = extractSentence(paragraph, answerStart)
print(sentence)

It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.


In [20]:
def containedInText(text, question):
    """Return the fraction of `question` tokens that also occur in `text`.

    Both strings are lower-cased and word-tokenized with NLTK. Every token
    of the question (duplicates and punctuation included) counts once if
    the same token appears anywhere in `text`.

    Returns a float in [0, 1]; 0.0 when the question has no tokens.
    """
    questionWords = tokenize.word_tokenize(question.lower())

    # Avoid dividing by zero for an empty question.
    if not questionWords:
        return 0.0

    # A set gives constant-time membership checks instead of rescanning the
    # whole token list for every question word.
    textWords = set(tokenize.word_tokenize(text.lower()))
    wordsContained = sum(1 for questionWord in questionWords if questionWord in textWords)

    return wordsContained / len(questionWords)

In [21]:
question =  df['data'][0]['paragraphs'][0]['qas'][0]['question']

contained = containedInText(sentence, question)

In [22]:
printBold('Question')
print(question)
printBold('Sentence')
print(sentence)
printBold("Contained")
print(contained)

**Question**

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


**Sentence**

It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.


**Contained**

0.6428571428571429


In [23]:
def printPercentage(currentStep, maxStep):
    """Print the percentage of work done, replacing the previous cell output.

    One percent corresponds to `maxStep / 100` steps. Output is produced
    only when the truncated percentage for `currentStep` is greater than
    the (untruncated) percentage for the previous step.
    """
    stepSize = maxStep / 100
    percentNow = int(currentStep / stepSize)
    previousProgress = (currentStep - 1) / stepSize

    # Nothing new to show for this step.
    if percentNow <= previousProgress:
        return

    clear_output()
    print('{}%'.format(percentNow))

In [25]:
questionContainmentDfPickleName = 'data/pickles/questionContainmentDf.pkl'

#If the dataframe is already generated, load it.
if (pickleExists(questionContainmentDfPickleName)):
    print("Pickle found. Saved some time.")
    questionContainmentDf = loadPickle(questionContainmentDfPickleName)
else:
    sentenceScore = []
    paragraphScore = []

    #For each title
    titlesCount = len(df['data'])
    for titleId in range(titlesCount):
        printPercentage(titleId, titlesCount)

        #For each paragraph
        for paragraphEntry in df['data'][titleId]['paragraphs']:
            paragraph = paragraphEntry['context']

            #For each question (only its first listed answer is used)
            for qa in paragraphEntry['qas']:
                question = qa['question']
                answerStart = qa['answers'][0]['answer_start']
                sentence = extractSentence(paragraph, answerStart)

                sentenceScore.append(containedInText(sentence, question))
                paragraphScore.append(containedInText(paragraph, question))

    #One row per question, in dataset order: containment in the answer's
    #sentence and in the whole paragraph
    questionContainmentDf = pd.DataFrame({'sentence': sentenceScore, 'paragraph': paragraphScore})

    #Create the target directory first so the long computation above is not
    #followed by a failed dump when the directory is missing
    Path(questionContainmentDfPickleName).parent.mkdir(parents=True, exist_ok=True)

    #Pickle the result
    dumpPickle(questionContainmentDfPickleName, questionContainmentDf)

    print("Result not pickled. Generating...")


99%
Result not pickled. Generating...


In [26]:
questionContainmentDf.head(10)

Unnamed: 0,sentence,paragraph
0,0.642857,0.571429
1,0.636364,0.636364
2,0.533333,0.6
3,0.375,0.5
4,0.333333,0.416667
5,0.272727,0.636364
6,0.3,0.8
7,0.363636,0.727273
8,0.0,0.545455
9,0.266667,0.733333


In [27]:
questionContainmentDf.describe()

Unnamed: 0,sentence,paragraph
count,98169.0,98169.0
mean,0.463967,0.582192
std,0.190377,0.159048
min,0.0,0.0
25%,0.333333,0.5
50%,0.461538,0.6
75%,0.6,0.7
max,1.0,1.0


In [28]:
def getQuestionAt(index):
    """Map a flat question index to its (titleId, paragraphId, questionId).

    Questions are counted in dataset order: title by title, paragraph by
    paragraph, question by question, starting at 0. Returns None when no
    question has the requested index.
    """
    articles = df['data']
    position = 0

    for titleId in range(len(articles)):
        for paragraphId, paragraphEntry in enumerate(articles[titleId]['paragraphs']):
            for questionId in range(len(paragraphEntry['qas'])):
                if position == index:
                    return titleId, paragraphId, questionId
                position += 1

    return None

In [29]:
getQuestionAt(8)

(0, 1, 3)

### 0% in sentence/paragraph

In [30]:
# 0%
titleId = 0
paragraphId = 1 
questionId = 3

showQuestion(titleId, paragraphId, questionId)

**Title**

University_of_Notre_Dame


**Paragraph**

As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when some students believed that The Observer began to show a conservative bias, a lib

**Question**

How many student news papers are found at Notre Dame?


**Answer**

126
three


In [31]:
questionContainmentDf[questionContainmentDf['paragraph'] == 0].head()

Unnamed: 0,sentence,paragraph
269,0.0,0.0
363,0.0,0.0
505,0.0,0.0
2781,0.0,0.0
3678,0.0,0.0


In [33]:
getQuestionAt(269)

(1, 0, 0)

In [34]:
titleId = 1
paragraphId = 0 
questionId = 0

showQuestion(titleId, paragraphId, questionId)

**Title**

Beyoncé


**Paragraph**

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


**Question**

When did Beyonce start becoming popular?


**Answer**

269
in the late 1990s


Synonyms are used in this question.

### 100% in sentence/paragraph

In [35]:
questionContainmentDf[questionContainmentDf['sentence'] == 1]

Unnamed: 0,sentence,paragraph
21911,1.0,1.0
39394,1.0,1.0
45064,1.0,1.0
48874,1.0,1.0
53226,1.0,1.0
67425,1.0,1.0


In [36]:
getQuestionAt(53226)

(258, 23, 0)

In [37]:
titleId = 258
paragraphId = 23 
questionId = 0

showQuestion(titleId, paragraphId, questionId)

**Title**

Utrecht


**Paragraph**

Utrecht city has an active cultural life, and in the Netherlands is second only to Amsterdam. There are several theatres and theatre companies. The 1941 main city theatre was built by Dudok. Besides theatres there is a large number of cinemas including three arthouse cinemas. Utrecht is host to the international Early Music Festival (Festival Oude Muziek, for music before 1800) and the Netherlands Film Festival. The city has an important classical music hall Vredenburg (1979 by Herman Hertzberger). Its acoustics are considered among the best of the 20th-century original music halls.[citation needed] The original Vredenburg music hall has been redeveloped as part of the larger station area redevelopment plan and in 2014 has gained additional halls that allowed its merger with the rock club Tivoli and the SJU jazzpodium. There are several other venues for music throughout the city. Young musicians are educated in the conservatory, a department of the Utrecht School of the Arts. There is 

**Question**

Cultural life in Utrecht is second to 


**Answer**

0
Utrecht city has an active cultural life, and in the Netherlands is second only to Amsterdam


### Summary

**A question mostly consists of words from the sentence in which the answer is present**

There are some obvious differences like:
- Question-like words are added - who, why...
- Synonyms are used.
- The tense of a word is changed.

### Answers in text

In [38]:
# Check that the first answer of every question appears verbatim in its paragraph.
answersInText = 0
answersNotInText = 0

for titleId in range(len(df['data'])):
    for paragraphEntry in df['data'][titleId]['paragraphs']:
        paragraph = paragraphEntry['context']

        for qa in paragraphEntry['qas']:
            answer = qa['answers'][0]['text']

            if answer not in paragraph:
                answersNotInText += 1
            else:
                answersInText += 1

for heading, count in (('Answers in text', answersInText),
                       ('Answers not in text', answersNotInText)):
    printBold(heading)
    print(count)

**Answers in text**

98169


**Answers not in text**

0


In [59]:
# For every question, collect its first answer and the sentence that holds it.
answers = []
sentences = []

for titleId in range(len(df['data'])):
    for paragraphEntry in df['data'][titleId]['paragraphs']:
        paragraph = paragraphEntry['context']

        for qa in paragraphEntry['qas']:
            firstAnswer = qa['answers'][0]
            answer = firstAnswer['text']
            answerStart = firstAnswer['answer_start']

            sentence = extractSentence(paragraph, answerStart)

            answers.append(answer)
            sentences.append(sentence)

In [60]:
# Build one single-column frame per collected list, then join them side by
# side so each row pairs an answer with the sentence extracted for it.
answerTextsDf = pd.DataFrame(answers, columns=['answer'])
sentenceDf = pd.DataFrame(sentences, columns=['sentence'])

# axis=1 concatenates column-wise, aligning rows on the index.
answersDf = pd.concat([answerTextsDf, sentenceDf], axis=1)
answersDf.head()

Unnamed: 0,answer,sentence
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran..."
1,a copper statue of Christ,Immediately in front of the Main Building and ...
2,the Main Building,Next to the Main Building is the Basilica of t...
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,..."
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...


### Answer word length

In [61]:
# Number of NLTK word tokens in every answer, in row order.
# Iterating the column directly avoids building a full row Series for each
# answer, which the previous answersDf.iloc[i]['answer'] lookup did.
wordCount = [len(tokenize.word_tokenize(answer)) for answer in answersDf['answer']]

In [62]:
# assign() replaces an existing 'wordCount' column instead of appending a
# second one, so re-running this cell no longer duplicates the column
# (a duplicate made answersDf['wordCount'] return a DataFrame).
answersDf = answersDf.assign(wordCount=wordCount)

In [63]:
answersDf['wordCount'].describe()

count    98169.000000
mean         3.355031
std          3.731700
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         46.000000
Name: wordCount, dtype: float64

In [64]:
answersDf['wordCount'].value_counts()

1     32156
2     25228
3     14348
4      7562
5      4659
6      3051
7      2222
8      1676
9      1206
10      975
11      755
12      652
13      566
14      461
15      407
16      313
18      274
17      269
19      244
20      191
21      182
23      138
22      131
25      120
24      101
26       77
28       59
27       58
29       28
30       19
31       12
32       11
33        6
38        2
34        2
35        2
36        2
37        2
42        1
46        1
Name: wordCount, dtype: int64

We can see that about 1/3 of the answers are a single word, and two-word answers could be names of places or people.

In [65]:
answersDf[answersDf['wordCount'] == 2].sample(n=20, random_state=5)

Unnamed: 0,answer,sentence,wordCount
62843,25 genes,"By the end of 2005, 25 genes had been associat...",2
4799,Notre Dame,"In 2006, Lee was awarded an honorary doctorate...",2
44145,Charles Pillsbury,There he met fellow student and later Green Pa...,2
68124,Lionel Robbins,"With the help of Mises, in the late 1920s Haye...",2
7152,The Beatles,"The single, ""A Moment Like This"", went on to b...",2
37185,Prince Albert,"Victoria married her first cousin, Prince Albe...",2
72091,Western Railroad,It was formerly used by the Milwaukee Road fro...,2
4851,Mockingbird groupies,"Local residents call them ""Mockingbird groupie...",2
89845,two points,"Luther's rediscovery of ""Christ and His salvat...",2
17988,Copeland Award,Kansas also won the 1981–82 Copeland Award.,2


### Word types

**Spacy** turned out to be a pretty great tool which could provide me with *NER (Named entity recognition), part of speech detection, word embeddings similarity* and some more functions which may or may not be useful in my case.

In [66]:
import spacy
from spacy import displacy
from collections import Counter
nlp = spacy.load('en_core_web_sm')

In [67]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
print([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [68]:
def NerForWord(text):
    """Return the label of the first named entity spaCy finds in `text`.

    Returns '' when no entity is found.
    """
    entities = nlp(text).ents

    #TODO - Could potentially find multiple entities in the text. We're returning only the first one.
    return entities[0].label_ if len(entities) > 0 else ''

In [69]:
def isSingleToken(text):
    """Tell whether `text` is one unit: a single named entity or a single token.

    Returns True when spaCy finds exactly one entity whose text equals the
    whole input, or when the input parses to exactly one token; otherwise
    False.
    """
    doc = nlp(text)
    entities = doc.ents

    # Case 1: the entire text is a single named entity.
    wholeTextIsOneEntity = len(entities) == 1 and entities[0].text == text

    # Case 2: not a named entity, but still just one token.
    return wholeTextIsOneEntity or len(doc) == 1

In [70]:
# Add the annotation columns with their "not analysed yet" defaults,
# in the order they should appear in the frame.
for columnName, defaultValue in (
    ('isSingleToken', False),
    ('NER', ''),
    ('POS', ''),
    ('TAG', ''),
    ('DEP', ''),
    ('shape', ''),
    ('isAlpha', False),
    ('isStop', False),
):
    answersDf[columnName] = defaultValue

In [71]:
answersDf.head()

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran...",3,False,,,,,,False,False
1,a copper statue of Christ,Immediately in front of the Main Building and ...,5,False,,,,,,False,False
2,the Main Building,Next to the Main Building is the Basilica of t...,3,False,,,,,,False,False
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,...",7,False,,,,,,False,False
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...,7,False,,,,,,False,False


In [72]:
# Annotate the first tenth of the answers with spaCy attributes.
# Only answers that are a single named entity or a single token are
# annotated; POS/TAG/DEP/isAlpha/isStop describe the first token.
singleTokenCount = 0

sampleSize = int(len(answersDf) / 10)

for i in range(sampleSize):

    printPercentage(i, sampleSize)

    answer = answersDf['answer'].iloc[i]
    # .at is label-based, so translate the position into the row's label.
    rowLabel = answersDf.index[i]

    # Parse the answer once and reuse the result for every annotation below
    # (previously the same text went through nlp three times).
    doc = nlp(answer)
    entities = doc.ents

    # Same rule as isSingleToken: the whole text is one named entity, or it
    # parses to exactly one token.
    isWholeEntity = len(entities) == 1 and entities[0].text == answer
    if not (isWholeEntity or len(doc) == 1):
        continue

    # Keep the counter in step with the rows actually annotated.
    singleTokenCount += 1
    answersDf.at[rowLabel, 'isSingleToken'] = True

    # Same rule as NerForWord: label of the first entity, '' if none.
    answersDf.at[rowLabel, 'NER'] = entities[0].label_ if len(entities) > 0 else ''

    firstToken = doc[0]
    answersDf.at[rowLabel, 'POS'] = firstToken.pos_
    answersDf.at[rowLabel, 'TAG'] = firstToken.tag_
    answersDf.at[rowLabel, 'DEP'] = firstToken.dep_
    answersDf.at[rowLabel, 'isAlpha'] = firstToken.is_alpha
    answersDf.at[rowLabel, 'isStop'] = firstToken.is_stop

    # Shapes of all tokens, separated by single spaces.
    answersDf.at[rowLabel, 'shape'] = ' '.join(token.shape_ for token in doc)
        
        

99%


In [74]:
answersDf.head()

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran...",3,False,,,,,,False,False
1,a copper statue of Christ,Immediately in front of the Main Building and ...,5,False,,,,,,False,False
2,the Main Building,Next to the Main Building is the Basilica of t...,3,True,ORG,DET,DT,det,xxx Xxxx Xxxxx,True,True
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,...",7,False,,,,,,False,False
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...,7,False,,,,,,False,False


In [75]:
answersDf['isStop'].value_counts()

False    97688
True       481
Name: isStop, dtype: int64

In [76]:
answersDf['NER'].value_counts()

               93991
CARDINAL         952
PERSON           948
DATE             927
ORG              527
GPE              315
PERCENT          152
MONEY             91
NORP              91
LOC               39
FAC               35
QUANTITY          30
ORDINAL           29
WORK_OF_ART       12
TIME               9
PRODUCT            8
LANGUAGE           7
EVENT              6
Name: NER, dtype: int64

In [77]:
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [78]:
answersDf[answersDf['POS'] == 'NUM'].sample(n=10, random_state=16)

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
1168,1918,"In June 1917, the U.S. Congress passed the Esp...",1,True,DATE,NUM,CD,ROOT,dddd,False,False
1548,22 February 1810,The parish baptismal record gives his birthday...,3,True,DATE,NUM,CD,nummod,dd Xxxxx dddd,False,False
1134,6.3 percent,"According to the 2010 Census, 89.4 percent of ...",2,True,PERCENT,NUM,CD,nummod,d.d xxxx,False,False
1551,29,"Fryderyk Chopin was born in Żelazowa Wola, 46 ...",1,True,CARDINAL,NUM,CD,ROOT,dd,False,False
2519,2006,In 2006 Apple presented a special edition for ...,1,True,DATE,NUM,CD,ROOT,dddd,False,False
7573,9.9 million,"The first show drew 9.9 million viewers, givin...",2,True,CARDINAL,NUM,CD,compound,d.d xxxx,False,False
4221,2012,"In 2012, New York City topped the first Global...",1,True,,NUM,CD,ROOT,dddd,False,False
3443,18498,"According to Chinese state officials, the quak...",1,True,CARDINAL,NUM,CD,ROOT,"dd,ddd",False,False
2069,65,The last opus number that Chopin himself used ...,1,True,CARDINAL,NUM,CD,ROOT,dd,False,False
7580,21.7 million,The season attracted an average of 21.7 millio...,2,True,CARDINAL,NUM,CD,compound,dd.d xxxx,False,False


In [79]:
def highlightAnswers(titleId, paragraphId):
    """Display a paragraph as Markdown with the first answer of every question in bold.

    Answer positions come from each question's first answer
    ('answer_start' plus the length of its 'text'). Overlapping or touching
    answers are highlighted as one bold run so no text is repeated.
    """
    paragraphEntry = df['data'][titleId]['paragraphs'][paragraphId]
    paragraph = paragraphEntry['context']

    #Get answer starts and ends
    spans = []
    for qa in paragraphEntry['qas']:
        firstAnswer = qa['answers'][0]
        answerStart = firstAnswer['answer_start']
        spans.append((answerStart, answerStart + len(firstAnswer['text'])))

    #Display the highlighted text
    display(Markdown(_boldSpans(paragraph, spans)))


def _boldSpans(text, spans):
    """Return `text` with every (start, end) span wrapped in '**', merging overlapping or touching spans."""
    merged = []
    for start, end in sorted(spans):
        if end <= start:
            # Zero-length answer: nothing to highlight.
            continue
        if merged and start <= merged[-1][1]:
            # Overlaps or touches the previous span: extend it.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    #Append text between previous span and current span + bold sign + span + bold sign
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append('**' + text[start:end] + '**')
        cursor = end

    #Append the remaining text after the last span
    pieces.append(text[cursor:])

    return ''.join(pieces)

In [80]:
titleId = 24
paragraphId = 0

highlightAnswers(titleId, paragraphId)

Located approximately 250 kilometres (**160** mi) east of Puerto Rico and the nearer Virgin Islands, St. Barthélemy lies immediately southeast of the islands of Saint Martin and Anguilla. It is one of **the Renaissance** Islands. St. Barthélemy is separated from Saint Martin by **the Saint-Barthélemy Channel**. It lies northeast of Saba and St Eustatius, and north of St Kitts. Some small **satellite islets** belong to St. Barthélemy including Île Chevreau (Île Bonhomme), Île Frégate, Île Toc Vers, Île Tortue and Gros Îlets (Îlots Syndare). A much bigger islet, Île Fourchue, lies on the north of the island, in the Saint-Barthélemy Channel. Other rocky islets which include Coco, the Roques (or **little Turtle rocks**), the Goat, and the Sugarloaf.