In [1]:
import pandas as pd
import json

In [2]:
def read_data(file_path, num_records = 100):
    """
    Reads the data from the jsonl file and returns a pandas dataframe
    :param file_path: File path of the jsonl file
    :param num_records: Number of records to read from the file
    :return: Pandas dataframe
    """
    current_record = 1
    records = []
    
    with open(file_path) as file:
        line = file.readline()
        while(line):
            records.append(json.loads(line))
            line = file.readline()
            if current_record > num_records:
                break             
            current_record = current_record + 1
    df = pd.DataFrame(records)
    return df

In [3]:
df = read_data('data/train.jsonl', 100)

In [4]:
df.head()

Unnamed: 0,document_text,long_answer_candidates,question_text,annotations,document_url,example_id
0,Email marketing - Wikipedia <H1> Email marketi...,"[{'start_token': 14, 'top_level': True, 'end_t...",which is the most common use of opt-in e-mail ...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Em...,5655493461695504401
1,The Mother ( How I Met Your Mother ) - wikiped...,"[{'start_token': 28, 'top_level': True, 'end_t...",how i.met your mother who is the mother,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Th...,5328212470870865242
2,Human fertilization - wikipedia <H1> Human fer...,"[{'start_token': 14, 'top_level': True, 'end_t...",what type of fertilisation takes place in humans,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Hu...,4435104480114867852
3,List of National Football League career quarte...,"[{'start_token': 28, 'top_level': True, 'end_t...",who had the most wins in the nfl,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Li...,5289242154789678439
4,Roanoke Colony - wikipedia <H1> Roanoke Colony...,"[{'start_token': 32, 'top_level': True, 'end_t...",what happened to the lost settlement of roanoke,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Ro...,5489863933082811018


In [5]:
df['question_text'][0]

'which is the most common use of opt-in e-mail marketing'

In [6]:
df['document_text'][0:3]

0    Email marketing - Wikipedia <H1> Email marketi...
1    The Mother ( How I Met Your Mother ) - wikiped...
2    Human fertilization - wikipedia <H1> Human fer...
Name: document_text, dtype: object

In [7]:
df['document_text'][0]

"Email marketing - Wikipedia <H1> Email marketing </H1> Jump to : navigation , search <Table> <Tr> <Td> </Td> <Td> ( hide ) This article has multiple issues . Please help improve it or discuss these issues on the talk page . ( Learn how and when to remove these template messages ) <Table> <Tr> <Td> </Td> <Td> This article needs additional citations for verification . Please help improve this article by adding citations to reliable sources . Unsourced material may be challenged and removed . ( September 2014 ) ( Learn how and when to remove this template message ) </Td> </Tr> </Table> <Table> <Tr> <Td> </Td> <Td> This article possibly contains original research . Please improve it by verifying the claims made and adding inline citations . Statements consisting only of original research should be removed . ( January 2015 ) ( Learn how and when to remove this template message ) </Td> </Tr> </Table> ( Learn how and when to remove this template message ) </Td> </Tr> </Table> <Table> <Tr> <T

In [8]:
df['annotations'][0]

[{'yes_no_answer': 'NONE',
  'long_answer': {'start_token': 1952,
   'candidate_index': 54,
   'end_token': 2019},
  'short_answers': [{'start_token': 1960, 'end_token': 1969}],
  'annotation_id': 593165450220027640}]

In [9]:
df['yes_no_answer'] = [item[0]['yes_no_answer'] for item in df['annotations']]
df['long_answer'] = [item[0]['long_answer'] for item in df['annotations']]
df['short_answers'] = [item[0]['short_answers'] for item in df['annotations']]
df['annotation_id'] = [item[0]['annotation_id'] for item in df['annotations']]

In [10]:
df['yes_no_answer'].value_counts()

yes_no_answer
NONE    99
NO       2
Name: count, dtype: int64

In [11]:
start_vals = []
end_vals = []

for item in df['short_answers']:
    start = -1
    end = -1
    if len(item) > 0:
        start = item[0]['start_token']
        end = item[0]['end_token']
    start_vals.append(start)
    end_vals.append(end)
df['short_answer_start'] = start_vals
df['short_answer_end'] = end_vals

In [12]:
df.head()

Unnamed: 0,document_text,long_answer_candidates,question_text,annotations,document_url,example_id,yes_no_answer,long_answer,short_answers,annotation_id,short_answer_start,short_answer_end
0,Email marketing - Wikipedia <H1> Email marketi...,"[{'start_token': 14, 'top_level': True, 'end_t...",which is the most common use of opt-in e-mail ...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Em...,5655493461695504401,NONE,"{'start_token': 1952, 'candidate_index': 54, '...","[{'start_token': 1960, 'end_token': 1969}]",593165450220027640,1960,1969
1,The Mother ( How I Met Your Mother ) - wikiped...,"[{'start_token': 28, 'top_level': True, 'end_t...",how i.met your mother who is the mother,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Th...,5328212470870865242,NONE,"{'start_token': 212, 'candidate_index': 15, 'e...","[{'start_token': 213, 'end_token': 215}]",12034874153783787365,213,215
2,Human fertilization - wikipedia <H1> Human fer...,"[{'start_token': 14, 'top_level': True, 'end_t...",what type of fertilisation takes place in humans,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Hu...,4435104480114867852,NONE,"{'start_token': 319, 'candidate_index': 24, 'e...",[],10527123009892725162,-1,-1
3,List of National Football League career quarte...,"[{'start_token': 28, 'top_level': True, 'end_t...",who had the most wins in the nfl,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Li...,5289242154789678439,NONE,"{'start_token': 509, 'candidate_index': 59, 'e...","[{'start_token': 512, 'end_token': 514}]",14634796365152556576,512,514
4,Roanoke Colony - wikipedia <H1> Roanoke Colony...,"[{'start_token': 32, 'top_level': True, 'end_t...",what happened to the lost settlement of roanoke,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Ro...,5489863933082811018,NONE,"{'start_token': -1, 'candidate_index': -1, 'en...",[],11038549994888625916,-1,-1


In [13]:
df_ = df[['document_text', 'question_text', 'short_answer_start', 'short_answer_end']]

In [25]:
df_.head()

Unnamed: 0,document_text,question_text,short_answer_start,short_answer_end
0,Email marketing - Wikipedia <H1> Email marketi...,which is the most common use of opt-in e-mail ...,1960,1969
1,The Mother ( How I Met Your Mother ) - wikiped...,how i.met your mother who is the mother,213,215
2,Human fertilization - wikipedia <H1> Human fer...,what type of fertilisation takes place in humans,-1,-1
3,List of National Football League career quarte...,who had the most wins in the nfl,512,514
4,Roanoke Colony - wikipedia <H1> Roanoke Colony...,what happened to the lost settlement of roanoke,-1,-1


In [27]:
df_['document_text'][0].split()[1960:1969]

['a',
 'newsletter',
 'sent',
 'to',
 'an',
 'advertising',
 'firm',
 "'s",
 'customers']