In [1]:
# coding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals



import argparse
import logging
import os
from pathlib import Path
import random
from io import open
import pickle
import math

import numpy as np
import requests


In [2]:
# Root logger: timestamped INFO-level records in the form
# "<date time> - <level> - <logger name> -   <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)


In [3]:
# All locations are resolved relative to the directory the kernel was started in.
_CURPATH = Path.cwd()

# SQuAD v2.0 file names and the download location of the training split.
_TRAINFILE = "train-v2.0.json"
_TESTFILE = "dev-v2.0.json"
_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/" + _TRAINFILE

# Local directories (note: _TMPDIR and _DATADIR point at the same folder).
_TMPDIR = _CURPATH.joinpath("squad_data")
_DATADIR = _CURPATH.joinpath("squad_data")
_TRAINDIR = _TMPDIR.joinpath("squad_train")
_MODELS = _CURPATH.joinpath("models")

In [13]:
def maybe_download(directory, filename, uri, timeout=60):
    """Download ``uri`` to ``directory/filename`` unless that file already exists.

    Args:
        directory: Target directory (str or path-like); created if missing.
        filename: Name of the file inside ``directory``.
        uri: URL fetched with an HTTP GET when the file is not present yet.
        timeout: Seconds handed to ``requests.get`` so an unresponsive
            server cannot block the notebook indefinitely.

    Returns:
        The local file path as built by ``os.path.join(directory, filename)``.

    Raises:
        Whatever ``requests`` raises for a failed request, including the
        error from ``raise_for_status()`` on an HTTP error status. Nothing
        is written to disk in that case.
    """
    filepath = os.path.join(directory, filename)
    if not os.path.exists(directory):
        logger.info(f"Creating new dir: {directory}")
        # exist_ok guards against the directory appearing between check and creation.
        os.makedirs(directory, exist_ok=True)
    if not os.path.exists(filepath):
        logger.info("Downloading und unpacking file, as file does not exist yet")
        r = requests.get(uri, allow_redirects=True, timeout=timeout)
        # Fail before writing: otherwise an HTTP error page would be stored as
        # the dataset and never re-downloaded, because the file then exists.
        r.raise_for_status()
        # Context manager closes (and flushes) the file handle deterministically.
        with open(filepath, "wb") as out_file:
            out_file.write(r.content)

    return filepath

In [14]:
# Fetch the SQuAD training file into the data directory (no-op if it is already there).
filename = maybe_download(directory=_TMPDIR, filename=_TRAINFILE, uri=_URL)

04/10/2019 23:24:47 - INFO - __main__ -   Downloading und unpacking file, as file does not exist yet


In [5]:
import json

# For every JSON dataset in _TMPDIR, print the number of paragraphs per article
# and bucket all paragraph contexts by their length in whitespace-separated words.
for files in sorted(os.listdir(_TMPDIR)):
    file_path = _TMPDIR / files
    # Only regular *.json files are datasets. The directory may also contain
    # sub-directories (_TRAINDIR lives inside _TMPDIR) or other files that
    # open()/json.load() cannot handle.
    if file_path.suffix.lower() != ".json" or not file_path.is_file():
        continue

    with open(file_path, "r", encoding="utf-8") as json_file:
        # Keep the complete parsed document in data_dict (do not rebind it to
        # the "data" list): a later cell reads data_dict["data"].
        data_dict = json.load(json_file)

    articles = data_dict["data"]
    number_articles = len(articles)
    hundreds = 0      # contexts with at most 100 words
    twohundredf = 0   # contexts with 101 to 149 words
    rest = 0          # contexts with 150 or more words

    for article, entry in enumerate(articles):
        paragraphs = entry["paragraphs"]
        cur_number_context = len(paragraphs)
        print(f"This is article number {article}")
        print(cur_number_context)

        for paragraph in paragraphs:
            num = len(paragraph["context"].split())
            if num < 101:
                hundreds += 1
            elif num < 150:
                twohundredf += 1
            else:
                rest += 1

    # Per-file summary of the three length buckets.
    print(f"Hundreds is {hundreds}")
    print(f"twohundreds is {twohundredf}")
    print(f"Rest is {rest}")
                

# "data" holds the list of articles.
# Each article has a "title" and "paragraphs"; each paragraph holds "qas" and "context" (the paragraph text).
# "qas" is a list of dicts with the questions and their answers.


# TODO: let's move all of this into the get_item function, so that we get new data each epoch without rebuilding.
# rebuild should then only save the JSON file -- or we may not need rebuild at all.


This is article number 0
66
109
160
176
93
124
228
239
134
184
143
181
204
107
150
312
178
119
67
100
157
51
197
106
264
188
26
160
133
103
151
101
77
36
326
166
79
97
140
127
114
153
95
105
129
99
98
91
59
81
99
205
96
229
198
193
98
125
78
113
148
128
135
50
52
149
286
This is article number 1
82
142
160
141
75
77
128
79
95
101
182
82
103
135
87
149
158
96
71
79
229
224
301
108
131
217
177
217
128
129
66
65
136
194
85
57
32
74
41
75
67
102
72
39
56
129
169
61
84
88
140
82
49
96
90
38
132
92
112
136
97
40
73
134
105
66
57
46
42
130
117
83
225
131
167
124
92
150
111
89
132
84
95
This is article number 2
72
181
142
123
57
74
74
114
133
178
54
117
79
69
94
100
144
74
208
151
223
237
140
162
89
64
122
172
102
39
96
201
94
77
81
48
96
116
102
50
136
161
91
75
197
53
63
220
329
164
192
167
181
192
225
208
74
106
141
128
85
172
167
65
194
127
78
102
346
132
239
228
76
This is article number 3
60
69
44
43
84
83
229
187
86
70
179
48
95
56
108
35
135
75
72
100
67
62
72
162
117
191
51
173
89
27


94
42
44
46
62
55
51
26
31
66
49
27
45
33
80
50
31
76
39
This is article number 56
79
124
129
96
104
90
65
127
37
66
64
153
204
71
217
151
26
89
155
27
225
321
133
152
92
138
81
142
134
44
102
68
125
37
102
51
101
175
102
196
54
85
144
108
25
61
104
236
77
145
157
32
117
72
132
129
191
117
78
56
61
129
85
75
54
47
105
102
47
44
98
143
140
110
76
74
81
80
101
116
This is article number 57
42
99
25
123
137
77
86
51
210
119
37
114
64
78
82
57
58
157
122
75
75
106
90
108
85
138
105
148
87
126
129
162
135
195
69
89
142
130
177
44
47
30
135
This is article number 58
33
41
56
39
59
69
56
26
32
47
69
89
24
70
33
108
61
45
43
52
54
147
107
89
85
21
134
68
28
84
31
30
69
93
This is article number 59
32
47
68
58
85
111
104
122
122
30
60
49
90
82
56
63
56
47
107
63
30
117
33
141
72
60
69
59
59
90
49
39
66
This is article number 60
53
74
60
54
104
72
109
70
78
74
68
72
69
147
59
65
70
100
149
75
87
118
53
58
63
88
64
63
59
121
98
71
102
53
56
65
75
97
93
116
31
52
71
97
50
76
74
75
59
83
123
60
102

135
100
85
99
112
106
121
167
96
108
141
168
124
100
148
98
110
136
195
161
This is article number 121
44
157
106
94
121
78
153
159
268
117
101
87
110
106
186
104
156
93
175
160
73
109
303
125
125
99
106
145
163
253
125
111
87
136
86
195
132
94
111
118
122
106
81
139
154
This is article number 122
24
100
93
98
104
104
90
111
129
162
134
150
96
96
106
86
118
116
129
125
107
91
119
123
103
This is article number 123
23
97
88
141
89
144
92
96
90
91
140
212
80
125
141
162
125
130
97
101
156
127
99
101
This is article number 124
34
153
208
124
158
99
149
89
155
141
126
150
91
118
138
148
161
83
128
81
195
140
187
128
175
123
135
102
108
92
84
97
99
97
139
This is article number 125
12
221
95
108
87
110
100
95
157
105
86
104
222
This is article number 126
20
118
80
117
107
214
96
145
115
94
137
101
91
115
238
143
99
90
156
109
89
This is article number 127
16
86
88
116
167
135
109
169
85
102
118
76
169
98
111
136
84
This is article number 128
21
114
80
84
89
99
132
140
75
82
128
103
138
94
9

104
199
175
154
185
87
87
138
291
199
110
90
95
128
113
210
137
119
109
158
124
87
146
This is article number 171
42
82
168
120
211
90
75
204
110
140
217
113
161
176
118
131
200
133
105
122
119
241
149
110
194
141
109
216
120
142
121
182
152
153
163
170
86
95
108
90
123
104
127
This is article number 172
37
94
90
123
94
122
138
107
129
101
128
95
84
89
138
101
191
112
91
84
101
125
106
108
121
85
148
231
129
87
132
93
129
132
103
117
142
84
This is article number 173
99
251
162
124
128
161
95
127
101
128
96
180
171
124
104
117
121
113
93
123
279
149
150
98
137
83
83
162
154
138
93
113
99
130
153
96
150
159
115
179
190
114
129
127
108
86
107
99
208
195
101
254
297
283
145
83
132
144
83
100
176
96
176
171
149
120
128
192
190
85
180
108
140
126
95
130
109
76
100
105
94
93
190
174
152
114
103
87
92
131
124
126
140
86
104
139
85
87
110
186
This is article number 174
94
183
210
105
118
99
107
90
137
96
137
171
92
130
99
88
273
110
173
83
116
131
133
91
143
151
131
144
177
151
106
179
109
107

149
214
99
This is article number 212
30
96
121
93
149
183
70
131
192
110
132
100
267
135
237
236
101
93
97
191
104
152
143
124
100
173
92
77
99
221
136
This is article number 213
26
101
136
84
124
190
110
82
77
109
78
121
80
125
81
94
118
88
100
87
74
93
111
112
132
117
155
This is article number 214
49
130
101
100
81
100
154
98
89
108
148
93
88
111
139
169
120
207
95
155
143
100
103
143
101
78
142
118
185
112
85
93
136
165
147
105
101
107
105
114
96
79
309
194
128
234
99
97
123
84
This is article number 215
26
116
93
68
195
71
212
88
139
129
111
101
74
130
77
70
91
113
105
77
97
77
138
140
75
179
133
This is article number 216
37
97
121
240
177
109
213
93
174
137
105
210
114
237
166
198
132
163
152
237
132
155
95
82
84
151
126
222
92
116
126
100
200
99
119
130
208
107
This is article number 217
46
111
122
122
261
162
257
139
143
119
128
133
133
159
123
120
126
102
91
125
252
151
146
130
158
106
105
102
206
82
89
103
214
80
99
80
85
219
124
78
358
155
134
115
110
346
126
This is artic

107
151
122
101
167
93
110
207
135
95
119
152
91
88
83
123
96
80
100
115
124
160
102
This is article number 260
23
110
144
100
129
190
91
109
372
104
80
106
128
91
154
94
151
86
147
83
164
139
95
109
This is article number 261
32
106
112
134
79
95
141
176
95
77
162
94
87
136
171
91
140
104
104
127
89
97
89
97
111
82
175
214
145
124
108
119
123
This is article number 262
85
139
164
97
273
189
179
95
127
139
88
182
292
221
152
127
321
99
89
95
171
167
112
104
143
84
126
84
121
124
131
150
132
116
166
201
144
114
112
87
96
97
199
190
113
108
114
95
207
150
93
151
149
130
102
533
111
111
121
105
214
136
161
106
91
72
99
123
81
101
119
86
99
92
86
139
115
104
139
127
100
122
198
88
128
95
This is article number 263
23
107
91
97
104
107
90
87
165
125
137
78
115
90
165
108
114
124
87
150
147
82
86
127
This is article number 264
25
96
95
99
123
87
115
89
99
100
125
94
64
118
156
103
91
123
171
108
128
105
65
152
104
111
This is article number 265
27
105
141
108
120
150
138
81
176
188
120
80
10

191
91
112
106
90
119
113
140
117
123
93
89
562
163
101
160
91
144
202
123
127
149
136
200
269
156
132
This is article number 307
60
127
118
179
133
134
107
121
104
78
145
142
79
115
151
125
176
196
109
98
93
88
137
102
111
162
180
144
145
199
141
118
108
163
265
114
135
86
195
99
113
88
139
168
100
132
172
181
108
130
95
232
176
121
115
89
233
99
128
125
93
This is article number 308
65
102
88
87
88
70
86
251
160
89
131
113
121
104
115
94
87
116
177
140
188
111
188
81
133
98
174
172
96
324
179
156
231
281
114
119
118
85
184
157
111
169
142
111
187
181
126
129
84
90
100
131
197
99
166
95
157
112
119
245
109
127
74
109
110
90
This is article number 309
94
80
131
92
95
118
94
178
90
92
80
79
96
119
106
76
104
163
177
128
151
91
166
101
97
105
118
99
67
191
171
185
140
91
134
147
78
162
101
84
99
103
140
91
85
96
155
142
77
122
101
114
116
152
168
183
164
198
157
111
93
175
141
121
88
105
224
129
84
109
89
161
122
259
128
231
163
71
95
88
138
83
119
81
130
76
99
144
99
130
137
130
79
139


83
120
154
83
95
86
102
101
126
102
81
98
85
99
108
106
95
163
100
89
144
83
113
187
129
113
154
156
89
143
149
91
135
88
86
115
101
92
110
113
91
This is article number 357
92
109
183
122
156
101
75
138
124
189
187
130
83
97
173
165
134
218
146
121
188
111
123
104
85
152
90
111
113
164
176
123
97
95
101
167
85
88
142
132
178
128
210
109
79
120
116
109
113
101
88
129
160
215
177
174
118
87
181
130
247
101
108
103
103
92
154
146
118
146
131
77
110
118
102
275
287
112
108
89
105
105
104
222
119
102
113
90
174
259
103
192
94
This is article number 358
31
145
107
90
95
97
80
222
94
80
81
75
89
164
127
109
89
90
87
137
120
77
106
84
97
113
81
169
149
83
83
96
This is article number 359
31
123
161
126
144
101
141
99
87
103
137
147
105
161
189
102
183
118
172
110
177
82
128
89
132
195
95
92
96
116
92
96
This is article number 360
65
130
121
135
152
108
152
175
172
185
101
94
130
168
112
110
147
187
98
116
137
176
103
97
144
155
90
96
138
182
131
90
83
191
159
84
110
139
100
113
89
112
236
110

132
103
124
194
197
78
168
86
87
81
103
92
64
110
88
73
103
92
104
134
112
101
114
111
137
100
91
113
102
142
95
99
82
100
134
155
181
106
128
199
84
149
80
202
84
141
98
156
96
139
100
102
175
122
113
102
This is article number 404
43
89
103
106
205
92
236
94
131
145
159
113
87
202
139
85
156
146
117
103
142
92
155
130
121
97
100
93
103
111
73
101
144
146
122
111
97
165
80
95
115
120
195
85
This is article number 405
21
122
161
108
106
107
84
131
167
196
85
152
138
145
148
89
116
87
83
81
133
154
This is article number 406
62
88
139
112
158
92
124
78
101
83
124
97
79
166
75
182
116
118
85
119
262
146
205
288
160
237
136
103
288
83
150
184
88
122
156
219
221
131
126
96
112
198
166
172
88
86
138
123
130
140
114
174
142
77
89
105
95
106
133
89
219
197
84
This is article number 407
81
86
236
216
99
220
177
139
83
122
97
257
93
84
151
113
91
125
108
338
171
151
210
242
104
104
407
94
292
190
246
87
87
120
207
101
232
110
147
122
219
82
101
146
192
179
135
146
99
106
135
88
255
100
96
131
1

In [29]:
# `data_dict` may be either the full parsed SQuAD document (a dict with a
# "data" key) or already the article list stored under "data", depending on
# what the statistics cell above left behind. Accept both so this cell also
# works after Restart & Run All.
a = data_dict["data"] if isinstance(data_dict, dict) else data_dict

In [30]:
# Number of entries in `a` (the recorded output below shows 442).
len(a)

442

In [33]:
# Inspect the first entry; per the output below it is a dict with
# 'title' and 'paragraphs' keys.
a[0]

{'title': 'Beyoncé',
 'paragraphs': [{'qas': [{'question': 'When did Beyonce start becoming popular?',
     'id': '56be85543aeaaa14008c9063',
     'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
     'is_impossible': False},
    {'question': 'What areas did Beyonce compete in when she was growing up?',
     'id': '56be85543aeaaa14008c9065',
     'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
     'is_impossible': False},
    {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
     'id': '56be85543aeaaa14008c9066',
     'answers': [{'text': '2003', 'answer_start': 526}],
     'is_impossible': False},
    {'question': 'In what city and state did Beyonce  grow up? ',
     'id': '56bf6b0f3aeaaa14008c9601',
     'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
     'is_impossible': False},
    {'question': 'In which decade did Beyonce become famous?',
     'id': '56bf6b0f3aeaaa14008c9602',
     'answers': [{'text

In [36]:
# Context text of the first paragraph of the first entry.
a[0]["paragraphs"][0]["context"]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [37]:
# Number of paragraphs in the first entry (66 in the recorded output below).
len(a[0]["paragraphs"])

66