### Step 1: Connect to Colab and GCP

In [1]:
#This will clone the BERT Repo

!git clone https://github.com/google-research/bert.git

Cloning into 'bert'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 317.20 KiB | 7.93 MiB/s, done.
Resolving deltas: 100% (185/185), done.


In [2]:
#Mount my drive so that I can access the split training sets. 

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Download the SQUAD train and dev dataset

# I do not need the training set since I am using the split version above. 
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

# Still download the Dev set.
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

--2020-07-24 15:18:55--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2020-07-24 15:18:56 (58.3 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]

--2020-07-24 15:18:56--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘dev-v2.0.json’


2020-07-24 15:18:57 (18.1 MB/s) - ‘dev-v2.0.json’ saved [4370528/4370528]



In [4]:
# Necessary installs so I can mount the files from my bucket onto colab

from google.colab import auth
auth.authenticate_user()

!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   653  100   653    0     0  36277      0 --:--:-- --:--:-- --:--:-- 36277
OK
105 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 105 not upgraded.
Need to get 4,278 kB of archives.
After this operation, 12.8 MB of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 144465 files and directories currently installed.)
Preparing to unpack .../gcsfuse_0.30.0_amd64.deb ...
Unpacking gcsfuse (0.30.0) ...
Setting up gcsfuse (0.30.0) ...


In [5]:
# Make a folder for the bucket, this will have all of the files inside. 

!mkdir folderOnColab
!gcsfuse thaddeussegura_final_project folderOnColab 

Using mount point: /content/folderOnColab
Opening GCS connection...
Opening bucket...
Mounting file system...
File system has been successfully mounted.


### Step 2: Helper Functions

In [6]:
#imports 

import json
import numpy as np
import pandas as pd
import collections, functools, operator 

In [45]:
# These are a number of helper functions that will be used below to combine the predictions.

#helper function to open json
def open_json(path):
    with open(path) as json_file:
        temp_json = json.load(json_file)
        return temp_json

#generate a list of file paths.
def generate_file_list(n):
  list_of_files = []
  if n == 1:
    for i in range(4):
      path = 'folderOnColab/self_ensemble_1/checkpoint'+str(i)+'/nbest_predictions.json'
      list_of_files.append(path)
  else:
    for i in range(n):
      path = 'folderOnColab/self_ensemble_'+str(n)+'/'+str(n)+'_way_'+str(i)+'/'+str(n)+'_way_'+str(i)+'_n_preds.json'
      list_of_files.append(path)
  return list_of_files

#extract the predicted text from each of the prediction files.
def extract_probs(data, top_n):
  new_dict = {}
  for key in data:
      sub_dict = {}
      if len(data[key]) >= top_n:
        for i in range(top_n):
            sub_dict[data[key][i]['text']] = data[key][i]['probability']
        new_dict[key] = sub_dict
      else: 
        sub_dict[data[key][0]['text']] = data[key][0]['probability']
        new_dict[key] = sub_dict
  return new_dict

#Get a dictonary of the keys and the sum of all the probabilities.
def sum_probs(dict_list):
  new_dict = {}
  for key in dict_list[0]:
      #go through all dictonaries in the list looking at that key.
      #add them into a list of dicts for map reduce
      kv_list = []
      for dictonary in dict_list:
          kv_list.append(dictonary[key])
      result = dict(functools.reduce(operator.add, map(collections.Counter, kv_list)))
      new_dict[key] = result
  return new_dict

#take the dictonary of probabilities and return the highest value.
# this will be passed into output_predictions to turn it back into a JSON file
#So that it can then be evaluated. 
def get_preds(prob_dict):
  predictions = {}
  for key in prob_dict:
      predictions[key] = max(prob_dict[key].items(), key=operator.itemgetter(1))[0]
  return predictions

#dump the prediction dict into a json file.
def output_predictions(predictions, file_name):
    with open(file_name, 'w', encoding = 'utf-8') as json_file:
        json.dump(predictions, json_file, ensure_ascii=True)

#this is used for factoring in the classification model. 
def classification_weight(file, weight, n_models):
  #append the null prediction to each question in the same
  #format used for the other models.
  #determine a weight by multipliying the classification by
  #an input weight * number of models to normalize.
  new_dict = {}
  data = open_json(file)
  for key in data:
    if data[key] == "":
      sub_dict = {"" : 1*weight*n_models}
    else:
      sub_dict = {"" : 0}
    new_dict[key] = sub_dict
  return new_dict

In [46]:
#this will loop through the self ensemble models. 
def full_loop(splits, top_n, file_name):
  #get the file list
  file_list = generate_file_list(splits)
  dict_list = []
  #open each file, and append the top n preds with probs for each 
  for f in file_list:
    data = open_json(f)
    probs = extract_probs(data, top_n)
    dict_list.append(probs)
  #map reduce to find sum across keys within each question
  prob_dict = sum_probs(dict_list)
  #pull out the top selection with the proper key
  predictions = get_preds(prob_dict)
  #output predictions as the file name
  output_predictions(predictions, file_name)

#this is if I want to combine multiple models from various folders.
def manual_list_loop(file_list, top_n, file_name):
  dict_list = []
  #open each file, and append the top n preds with probs for each 
  for f in file_list:
    data = open_json(f)
    probs = extract_probs(data, top_n)
    dict_list.append(probs)
  #map reduce to find sum across keys within each question
  prob_dict = sum_probs(dict_list)
  #pull out the top selection with the proper key
  predictions = get_preds(prob_dict)
  #output predictions as the file name
  output_predictions(predictions, file_name)

#this will add classification into the mix. 
def with_class_loop(file_list, top_n, file_in, weight, out_file_name):
  dict_list = []
  #open each file, and append the top n preds with probs for each 
  for f in file_list:
    data = open_json(f)
    probs = extract_probs(data, top_n)
    dict_list.append(probs)
  class_dict = classification_weight(file_in, weight, len(dict_list))
  dict_list.append(class_dict)
  #map reduce to find sum across keys within each question
  prob_dict = sum_probs(dict_list)
  #pull out the top selection with the proper key
  predictions = get_preds(prob_dict)
  #output predictions as the file name
  output_predictions(predictions, out_file_name)


In [39]:
#generate for SE4, with top 10 
full_loop(4, 10, 'se4_preds.json')

In [47]:
#generate for SE8, with top 3 
full_loop(8, 3, 'se8_preds.json')

In [56]:
#generate for SE1, with top 10
full_loop(1, 10, 'se1_preds.json')

In [49]:
#this is for a manual list to check through.  

path1 = '/content/folderOnColab/overtrain/nbest_predictions.json'
path2 = '/content/folderOnColab/baseline_test2/nbest_predictions.json'
path3 = '/content/folderOnColab/baseline_test3/nbest_predictions.json'
path4 = '/content/folderOnColab/baseline_test4/nbest_predictions.json'
path5 = '/content/folderOnColab/self_ensemble_1/nbest_predictions.json'
path_list = [path2, path3, path4, path5]
#use the top 10 votes from the models selected above
manual_list_loop(path_list, 10, 'man_preds.json')

In [63]:
#add all of the Self Ensemble models into the prediction.  
se4_list = generate_file_list(4)
se8_list = generate_file_list(8)
for l in se4_list:
  path_list.append(l)
for l in se8_list:
  path_list.append(l)

manual_list_loop(path_list, 10, 'man_preds2.json')

In [60]:
#with_class_loop(file_list, top_n, file_in, weight, out_file_name)
class_dict = '/content/drive/My Drive/classification_save/preds.json'

with_class_loop(path_list, 10, class_dict, 0.1, 'class_preds.json')



### Step 3: Test weighted voting

* Self Ensemble
* 4 Way Data Split
* 8 Way data split 
* Misc Models.

In [23]:
# Clone the SQUAD Repo so that I can get the evaluation file. 

!git clone https://github.com/white127/SQUAD-2.0-bidaf.git

Cloning into 'SQUAD-2.0-bidaf'...
remote: Enumerating objects: 125, done.[K
Receiving objects:   0% (1/125)   Receiving objects:   1% (2/125)   Receiving objects:   2% (3/125)   Receiving objects:   3% (4/125)   Receiving objects:   4% (5/125)   Receiving objects:   5% (7/125)   Receiving objects:   6% (8/125)   Receiving objects:   7% (9/125)   Receiving objects:   8% (10/125)   Receiving objects:   9% (12/125)   Receiving objects:  10% (13/125)   Receiving objects:  11% (14/125)   Receiving objects:  12% (15/125)   Receiving objects:  13% (17/125)   Receiving objects:  14% (18/125)   Receiving objects:  15% (19/125)   Receiving objects:  16% (20/125)   Receiving objects:  17% (22/125)   Receiving objects:  18% (23/125)   Receiving objects:  19% (24/125)   Receiving objects:  20% (25/125)   Receiving objects:  21% (27/125)   Receiving objects:  22% (28/125)   Receiving objects:  23% (29/125)   Receiving objects:  24% (30/125)   Receiving objects:  25% (32/125

In [24]:
# Move evaluate-v2.0 into the content folder

%mv /content/SQUAD-2.0-bidaf/evaluate-v2.0.py /content/

In [40]:
# Evaluate the Results. 

print("Results for SE 4, 10 weighted votes")
!python evaluate-v2.0.py dev-v2.0.json se4_preds.json

Results for SE 4, 5 weighted votes
{
  "exact": 73.69662258906763,
  "f1": 77.04776635528744,
  "total": 11873,
  "HasAns_exact": 74.59514170040485,
  "HasAns_f1": 81.30703946294318,
  "HasAns_total": 5928,
  "NoAns_exact": 72.80067283431455,
  "NoAns_f1": 72.80067283431455,
  "NoAns_total": 5945
}


In [67]:
print("Results for SE 8, 5 weighted votes")
!python evaluate-v2.0.py dev-v2.0.json se8_preds.json 

Results for SE 8, 5 weighted votes
Traceback (most recent call last):
  File "evaluate-v2.0.py", line 276, in <module>
    main()
  File "evaluate-v2.0.py", line 235, in main
    with open(OPTS.pred_file) as f:
FileNotFoundError: [Errno 2] No such file or directory: 'se8_preds.json'


In [57]:
print("Results for SE 1, weighted votes")
!python evaluate-v2.0.py dev-v2.0.json se1_preds.json

Results for SE 1, weighted votes
{
  "exact": 40.46155141918639,
  "f1": 44.57816406295368,
  "total": 11873,
  "HasAns_exact": 80.97165991902834,
  "HasAns_f1": 89.2166906071945,
  "HasAns_total": 5928,
  "NoAns_exact": 0.0672834314550042,
  "NoAns_f1": 0.0672834314550042,
  "NoAns_total": 5945
}


In [70]:
print("Results for manual list, weighted votes")
!python evaluate-v2.0.py dev-v2.0.json man_preds.json

Results for manual list, weighted votes
{
  "exact": 78.80906257896066,
  "f1": 81.80238664792371,
  "total": 11873,
  "HasAns_exact": 78.67746288798921,
  "HasAns_f1": 84.67269511990524,
  "HasAns_total": 5928,
  "NoAns_exact": 78.94028595458369,
  "NoAns_f1": 78.94028595458369,
  "NoAns_total": 5945
}


In [64]:
print("Results for All Models, 10 weighted votes")
!python evaluate-v2.0.py dev-v2.0.json man_preds2.json

Results for All Models, 10 weighted votes
{
  "exact": 75.0357955024004,
  "f1": 77.8917591375123,
  "total": 11873,
  "HasAns_exact": 76.26518218623482,
  "HasAns_f1": 81.9852996355741,
  "HasAns_total": 5928,
  "NoAns_exact": 73.8099243061396,
  "NoAns_f1": 73.8099243061396,
  "NoAns_total": 5945
}


### Step 4: Include weight from Sequence Model.

In [79]:
print("Results for Top Models + classification @ 20% weight")
with_class_loop(path_list, 10, class_dict, 0.20, 'class_preds.json')
!python evaluate-v2.0.py dev-v2.0.json class_preds.json 



Results for Top Models + classification @ 20% weight
{
  "exact": 79.03646930009265,
  "f1": 81.9903969939297,
  "total": 11873,
  "HasAns_exact": 78.18825910931174,
  "HasAns_f1": 84.10458561216727,
  "HasAns_total": 5928,
  "NoAns_exact": 79.88225399495374,
  "NoAns_f1": 79.88225399495374,
  "NoAns_total": 5945
}
