In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# https://github.com/zzeng13/DISC

In [2]:
!pip install transformers
!pip install sentencepiece



In [3]:
from transformers import BertTokenizer, XLNetTokenizer
import json 
from tqdm import tqdm
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import numpy as np
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# I/O Helper functions

def read_json_lines(path_to_file): 
    with open(path_to_file) as f:
        content = f.readlines()
    f.close()
    raw_data  = [json.loads(x) for x in content] 
    return raw_data

def read_json_file(path):
    with open(path, 'r') as f:
        return json.load(f)
    
def write_json_file(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)
    return


In [5]:
!mkdir /home/data
!mkdir /home/data/processed
import requests 
file_url = "https://raw.githubusercontent.com/zzeng13/DISC/master/data/MAGPIE/raw/MAGPIE_filtered_split_random_raw_processed.json"
	
r = requests.get(file_url, stream = True) 

with open("/home/data/filtered_split_random_raw_processed.json", "wb") as file: 
	for block in r.iter_content(chunk_size = 1024): 
		if block: 
			file.write(block) 

mkdir: cannot create directory ‘/home/data’: File exists
mkdir: cannot create directory ‘/home/data/processed’: File exists


In [6]:
# Settings 
split = 'random'

# Input Paths
PATH_TO_DATA = '/home/data/filtered_split_{}_raw_processed.json'.format(split)

# Output paths
PATH_TO_SAVE_DATA = '/home/data/processed/{}_data.json'.format(split)
PATH_TO_SAVE_TARGET_VOCAB = '/home/data/processed/target_vocab.json'
PATH_TO_SAVE_GLOVE_VOCAB = '/home/data/processed/{}_glove_vocab.json'.format(split)
PATH_TO_SAVE_CHAR_VOCAB = '/home/data/processed/{}_char_vocab.json'.format(split)
PATH_TO_SAVE_GLOVE_EMB = '/home/data/processed/{}_glove_embed.npy'.format(split)
PATH_TO_SAVE_POS_VOCAB = '/home/data/processed/{}_pos_vocab.json'.format(split)
PATH_TO_SAVE_DATA_IDX = '/home/data/processed/{}_data_idx.json'.format(split)
PATH_TO_SAVE_XLNET_VOCAB = '/home/data/processed/{}_xlnet_vocab.json'.format(split)
PATH_TO_SAVE_XLNET_INDICES = '/home/data/processed/{}_xlnet_indices.json'.format(split)

# Other settings 
max_seq_len = 50

## 1. Constructing dictionaries

In [7]:
target_vocab = {
    '<PAD>': 0,
    '<s>': 1, 
    '<e>': 2,
    '<l>': 3,
    '<i>': 4
}


## 2. Model construction and sentence tokenization

In [8]:
# initialize tokenizer
# model parameters
pretrained_model_name = 'bert-base-uncased'
# tokenizer parameters
do_lower_case = True
# model and tokenizer initialization
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, 
                                          do_lower_case = do_lower_case)

# load processed data 
raw_data = read_json_file(PATH_TO_DATA)


In [9]:
raw_data['train'][12]

{'genre': 'W fict prose',
 'id': 13,
 'idiom': 'in the hole',
 'label': 'l',
 'offsets': [[27, 29], [32, 36]],
 'sentence': "At least back when I lived in a hole I knew everything there was to know about living in a hole , and now it 's a year later and I 'm at a place so far away I do n't even know how far away it is , watching something I do n't understand go to a place so far up there is no down .",
 'split': 'training'}

In [10]:
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased', do_lower_case = True)


In [11]:
def process_source_and_target_sequence(raw_data, new_data, data_indices): 
    num_exceptions = 0
    for data_entry in tqdm(raw_data): 
        # ==============================================================================
        # STRING PRE-PROCESSING
        # ------------------------------------------------------------------------------
        try: 
            # extract raw source and target sentence
            source_sentence = data_entry['sentence'].lower()
            if len(source_sentence.split()) > max_seq_len: 
                continue 
            label = data_entry['label']
            data_id = data_entry['id']
            
            offsets = [data_entry['offsets'][0][0], data_entry['offsets'][-1][0]]

            while source_sentence[offsets[0]] != " " and offsets[0] != -1: 
                offsets[0] -= 1
            offsets[0] += 1
            while offsets[1] != len(source_sentence) and source_sentence[offsets[1]] not in [" ", ",", ".", "’"]: 
                offsets[1] += 1

            if source_sentence[offsets[0]] in ["‘", ","]: 
                offsets[0] += 1
            if source_sentence[offsets[1]-1] in ["’", "!", '?', ')']: 
                offsets[1] -= 1

            idiom_in_sentence = ''.join(source_sentence[offsets[0]: offsets[1]].split())
            target_sentence = source_sentence[: offsets[0]] + ''.join([' [SEP]' for i in source_sentence[offsets[0]: offsets[1]].split()]) + source_sentence[offsets[1]:]

            def apply_contraction_change(s): 
                s = s.replace(" n't", "n't")
                s = s.replace("\n", "")
                s = s.replace("‘", " ‘ ")
                s = s.replace("’", " ’ ")
                s = s.replace(",", " , ")
                s = s.replace(".", " . ")
                s = s.replace('?', ' ? ')
                s = s.replace('!', ' ! ')
                s = s.replace('-', ' - ')
                return s
            # apply contraction 
            source_sentence = apply_contraction_change(source_sentence)   
            source_sequence_xlnet = xlnet_tokenizer.batch_encode_plus([source_sentence])['input_ids'][0]
            target_sentence = apply_contraction_change(target_sentence) 


            # ==============================================================================
            # TOKENIZATION
            # ------------------------------------------------------------------------------

            # BERT TOKENIZATION 
            # ------------------------------------------------
            # process source sequence 
            source_sequence = tokenizer.batch_encode_plus([source_sentence])['input_ids'][0]
            target_sequence = tokenizer.batch_encode_plus([target_sentence])['input_ids'][0]

            # GLOVE TOKENIZATION
            # ------------------------------------------------
            # add start and end symbols
            source_sentence_glove_tknz =  word_tokenize(source_sentence)

            # add start and end symbols
            tags_tokens = nltk.pos_tag(source_sentence_glove_tknz)
            source_sentence_glove_tknz =[t[0] for t in tags_tokens]
            source_sentence_pos_taggs = [t[1] for t in tags_tokens]
            source_sentence_glove_tknz = ['<S>'] + source_sentence_glove_tknz + ['<E>']
            source_sentence_pos_taggs = ['<S>'] + source_sentence_pos_taggs + ['<E>']

            # STRING TOKENIZATION
            # ------------------------------------------------
            source_sentence_char_tknz = []
            for word in source_sentence_glove_tknz: 
                if word in ['<S>', '<E>']: 
                    source_sentence_char_tknz.append(['<SPEC>'])
                else: 
                    source_sentence_char_tknz.append(list(word))
            #print(source_sentence_char_tknz)
            #raise

            # ==============================================================================
            # PROCESS TARGET SEQUENCE
            # ------------------------------------------------------------------------------

            # Generate target sentence
            # identify the start and end indices 
            target_indices = []
            for t_idx, t in enumerate(target_sequence): 
                if tokenizer._convert_id_to_token(t) == '[SEP]': 
                    target_indices.append(t_idx)
            target_indices = [min(target_indices[:-1]), max(target_indices[:-1])]
            idiom_in_target = ''.join([tokenizer._convert_id_to_token(t) for t in source_sequence[target_indices[0]: target_indices[1]+1]]).replace('#', '')
            while idiom_in_target != idiom_in_sentence: 
                target_indices[1] += 1
                idiom_in_target = ''.join([tokenizer._convert_id_to_token(t) for t in source_sequence[target_indices[0]: target_indices[1]+1]]).replace('#', '')
                if target_indices[1] > len(source_sequence): 
                    raise

            # process target sequence
            target_sequence = [target_vocab['<s>']] + \
                              [target_vocab['<l>'] for i in source_sequence[1:-1]] + \
                              [target_vocab['<e>']]
            if label == 'i': 
                for t_idx in range(target_indices[0], target_indices[1]+1): 
                    target_sequence[t_idx] = target_vocab['<i>']
                # print([tokenizer._convert_id_to_token(source_sequence[t_idx]) for t_idx, t in enumerate(target_sequence) if t == target_vocab['<i>']])
            try: 
                # assert len(source_sequence) == len(source_sentence_glove_tknz)
                assert len(source_sequence) == len(target_sequence)
            except: 
                # print(' '.join(source_sentence_bert_tknz))
                print(' '.join(source_sentence_glove_tknz))
                raise
            new_data_entry = [source_sequence, source_sentence_glove_tknz, source_sentence_char_tknz, source_sentence_pos_taggs, target_sequence, label, source_sequence_xlnet]
            new_data.append(new_data_entry)
            data_indices.append(data_id)
        except:
            num_exceptions += 1
        
    print('Final Number of Exceptions: {}'.format(num_exceptions)) 

In [12]:
# data processing 
processed_data = {'train': [], 'valid': [], 'test': []}
data_indices = {'train': [], 'valid': [], 'test': []}
process_source_and_target_sequence(raw_data['train'], processed_data['train'], data_indices['train'])
process_source_and_target_sequence(raw_data['test'], processed_data['valid'], data_indices['valid'])

print('Final Number of train data: {}'.format(len(processed_data['train'])))
print('Final Number of test data: {}'.format(len(processed_data['test'])))
print('Final Number of valid data: {}'.format(len(processed_data['valid'])))

100%|██████████| 35533/35533 [02:10<00:00, 272.17it/s]


Final Number of Exceptions: 25


100%|██████████| 4451/4451 [00:15<00:00, 279.48it/s]

Final Number of Exceptions: 4
Final Number of train data: 32162
Final Number of test data: 0
Final Number of valid data: 4030





In [13]:
# data_indices['train']

## XLNET DATA PROCESSING

In [14]:
raw_data['train'][0]['sentence']

'For example , with fell running and mountain marathons gaining in popularity , how about some ideas for safe running off the beaten track ?'

In [15]:
s = xlnet_tokenizer.batch_encode_plus("<S> " + raw_data['train'][0]['sentence'] + " <E>")

In [16]:
s

{'input_ids': [[7739, 4, 3], [17, 23, 4, 3], [17, 3151, 4, 3], [4, 3], [2830, 4, 3], [17, 155, 4, 3], [17, 213, 4, 3], [4, 3], [17, 93, 4, 3], [3512, 4, 3], [24, 4, 3], [17, 98, 4, 3], [943, 4, 3], [17, 368, 4, 3], [17, 93, 4, 3], [4, 3], [17, 19, 4, 3], [4, 3], [17, 694, 4, 3], [17, 150, 4, 3], [17, 46, 4, 3], [17, 409, 4, 3], [4, 3], [2830, 4, 3], [17, 93, 4, 3], [17, 368, 4, 3], [17, 368, 4, 3], [4, 3], [17, 213, 4, 3], [17, 660, 4, 3], [17, 180, 4, 3], [17, 180, 4, 3], [17, 150, 4, 3], [17, 180, 4, 3], [17, 299, 4, 3], [4, 3], [24, 4, 3], [17, 180, 4, 3], [17, 66, 4, 3], [4, 3], [17, 98, 4, 3], [17, 155, 4, 3], [17, 660, 4, 3], [17, 180, 4, 3], [17, 46, 4, 3], [24, 4, 3], [17, 150, 4, 3], [17, 180, 4, 3], [4, 3], [17, 98, 4, 3], [24, 4, 3], [17, 213, 4, 3], [24, 4, 3], [17, 46, 4, 3], [17, 409, 4, 3], [17, 155, 4, 3], [17, 180, 4, 3], [17, 23, 4, 3], [4, 3], [17, 299, 4, 3], [24, 4, 3], [17, 150, 4, 3], [17, 180, 4, 3], [17, 150, 4, 3], [17, 180, 4, 3], [17, 299, 4, 3], [4, 3], [17

In [17]:
write_json_file(PATH_TO_SAVE_DATA_IDX, data_indices)

In [18]:
print(processed_data['train'][0])

[[101, 2005, 2742, 1010, 2007, 3062, 2770, 1998, 3137, 8589, 2015, 8550, 1999, 6217, 1010, 2129, 2055, 2070, 4784, 2005, 3647, 2770, 2125, 1996, 7854, 2650, 1029, 102], ['<S>', 'for', 'example', ',', 'with', 'fell', 'running', 'and', 'mountain', 'marathons', 'gaining', 'in', 'popularity', ',', 'how', 'about', 'some', 'ideas', 'for', 'safe', 'running', 'off', 'the', 'beaten', 'track', '?', '<E>'], [['<SPEC>'], ['f', 'o', 'r'], ['e', 'x', 'a', 'm', 'p', 'l', 'e'], [','], ['w', 'i', 't', 'h'], ['f', 'e', 'l', 'l'], ['r', 'u', 'n', 'n', 'i', 'n', 'g'], ['a', 'n', 'd'], ['m', 'o', 'u', 'n', 't', 'a', 'i', 'n'], ['m', 'a', 'r', 'a', 't', 'h', 'o', 'n', 's'], ['g', 'a', 'i', 'n', 'i', 'n', 'g'], ['i', 'n'], ['p', 'o', 'p', 'u', 'l', 'a', 'r', 'i', 't', 'y'], [','], ['h', 'o', 'w'], ['a', 'b', 'o', 'u', 't'], ['s', 'o', 'm', 'e'], ['i', 'd', 'e', 'a', 's'], ['f', 'o', 'r'], ['s', 'a', 'f', 'e'], ['r', 'u', 'n', 'n', 'i', 'n', 'g'], ['o', 'f', 'f'], ['t', 'h', 'e'], ['b', 'e', 'a', 't', 'e', 'n

## 3. Constructing glove dictionary 

In [19]:
def get_glove_vocab(raw_dataset):
    """
    return vocab set, and prints out the vocab size
    :param raw_dataset: a list of lists: each inner list is a triple:
                a sentence: string
                a list of labels:
                a list of pos:
    :return: a set: the vocabulary in the raw_dataset
    """
    vocab = []
    for example in raw_dataset:
        vocab.extend(example[1])  # index 1 is the source sentence tkn for glove
    vocab = set(vocab)
    vocab.remove('<S>')
    vocab.remove('<E>')
    print("vocab size: ", len(vocab))
    return vocab

def get_glove_word2idx_idx2word(vocab):
    """
    :param vocab: a set of strings: vocabulary
    :return: word2idx: string to an int
             idx2word: int to a string
    """
    word2idx = {"<PAD>": 0, "<UNK>": 1, '<S>': 2, '<E>': 3}
    idx2word = {0: "<PAD>", 1: "<UNK>", 2: '<S>', 3: '<E>'}
    for word in vocab:
        assigned_index = len(word2idx)
        word2idx[word] = assigned_index
        idx2word[assigned_index] = word
    return word2idx, idx2word

In [20]:
glove_vocab =  get_glove_vocab(processed_data['train'] + processed_data['valid'])

vocab size:  38667


In [21]:
word2idx_glove, idx2word_glove = get_glove_word2idx_idx2word(glove_vocab)
len(word2idx_glove.keys())


38671

In [22]:
glove_vocab = list(glove_vocab)
glove_tknz_idx = 1
# replace the words with indices 
for i, d in enumerate(processed_data['train'] ): 
    toknz_sent = []
    for w in d[glove_tknz_idx]: 
        toknz_sent.append(word2idx_glove[w])
    d[glove_tknz_idx] = toknz_sent
    processed_data['train'][i] = d
    
for i, d in enumerate(processed_data['valid'] ): 
    toknz_sent = []
    for w in d[glove_tknz_idx]: 
        toknz_sent.append(word2idx_glove[w])
    d[glove_tknz_idx] = toknz_sent
    processed_data['valid'][i] = d

## 4. Constructing character dictionary

In [23]:
char_tknz_idx = 2
def get_char_vocab(raw_dataset):
    """
    return vocab set, and prints out the vocab size
    :param raw_dataset: a list of lists: each inner list is a triple:
                a sentence: string
                a list of labels:
                a list of pos:
    :return: a set: the vocabulary in the raw_dataset
    """
    vocab = []
    for example in raw_dataset:
        #print(example)
        for cs in example[char_tknz_idx]: 
         #   print(cs)
            vocab.extend(cs)  # index 2 is the source sentence tkn for char tokens
        
    vocab = set(vocab)

    vocab.remove('<SPEC>')
    # vocab.remove('<E>')
    print("vocab size: ", len(vocab))
    return vocab

def get_char_word2idx_idx2word(vocab):
    """
    :param vocab: a set of strings: vocabulary
    :return: word2idx: string to an int
             idx2word: int to a string
    """
    word2idx = {"<PAD>": 0, "<UNK>": 1, '<SPEC>': 2}
    idx2word = {0: "<PAD>", 1: "<UNK>", 2: '<SPEC>'}
    for word in vocab:
        assigned_index = len(word2idx)
        word2idx[word] = assigned_index
        idx2word[assigned_index] = word
    return word2idx, idx2word


In [24]:
char_vocab =  get_char_vocab(processed_data['train'] + processed_data['valid'])

vocab size:  117


In [25]:
word2idx_char, idx2word_char = get_char_word2idx_idx2word(char_vocab)

In [26]:
char_vocab = list(char_vocab)
# replace the words with indices 
for i, d in enumerate(processed_data['train'] ): 
    toknz_sent = []
    for w in d[char_tknz_idx]:
        toknz_word = []
        for c in w: 
            toknz_word.append(word2idx_char[c])
        toknz_sent.append(toknz_word)
    d[char_tknz_idx] = toknz_sent
    processed_data['train'][i] = d
    
for i, d in enumerate(processed_data['valid'] ): 
    toknz_sent = []
    for w in d[char_tknz_idx]:
        toknz_word = []
        for c in w: 
            toknz_word.append(word2idx_char[c])
        toknz_sent.append(toknz_word)
    d[char_tknz_idx] = toknz_sent
    processed_data['valid'][i] = d

In [27]:
processed_data['train'][12]

[[101,
  1998,
  2059,
  1010,
  2065,
  2673,
  2743,
  2995,
  2000,
  2433,
  1010,
  1996,
  6151,
  8067,
  5999,
  5726,
  2015,
  2052,
  9498,
  2091,
  2000,
  1037,
  4899,
  4873,
  2041,
  1997,
  4356,
  1012,
  102],
 [2,
  27056,
  20196,
  27017,
  37231,
  11065,
  30994,
  33869,
  36425,
  37024,
  27017,
  7154,
  14695,
  5781,
  33944,
  26692,
  29959,
  36425,
  30560,
  15543,
  15643,
  17028,
  27035,
  11145,
  13279,
  3],
 [[2],
  [17, 115, 53],
  [82, 70, 75, 115],
  [56],
  [61, 80],
  [75, 90, 75, 113, 96, 82, 70, 61, 115, 74],
  [113, 17, 115],
  [82, 113, 95, 75],
  [82, 114],
  [80, 114, 113, 22],
  [56],
  [82, 70, 75],
  [95, 115, 53, 17, 22, 17, 74, 75, 53],
  [6, 89, 17, 96, 106],
  [32, 114, 95, 89, 53],
  [106, 17, 61, 89],
  [53, 114, 32, 115],
  [82, 114],
  [17],
  [89, 17, 115, 53, 61, 115, 74],
  [106, 114, 22, 75, 32, 70, 75, 113, 75],
  [114, 95, 82],
  [114, 80],
  [106, 61, 74, 70, 82],
  [29],
  [2]],
 ['<S>',
  'CC',
  'RB',
  ',',
 

## 4. Constructing POS tag dictionary

In [28]:
pos_tag_idx = 3

def get_pos_vocab(raw_dataset):
    """
    return vocab set, and prints out the vocab size
    :param raw_dataset: a list of lists: each inner list is a triple:
                a sentence: string
                a list of labels:
                a list of pos:
    :return: a set: the vocabulary in the raw_dataset
    """
    vocab = []
    for example in raw_dataset:
        vocab.extend(example[pos_tag_idx])  # index 3 is the source sentence tkn - pos tags 
    vocab = set(vocab)
    vocab.remove('<S>')
    vocab.remove('<E>')
    print("vocab size: ", len(vocab))
    return vocab

def get_pos_word2idx_idx2word(vocab):
    """
    :param vocab: a set of strings: vocabulary
    :return: word2idx: string to an int
             idx2word: int to a string
    """
    word2idx = {"<PAD>": 0, "<UNK>": 1, '<S>': 2, '<E>': 3}
    idx2word = {0: "<PAD>", 1: "<UNK>", 2: '<S>', 3: '<E>'}
    for word in vocab:
        assigned_index = len(word2idx)
        word2idx[word] = assigned_index
        idx2word[assigned_index] = word
    return word2idx, idx2word

In [29]:
pos_vocab =  get_pos_vocab(processed_data['train'] + processed_data['valid'])
word2idx_pos, idx2word_pos = get_pos_word2idx_idx2word(pos_vocab)


vocab size:  44


In [30]:
pos_vocab = list(pos_vocab)
# replace the words with indices 
for i, d in enumerate(processed_data['train'] ): 
    toknz_sent = []
    for w in d[pos_tag_idx]: 
        toknz_sent.append(word2idx_pos[w])
    d[pos_tag_idx] = toknz_sent
    processed_data['train'][i] = d
    
for i, d in enumerate(processed_data['valid'] ): 
    toknz_sent = []
    for w in d[pos_tag_idx]: 
        toknz_sent.append(word2idx_pos[w])
    d[pos_tag_idx] = toknz_sent
    processed_data['valid'][i] = d



## 5. Creating glove embedding

In [31]:
import os
import urllib.request
import mmap
urllib.request.urlretrieve('https://nlp.stanford.edu/data/glove.840B.300d.zip','glove.840B.300d.zip')

('glove.840B.300d.zip', <http.client.HTTPMessage at 0x7fc6ab925750>)

In [38]:
!unzip "glove.840B.300d.zip" -d "/home/"

Archive:  glove.840B.300d.zip
  inflating: /home/glove.840B.300d.txt  


In [39]:
PATH_TO_STATIC_GLOVE_EMBEDDINGS = '/home/glove.840B.300d.txt'
GLOVE_EMBEDDING_DIM = 300
GLOVE_EMBED_NORAM = False

In [40]:
def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

In [41]:
glove_vectors = {}
print('Loading Pre-trained GLOVE word Embeddings...')
with open(PATH_TO_STATIC_GLOVE_EMBEDDINGS) as glove_file:
    for line in tqdm(glove_file, total=get_num_lines(PATH_TO_STATIC_GLOVE_EMBEDDINGS)):
        split_line = line.rstrip().split()
        word = split_line[0]
        if len(split_line) != (GLOVE_EMBEDDING_DIM + 1) or word not in word2idx_glove:
            continue
        assert (len(split_line) == GLOVE_EMBEDDING_DIM + 1)
        vector = np.array([float(x) for x in split_line[1:]], dtype="float32")
        if GLOVE_EMBED_NORAM:
            vector = vector / np.linalg.norm(vector)
        assert len(vector) == GLOVE_EMBEDDING_DIM
        glove_vectors[word] = vector
print("Number of pre-trained word vectors loaded: ", len(glove_vectors))


Loading Pre-trained GLOVE word Embeddings...


100%|██████████| 2196017/2196017 [00:41<00:00, 52667.17it/s]

Number of pre-trained word vectors loaded:  35040





In [42]:
all_embeddings = np.array(list(glove_vectors.values()))
embeddings_mean = float(np.mean(all_embeddings))
embeddings_stdev = float(np.std(all_embeddings))
print("Embeddings mean: ", embeddings_mean)
print("Embeddings stdev: ", embeddings_stdev)

# Randomly initialize an embedding matrix of (vocab_size, embedding_dim) shape
# with a similar distribution as the pretrained embeddings for words in vocab.
vocab_size = len(word2idx_glove)
embedding_matrix = np.random.normal(embeddings_mean, embeddings_stdev, size=(vocab_size, GLOVE_EMBEDDING_DIM))
# Go through the embedding matrix and replace the random vector with a
# pretrained one if available. Start iteration at 2 since 0, 1 are PAD, UNK
hit, miss = 0, 0 
for i in range(2, vocab_size):
    word = idx2word_glove[i]
    if word in glove_vectors:
        hit += 1
        embedding_matrix[i] = np.array(glove_vectors[word])
    else: 
        miss += 1
        
if GLOVE_EMBED_NORAM:
    for i in range(vocab_size):
        embedding_matrix[i] = embedding_matrix[i] / float(np.linalg.norm(embedding_matrix[i]))

print('Glove Embedding shape: ')
print(embedding_matrix.shape)
print('Hit ratio: {}'.format(hit/(hit + miss)))


Embeddings mean:  -0.0039416104555130005
Embeddings stdev:  0.39029982686042786
Glove Embedding shape: 
(38671, 300)
Hit ratio: 0.9061522149525459


In [44]:
write_json_file(PATH_TO_SAVE_DATA, processed_data)
write_json_file(PATH_TO_SAVE_TARGET_VOCAB, target_vocab)
write_json_file(PATH_TO_SAVE_GLOVE_VOCAB, idx2word_glove)
write_json_file(PATH_TO_SAVE_CHAR_VOCAB, idx2word_char)
write_json_file(PATH_TO_SAVE_POS_VOCAB, idx2word_pos)
#write_json_file(PATH_TO_SAVE_XLNET_VOCAB, xlnet_processed_data)
#write_json_file(PATH_TO_SAVE_XLNET_INDICES, xlnet_data_indices)
np.save(PATH_TO_SAVE_GLOVE_EMB, embedding_matrix)


In [45]:
!zip -r /home/data/processed.zip /home/data/processed


updating: home/data/processed/ (stored 0%)
updating: home/data/processed/random_xlnet_indices.json (deflated 98%)
updating: home/data/processed/random_xlnet_vocab.json (deflated 82%)
updating: home/data/processed/random_glove_embed.npy (deflated 41%)
updating: home/data/processed/random_data_idx.json (deflated 66%)
updating: home/data/processed/random_char_vocab.json (deflated 66%)
updating: home/data/processed/random_data.json (deflated 75%)
updating: home/data/processed/random_pos_vocab.json (deflated 61%)
updating: home/data/processed/read_comp_target_vocab.json (deflated 27%)
updating: home/data/processed/random_glove_vocab.json (deflated 64%)
updating: home/data/processed/target_vocab.json (deflated 27%)


In [48]:
from google.colab import files
files.download('/home/data/processed.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
files.download('/home/data/processed.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!rm /home/data/processed.zip

In [None]:
with open('/home/processed_data.txt', 'w') as f:
    f.write(str(processed_data))
f.close()

In [None]:
files.download('/home/processed_data.txt')

In [47]:
print(processed_data['train'][0])

[[101, 2005, 2742, 1010, 2007, 3062, 2770, 1998, 3137, 8589, 2015, 8550, 1999, 6217, 1010, 2129, 2055, 2070, 4784, 2005, 3647, 2770, 2125, 1996, 7854, 2650, 1029, 102], [2, 31739, 20464, 27017, 30437, 5996, 29425, 27056, 17314, 23914, 38102, 28004, 7232, 27017, 5888, 21726, 18865, 26766, 31739, 15209, 29425, 2031, 7154, 19753, 4061, 2317, 3], [[2], [80, 114, 113], [75, 110, 17, 22, 93, 89, 75], [56], [32, 61, 82, 70], [80, 75, 89, 89], [113, 95, 115, 115, 61, 115, 74], [17, 115, 53], [22, 114, 95, 115, 82, 17, 61, 115], [22, 17, 113, 17, 82, 70, 114, 115, 106], [74, 17, 61, 115, 61, 115, 74], [61, 115], [93, 114, 93, 95, 89, 17, 113, 61, 82, 96], [56], [70, 114, 32], [17, 99, 114, 95, 82], [106, 114, 22, 75], [61, 53, 75, 17, 106], [80, 114, 113], [106, 17, 80, 75], [113, 95, 115, 115, 61, 115, 74], [114, 80, 80], [82, 70, 75], [99, 75, 17, 82, 75, 115], [82, 113, 17, 6, 12], [107], [2]], [2, 20, 7, 6, 20, 40, 31, 38, 7, 45, 31, 20, 7, 6, 33, 20, 28, 45, 20, 4, 31, 18, 28, 7, 7, 44, 3]