In [1]:
import os
from argparse import Namespace
import json
from collections import defaultdict
from itertools import chain
import math
import json
import random
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import re
import string

import numpy as np
import torch
import torch.nn.functional as F

from transformers import GPT2Tokenizer

import logging
from pprint import pformat

# Module-level logger shared by the notebook's cells.
logger = logging.getLogger("FinetuningGPT2")

# Report which GPU is active. Querying the current device raises when no CUDA
# device is present, so check availability first to keep this cell runnable on
# CPU-only machines.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("CUDA is not available; running on CPU")



Tesla T4


In [3]:
# Special tokens that mark up the dialogue sequences, followed by the names of
# the tensors fed to the model.
SPECIAL_TOKENS = [
    "<bos>",
    "<eos>",
    "<speaker1>",
    "<speaker2>",
    "<pad>",
]

# Mapping passed to tokenizer.add_special_tokens() by add_special_tokens().
# NOTE(review): '<eou>' is registered here but is absent from SPECIAL_TOKENS —
# confirm this difference is intended.
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': [
        '<speaker1>',
        '<speaker2>',
        '<eou>',
    ],
}

# Names of every model input, and the subset listed as padded inputs.
MODEL_INPUTS = [
    "input_ids",
    "mc_token_ids",
    "lm_labels",
    "mc_labels",
    "token_type_ids",
]
PADDED_INPUTS = [
    "input_ids",
    "lm_labels",
    "token_type_ids",
]

def add_special_tokens(model, tokenizer):
    """
    Add special tokens to the tokenizer and the model if they have not already been added.

    Args:
        model: object exposing ``resize_token_embeddings``; may be ``None`` when
            only the tokenizer needs to be extended.
        tokenizer: tokenizer exposing ``add_special_tokens`` and ``__len__``.

    Returns:
        None. The tokenizer (and the model, when given) are modified in place.
    """
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
    if num_added_tokens > 0 and model is not None:
        # Size the embedding matrix from the tokenizer's full length after the
        # additions. Counting only the base vocabulary plus the tokens added
        # in this call would miss any tokens the tokenizer already carried.
        model.resize_token_embeddings(new_num_tokens=len(tokenizer))

In [5]:
# Load the pretrained GPT-2 tokenizer and register the special tokens on it.
# No model is passed, so only the tokenizer is extended and nothing is resized.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
add_special_tokens(model=None, tokenizer=tokenizer)

## Creating Data

In [23]:
# Read the raw training split from disk and parse it.
with open('data/squad-train.json', 'r') as json_file:
    train_stuff = json.loads(json_file.read())

# Same for the raw validation split.
with open('data/squad-valid.json', 'r') as json_file:
    valid_stuff = json.loads(json_file.read())

In [24]:
def get_distractor(qas, index):
    """
    Pick a random wrong-answer text for the question at position ``index``.

    Args:
        qas: list of question dicts. Each carries an 'is_impossible' flag and,
            when answerable, an 'answers' list whose entries have a 'text' key.
        index: position in ``qas`` of the question being asked; it is never
            chosen as the distractor.

    Returns:
        The first answer text of a uniformly chosen answerable question other
        than ``index``, or the fixed string "I think it's France." when fewer
        than three questions in ``qas`` are answerable.
    """
    answerable = [i for i, qa in enumerate(qas) if not qa['is_impossible']]
    if len(answerable) < 3:
        return "I think it's France."
    # Draw directly from the valid positions. Retrying a random index through
    # recursion until it is valid can exceed the recursion limit when only a
    # few entries of a long list are eligible.
    pool = [i for i in answerable if i != index]
    return qas[random.choice(pool)]['answers'][0]['text']

def create_qa_dataset(data):
    """
    Turn SQuAD-style article groups into one dialogue record per paragraph.

    Each record holds the paragraph 'context', the article title as 'topic',
    and a list of 'utterances'. Every utterance has the question as a one-item
    'history' and three 'candidates', the last of which is the reference reply.
    Unanswerable questions use "I'm sorry, I don't know." as the reference reply.
    """
    unknown_reply = "I'm sorry, I don't know."
    records = []
    for article in tqdm(data):
        title = article['title']
        for paragraph in article['paragraphs']:
            record = {
                'context': paragraph['context'],
                'topic': title,
                'utterances': [],
            }
            questions = paragraph['qas']
            for position, qa in enumerate(questions):
                question = qa['question']
                if not qa['is_impossible']:
                    # Answerable: "don't know" and one distractor are the wrong options.
                    reference = qa['answers'][0]['text']
                    options = [unknown_reply, get_distractor(questions, position), reference]
                else:
                    # Unanswerable: two distractors, with "don't know" as the reference.
                    reference = unknown_reply
                    options = [
                        get_distractor(questions, position),
                        get_distractor(questions, position),
                        reference,
                    ]
                record['utterances'].append({'history': [question], 'candidates': options})
            records.append(record)
    return records
            
            

In [33]:
# Build the training dialogue records and save them.
squad_train = create_qa_dataset(train_stuff['data'])
with open('data/squad_train.json', 'w') as json_file:
    json.dump(squad_train, json_file, indent=2)

# Build the validation dialogue records and save them. The object dumped here
# must be squad_valid; dumping squad_train would make the validation file a
# copy of the training data.
squad_valid = create_qa_dataset(valid_stuff['data'])
with open('data/squad_valid.json', 'w') as json_file:
    json.dump(squad_valid, json_file, indent=2)

## Dataset Tokenizing

In [None]:
def get_tokenized_dataset(tokenizer, dataset_path, split=None):
    """
    Load a JSON dataset and return it with every string replaced by token ids.

    If ``dataset_path`` already ends in 'tokenized.json' it is loaded as-is.
    Otherwise a sibling '<name>_tokenized.json' file is loaded when it exists;
    when it does not, the raw file is tokenized and the result is written to
    that sibling path before being returned.

    Args:
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        dataset_path: path of the raw or already-tokenized JSON file.
        split: optional top-level key; when truthy, only ``dataset[split]`` is
            returned. Defaults to ``None`` (return the whole dataset).

    Returns:
        The tokenized dataset, or its ``split`` entry when ``split`` is given.

    Note:
        An existing tokenized file is used without checking that it matches
        the given tokenizer or the current contents of the raw file.
    """
    # Derive the cache name from the path stem rather than slicing off a fixed
    # five characters, so paths without a '.json' suffix are handled correctly.
    cache_path = os.path.splitext(dataset_path)[0] + '_tokenized.json'

    if dataset_path.endswith('tokenized.json'):
        print("Loading tokenized dataset from " + dataset_path)
        needs_tokenizing = False
    elif os.path.isfile(cache_path):
        print("Detected existing tokenized file.")
        dataset_path = cache_path
        print("Loading dataset from " + dataset_path)
        needs_tokenizing = False
    else:
        print("Loading dataset from " + dataset_path)
        needs_tokenizing = True

    with open(dataset_path, 'r') as f:
        dataset = json.load(f)

    if needs_tokenizing:
        print("Tokenizing the dataset")

        def encode(obj):
            # Recursively walk the JSON structure; strings become id lists,
            # dicts keep their keys, and any other iterable becomes a list.
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, encode(o)) for n, o in obj.items())
            return list(encode(o) for o in obj)

        dataset = encode(dataset)

        print("Saving dataset to " + cache_path)
        with open(cache_path, 'w') as outfile:
            json.dump(dataset, outfile, indent=2)

    if split:
        print("Fetched " + split + " dataset")
        return dataset[split]
    return dataset

In [None]:
# get_tokenized_dataset takes the tokenizer as its first argument, followed by
# the dataset path.
squad_train = get_tokenized_dataset(tokenizer, 'data/squad_train.json')
squad_valid = get_tokenized_dataset(tokenizer, 'data/squad_valid.json')