In [1]:
# Input directories, one pair ('high', 'middle') per data split.
train_dir_high, train_dir_middle = 'train/high/', 'train/middle/'

dev_dir_high, dev_dir_middle = 'dev/high/', 'dev/middle/'

test_dir_high, test_dir_middle = 'test/high/', 'test/middle/'

import json
import nltk
import os

from pytorch_pretrained_bert.tokenization import BertTokenizer

# Load the English Punkt resource through NLTK's data loader; the resulting
# object is used in later cells to split each article into sentences.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def read_race(file_dir, output_file):
    """Collect every JSON file found under ``file_dir`` into one JSON list.

    The directory tree is walked recursively; each file is parsed as JSON and
    the parsed objects are written, as a single list, to ``output_file``.

    Args:
        file_dir: Directory to walk for input files.
        output_file: Path of the JSON file to write (overwritten if it exists).

    Raises:
        json.JSONDecodeError: If any file under ``file_dir`` is not valid JSON.
    """
    output = []

    for root, _dirs, files in os.walk(file_dir):
        for file in files:
            # Join with `root`, not `file_dir`: os.walk descends into
            # sub-directories, and files found there live under `root`.
            with open(os.path.join(root, file), 'r') as f:
                output.append(json.load(f))

    with open(output_file, 'w') as f:
        json.dump(output, f)
        
# Vocabulary file for the tokenizer, given relative to the notebook's working directory.
bert_vocab = '../../CoQA-Challenge/BERT/bert-base-uncased-vocab.txt'
# Tokenizer built from that vocabulary; later cells use it to count tokens per sentence.
bert_tokenizer = BertTokenizer.from_pretrained(bert_vocab)

In [2]:
from collections import Counter

#### Collect the individual text files of each split into a single JSON file

In [2]:
# Gather every file under the dev/high directory into dev-high.json.
read_race(dev_dir_high, 'dev-high.json')

In [3]:
# Gather the remaining directories, writing one JSON file per directory.
for source_dir, target_json in (
    (dev_dir_middle, 'dev-middle.json'),
    (test_dir_middle, 'test-middle.json'),
    (test_dir_high, 'test-high.json'),
    (train_dir_middle, 'train-middle.json'),
    (train_dir_high, 'train-high.json'),
):
    read_race(source_dir, target_json)

#### Combine the middle and high datasets for data augmentation

In [3]:
def combine(middle_file, high_file, output_file):
    """Merge two JSON lists into one file, tagging each entry's id by origin.

    Entries loaded from ``middle_file`` get their ``'id'`` prefixed with
    ``'m-'`` and entries from ``high_file`` with ``'h-'``. The middle entries
    come first in the combined list. A one-line summary is printed.

    Args:
        middle_file: Path to a JSON file holding a list of dicts with a string 'id'.
        high_file: Path to a JSON file holding a list of dicts with a string 'id'.
        output_file: Path of the combined JSON file to write (overwritten if it exists).
    """
    # Context managers close the inputs promptly; the previous bare
    # `json.load(open(...))` calls left both file handles unclosed.
    with open(middle_file, 'r') as f:
        middle = json.load(f)
    with open(high_file, 'r') as f:
        high = json.load(f)

    for instance in middle:
        instance['id'] = 'm-' + instance['id']
    for instance in high:
        instance['id'] = 'h-' + instance['id']

    combined = middle + high
    with open(output_file, 'w') as f:
        json.dump(combined, f)
    print(f'Write {middle_file} and {high_file} into {output_file}, in total {len(combined)}')
    
# One combined file per split: (middle input, high input, combined output).
for split_files in (
    ('train-middle.json', 'train-high.json', 'train-combine.json'),
    ('dev-middle.json', 'dev-high.json', 'dev-combine.json'),
    ('test-middle.json', 'test-high.json', 'test-combine.json'),
):
    combine(*split_files)

Write train-middle.json and train-high.json into train-combine.json, in total 25137
Write dev-middle.json and dev-high.json into dev-combine.json, in total 1389
Write test-middle.json and test-high.json into test-combine.json, in total 1407


In [3]:
from tqdm import tqdm

def read_race_articles(input_file):
    """Load a JSON list and keep only each entry's id and article text.

    Args:
        input_file: Path to a JSON file holding a list of dicts that each
            have 'id' and 'article' keys.

    Returns:
        A list of ``{'id': ..., 'article': ...}`` dicts, in file order.
    """
    with open(input_file, 'r') as fin:
        records = json.load(fin)
    # tqdm only wraps the iteration to show progress.
    return [
        {'id': record['id'], 'article': record['article']}
        for record in tqdm(records)
    ]

def read_coqa_articles(input_file):
    """Load a JSON file's 'data' list and expose each story as an article.

    Args:
        input_file: Path to a JSON file whose top-level 'data' key holds a
            list of dicts that each have 'id' and 'story' keys.

    Returns:
        A list of ``{'id': ..., 'article': ...}`` dicts, in file order, where
        'article' carries the entry's 'story' value.
    """
    with open(input_file, 'r') as fin:
        records = json.load(fin)['data']
    # tqdm only wraps the iteration to show progress.
    return [
        {'id': record['id'], 'article': record['story']}
        for record in tqdm(records)
    ]

In [17]:
# Load the RACE files and route each one to the train or dev pool by file name.
race_train_articles = []
race_dev_articles = []
for file in ['dev-high.json', 'dev-middle.json', 'train-middle.json', 'train-high.json']:
    data = read_race_articles(file)
    pool = race_train_articles if 'train' in file else race_dev_articles
    pool.extend(data)

coqa_train = '/home/jiaofangkai/CoQA-Challenge/BERT_RC/data-set/coqa-train-v1.0.json'
coqa_dev = '/home/jiaofangkai/CoQA-Challenge/BERT_RC/data-set/coqa-dev-v1.0.json'

coqa_train_articles = read_coqa_articles(coqa_train)
coqa_dev_articles = read_coqa_articles(coqa_dev)

# CoQA articles come first in each merged list, followed by the RACE articles.
merged_train_articles = coqa_train_articles + race_train_articles
with open('race-coqa-train-articles.json', 'w') as f:
    json.dump(merged_train_articles, f)

merged_dev_articles = coqa_dev_articles + race_dev_articles
with open('race-coqa-dev-articles.json', 'w') as f:
    json.dump(merged_dev_articles, f)

100%|██████████| 1021/1021 [00:00<00:00, 609418.58it/s]
100%|██████████| 368/368 [00:00<00:00, 550072.66it/s]
100%|██████████| 6409/6409 [00:00<00:00, 965078.42it/s]
100%|██████████| 18728/18728 [00:00<00:00, 594362.33it/s]
100%|██████████| 7199/7199 [00:00<00:00, 845745.18it/s]
100%|██████████| 500/500 [00:00<00:00, 336459.49it/s]


In [7]:
coqa_train = '/home/jiaofangkai/CoQA-Challenge/BERT_RC/data-set/coqa-train-v1.0.json'
coqa_dev = '/home/jiaofangkai/CoQA-Challenge/BERT_RC/data-set/coqa-dev-v1.0.json'

# Train is read before dev, so the progress bars appear in that order.
coqa_train_articles, coqa_dev_articles = [
    read_coqa_articles(coqa_path) for coqa_path in (coqa_train, coqa_dev)
]

# Only the dev articles are written out by this cell.
with open('coqa-dev-articles.json', 'w') as f:
    json.dump(coqa_dev_articles, f)

100%|██████████| 7199/7199 [00:00<00:00, 881831.56it/s]
100%|██████████| 500/500 [00:00<00:00, 407372.18it/s]


In [23]:
# Histogram: number of sentences in an article -> how many articles have that count.
len_cnt = Counter()

for instance in tqdm(race_train_articles):
    num_sentences = len(sentence_tokenizer.tokenize(instance['article']))
    len_cnt[num_sentences] += 1

100%|██████████| 25137/25137 [00:16<00:00, 1495.66it/s]


In [26]:
# Counts the distinct keys of len_cnt that exceed 90.
# NOTE(review): this is the number of distinct sentence-count values, not the
# number of articles above 90 sentences -- confirm that is what was intended.
a_cnt = sum(1 for key in len_cnt if key > 90)
print(a_cnt)

4


In [4]:
# Rebuild the RACE article pools; a file goes to the train pool when its name
# contains 'train', otherwise to the dev pool.
race_train_articles, race_dev_articles = [], []
for file in ['dev-high.json', 'dev-middle.json', 'train-middle.json', 'train-high.json']:
    data = read_race_articles(file)
    (race_train_articles if 'train' in file else race_dev_articles).extend(data)

100%|██████████| 1021/1021 [00:00<00:00, 413317.67it/s]
100%|██████████| 368/368 [00:00<00:00, 356549.75it/s]
100%|██████████| 6409/6409 [00:00<00:00, 565588.59it/s]
100%|██████████| 18728/18728 [00:00<00:00, 154299.46it/s]


In [7]:
max_seq_length = 500
# Histogram: sentences per segment -> number of segments with that many sentences.
sen_cnt = Counter()

for instance in tqdm(race_train_articles):
    total_piece = 0
    sentence_num = 0
    for sentence in sentence_tokenizer.tokenize(instance['article']):
        piece_num = len(bert_tokenizer.tokenize(sentence))

        if total_piece + piece_num <= max_seq_length:
            # Sentence still fits into the current segment.
            total_piece += piece_num
            sentence_num += 1
            continue

        # Budget exceeded: close the current segment (recorded only when it
        # holds more than 10 sentences) and start a new one with this sentence.
        if sentence_num > 10:
            sen_cnt[sentence_num] += 1
        total_piece = piece_num
        sentence_num = 1

    # Trailing segment of the article, subject to the same threshold.
    if sentence_num > 10:
        sen_cnt[sentence_num] += 1

print(sum(sen_cnt.values()))
print(json.dumps(sen_cnt, indent=2))

100%|██████████| 25137/25137 [02:55<00:00, 143.40it/s]

21845
{
  "20": 1404,
  "23": 847,
  "13": 1341,
  "11": 1050,
  "12": 1189,
  "14": 1527,
  "16": 1746,
  "27": 405,
  "17": 1693,
  "24": 752,
  "15": 1635,
  "21": 1152,
  "29": 289,
  "26": 497,
  "22": 999,
  "36": 64,
  "25": 668,
  "19": 1502,
  "18": 1656,
  "28": 369,
  "39": 34,
  "30": 227,
  "37": 47,
  "31": 179,
  "33": 115,
  "38": 38,
  "32": 117,
  "40": 34,
  "43": 13,
  "35": 73,
  "34": 101,
  "41": 21,
  "45": 4,
  "44": 13,
  "46": 7,
  "49": 4,
  "48": 4,
  "42": 14,
  "47": 6,
  "57": 1,
  "51": 1,
  "50": 3,
  "52": 2,
  "53": 1,
  "58": 1
}





In [11]:
max_seq_length = 400

train_segments = []

for instance in tqdm(race_train_articles):
    total_piece = 0
    segment = []
    for sentence in sentence_tokenizer.tokenize(instance['article']):
        piece_num = len(bert_tokenizer.tokenize(sentence))

        if total_piece + piece_num <= max_seq_length:
            # Sentence still fits: grow the current segment.
            total_piece += piece_num
            segment.append(sentence)
            continue

        # Budget exceeded: keep the finished segment only when it has more
        # than 10 sentences, then start a new segment with this sentence.
        if len(segment) > 10:
            train_segments.append(" ".join(segment))
        segment = [sentence]
        total_piece = piece_num

    # Trailing segment of the article, subject to the same threshold.
    if len(segment) > 10:
        train_segments.append(" ".join(segment))

with open('train-segments.json', 'w') as f:
    json.dump(train_segments, f)
print(len(train_segments))
    

100%|██████████| 25137/25137 [02:46<00:00, 151.28it/s]


21889


In [12]:
# NOTE: `max_seq_length` is not set in this cell; it is whatever an earlier
# cell left in the kernel.
dev_segments = []
for instance in tqdm(race_dev_articles):
    segment, total_piece = [], 0
    for sentence in sentence_tokenizer.tokenize(instance['article']):
        piece_num = len(bert_tokenizer.tokenize(sentence))
        fits = total_piece + piece_num <= max_seq_length
        if fits:
            segment.append(sentence)
            total_piece += piece_num
        else:
            # Keep the finished segment only when it has more than 10
            # sentences, then restart from the current sentence.
            if len(segment) > 10:
                dev_segments.append(" ".join(segment))
            segment, total_piece = [sentence], piece_num

    # Trailing segment of the article, subject to the same threshold.
    if len(segment) > 10:
        dev_segments.append(" ".join(segment))
with open('dev-segments.json', 'w') as f:
    json.dump(dev_segments, f)
print(len(dev_segments))

100%|██████████| 1389/1389 [00:10<00:00, 132.37it/s]

1188



