In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys

# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA libraries loaded
# later in this kernel, so the notebook runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# Make modules that live in the parent directory importable.
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import json

# Load the first raw Wikipedia file for interactive inspection.
# The context manager closes the file handle as soon as parsing is done
# (the bare `json.load(open(...))` form left it open), and the explicit
# encoding avoids depending on the locale default.
with open('/home/kuratov/data/ThePile/Wikipedia/wikipedia-en-0.json', 'r', encoding='utf-8') as fin:
    wiki = json.load(fin)

In [9]:
# Template of one preprocessed article record: a 'title' and a 'text' field,
# both shown here as empty strings.
{
    'title': '',
    'text': '',
}

{'title': '', 'text': ''}

In [10]:
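# Articles to skip entirely:
#   - titles starting with "List of ..."
#   - disambiguation pages, i.e. texts containing "{title} may refer to:"
# Wiki boilerplate to clean from the remaining articles:
#   - "\nCategory:..." lines
#   - the "\n\nSee also\n\n" section heading and everything after it
#   - the "\n\nReferences\n\n" section heading and everything after it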

In [5]:
import re

# A whole line of the form "Category:<name>"; such lines are dropped from the body.
category_regexp = re.compile(r'\ACategory:[\S ]+\Z')
# Heading line of a trailing boilerplate section; the body is cut at the first hit.
# Wikipedia spells the heading "External links" (sentence case), which the
# previous "External Links"-only pattern never matched. The capitalised
# variant is still accepted for backward compatibility.
keywords_regexp = re.compile(r'\n(References|See also|External [Ll]inks)\s?\n')
# NOTE(review): not used by clean_article. In a raw string `\\n` matches a
# literal backslash followed by "n", not a newline -- confirm the intent
# before relying on this pattern.
title_regexp = re.compile(r'\A[\S ]+?\\n\\n')
# Titles of list articles ("List of ...").
list_of_regexp = re.compile(r'\AList of')


def clean_article(text):
    """Split a raw article into its title and a cleaned body.

    The first line of ``text`` is the title. From the remaining lines every
    ``Category:...`` line is removed, and the body is truncated at the first
    "References", "See also" or "External links" heading line.

    Args:
        text: raw article text, title on the first line.

    Returns:
        A ``(title, body)`` tuple; ``body`` has surrounding whitespace stripped
        and is ``''`` when nothing but the title is left.
    """
    # Everything before the first newline is the title (the whole text if
    # there is no newline at all).
    title, _, body = text.partition('\n')
    kept_lines = [line for line in body.split('\n') if not category_regexp.match(line)]
    body = '\n'.join(kept_lines)
    section = keywords_regexp.search(body)
    if section:
        # Drop the boilerplate heading and everything that follows it.
        body = body[:section.start()]
    return title, body.strip()

In [6]:
# preprocess wiki
from tqdm import tqdm as tqdm
import json

# Template for the raw input files; `{}` is replaced by the file index 0..9.
path = '/home/kuratov/data/ThePile/Wikipedia/wikipedia-en-{}.json'

total_articles = 0      # every article seen in the raw files
filtered_articles = 0   # articles dropped by the filters below

articles = []           # kept articles as {'title': ..., 'text': ...} records

for i in tqdm(range(0, 10)):
    # Close each file right after parsing instead of leaking ten open handles.
    with open(path.format(i), 'r', encoding='utf-8') as fin:
        wiki = json.load(fin)
    for raw_text in wiki:
        total_articles += 1
        title, text = clean_article(raw_text)
        # Skip "List of ..." articles and disambiguation pages.
        if list_of_regexp.search(title) or f'{title} may refer to:' in text:
            filtered_articles += 1
            continue
        articles.append({'title': title, 'text': text})
print(f'{filtered_articles}/{total_articles}')


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:34<05:10, 34.50s/it][A
 20%|██        | 2/10 [01:10<04:40, 35.07s/it][A
 30%|███       | 3/10 [01:46<04:06, 35.15s/it][A
 40%|████      | 4/10 [02:21<03:30, 35.11s/it][A
 50%|█████     | 5/10 [02:57<02:57, 35.56s/it][A
 60%|██████    | 6/10 [03:32<02:20, 35.21s/it][A
 70%|███████   | 7/10 [04:07<01:45, 35.30s/it][A
 80%|████████  | 8/10 [04:44<01:11, 35.84s/it][A
 90%|█████████ | 9/10 [05:19<00:35, 35.50s/it][A
100%|██████████| 10/10 [05:54<00:00, 35.46s/it][A

279906/6033151





In [7]:
# Report dropped / kept / total article counts.
summary = f'{filtered_articles}/{len(articles)}/{total_articles}'
print(summary)

279906/5753245/6033151


In [8]:
# save shards

from pathlib import Path
import random, math

shards_path = '/home/kuratov/data/ThePile/Wikipedia/preprocessed_shards_train'
SHUFFLE_SEED = 42  # fixed so that re-running the cell reproduces the same shards

# exist_ok=True replaces the separate exists() check and is safe to re-run.
Path(shards_path).mkdir(parents=True, exist_ok=True)

# Shuffle in place with a private, seeded generator: the shard assignment is
# reproducible and the global `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(articles)
n_shards = 81  # (we possibly have to take n_gpus shards for validation, so we need more shards) add +1 for validation

# max(1, ...) keeps the range() step valid when `articles` is empty
# (a step of 0 raises ValueError); in that case no shard file is written.
shard_size = max(1, math.ceil(len(articles) / n_shards))

for i, start in enumerate(tqdm(range(0, len(articles), shard_size))):
    # Clamp the end so the last (possibly shorter) shard reports its real bounds.
    end = min(len(articles), start + shard_size)
    with (Path(shards_path) / f'shard_{i:02d}.jsonl').open('w', encoding='utf8') as fout:
        print(start, end)
        for j in range(start, end):
            jsoned = json.dumps(articles[j])
            try:
                # Round-trip check: only lines that parse back are written.
                json.loads(jsoned)
            except json.JSONDecodeError:
                print(f'skipping article {j} in shard {i} cause of JSONDecodeError')
                continue
            fout.write(jsoned+'\n')


  0%|          | 0/81 [00:00<?, ?it/s][A

0 71028



  1%|          | 1/81 [00:02<03:47,  2.85s/it][A

71028 142056



  2%|▏         | 2/81 [00:05<03:44,  2.84s/it][A

142056 213084



  4%|▎         | 3/81 [00:08<03:34,  2.75s/it][A

213084 284112



  5%|▍         | 4/81 [00:10<03:27,  2.70s/it][A

284112 355140



  6%|▌         | 5/81 [00:13<03:26,  2.72s/it][A

355140 426168



  7%|▋         | 6/81 [00:16<03:26,  2.75s/it][A

426168 497196



  9%|▊         | 7/81 [00:19<03:20,  2.71s/it][A

497196 568224



 10%|▉         | 8/81 [00:21<03:13,  2.65s/it][A

568224 639252



 11%|█         | 9/81 [00:23<03:06,  2.58s/it][A

639252 710280



 12%|█▏        | 10/81 [00:26<03:00,  2.54s/it][A

710280 781308



 14%|█▎        | 11/81 [00:28<02:56,  2.51s/it][A

781308 852336



 15%|█▍        | 12/81 [00:31<02:52,  2.50s/it][A

852336 923364



 16%|█▌        | 13/81 [00:33<02:50,  2.51s/it][A

923364 994392



 17%|█▋        | 14/81 [00:36<02:50,  2.55s/it][A

994392 1065420



 19%|█▊        | 15/81 [00:38<02:46,  2.52s/it][A

1065420 1136448



 20%|█▉        | 16/81 [00:41<02:46,  2.56s/it][A

1136448 1207476



 21%|██        | 17/81 [00:44<02:44,  2.57s/it][A

1207476 1278504



 22%|██▏       | 18/81 [00:46<02:45,  2.63s/it][A

1278504 1349532



 23%|██▎       | 19/81 [00:49<02:46,  2.68s/it][A

1349532 1420560



 25%|██▍       | 20/81 [00:52<02:44,  2.69s/it][A

1420560 1491588



 26%|██▌       | 21/81 [00:55<02:41,  2.69s/it][A

1491588 1562616



 27%|██▋       | 22/81 [00:57<02:37,  2.68s/it][A

1562616 1633644



 28%|██▊       | 23/81 [01:00<02:33,  2.65s/it][A

1633644 1704672



 30%|██▉       | 24/81 [01:03<02:31,  2.65s/it][A

1704672 1775700



 31%|███       | 25/81 [01:05<02:26,  2.61s/it][A

1775700 1846728



 32%|███▏      | 26/81 [01:08<02:26,  2.66s/it][A

1846728 1917756



 33%|███▎      | 27/81 [01:10<02:22,  2.63s/it][A

1917756 1988784



 35%|███▍      | 28/81 [01:13<02:19,  2.64s/it][A

1988784 2059812



 36%|███▌      | 29/81 [01:16<02:15,  2.60s/it][A

2059812 2130840



 37%|███▋      | 30/81 [01:18<02:14,  2.64s/it][A

2130840 2201868



 38%|███▊      | 31/81 [01:21<02:10,  2.61s/it][A

2201868 2272896



 40%|███▉      | 32/81 [01:24<02:09,  2.63s/it][A

2272896 2343924



 41%|████      | 33/81 [01:26<02:05,  2.62s/it][A

2343924 2414952



 42%|████▏     | 34/81 [01:29<02:03,  2.62s/it][A

2414952 2485980



 43%|████▎     | 35/81 [01:31<01:58,  2.58s/it][A

2485980 2557008



 44%|████▍     | 36/81 [01:34<01:57,  2.61s/it][A

2557008 2628036



 46%|████▌     | 37/81 [01:36<01:53,  2.58s/it][A

2628036 2699064



 47%|████▋     | 38/81 [01:39<01:51,  2.59s/it][A

2699064 2770092



 48%|████▊     | 39/81 [01:42<01:47,  2.56s/it][A

2770092 2841120



 49%|████▉     | 40/81 [01:44<01:46,  2.61s/it][A

2841120 2912148



 51%|█████     | 41/81 [01:47<01:44,  2.62s/it][A

2912148 2983176



 52%|█████▏    | 42/81 [01:50<01:43,  2.65s/it][A

2983176 3054204



 53%|█████▎    | 43/81 [01:52<01:40,  2.63s/it][A

3054204 3125232



 54%|█████▍    | 44/81 [01:55<01:38,  2.66s/it][A

3125232 3196260



 56%|█████▌    | 45/81 [01:57<01:33,  2.61s/it][A

3196260 3267288



 57%|█████▋    | 46/81 [02:00<01:31,  2.60s/it][A

3267288 3338316



 58%|█████▊    | 47/81 [02:03<01:28,  2.60s/it][A

3338316 3409344



 59%|█████▉    | 48/81 [02:05<01:25,  2.59s/it][A

3409344 3480372



 60%|██████    | 49/81 [02:08<01:22,  2.58s/it][A

3480372 3551400



 62%|██████▏   | 50/81 [02:10<01:20,  2.58s/it][A

3551400 3622428



 63%|██████▎   | 51/81 [02:13<01:18,  2.62s/it][A

3622428 3693456



 64%|██████▍   | 52/81 [02:16<01:17,  2.66s/it][A

3693456 3764484



 65%|██████▌   | 53/81 [02:18<01:13,  2.63s/it][A

3764484 3835512



 67%|██████▋   | 54/81 [02:21<01:10,  2.60s/it][A

3835512 3906540



 68%|██████▊   | 55/81 [02:24<01:07,  2.61s/it][A

3906540 3977568



 69%|██████▉   | 56/81 [02:26<01:04,  2.58s/it][A

3977568 4048596



 70%|███████   | 57/81 [02:29<01:02,  2.62s/it][A

4048596 4119624



 72%|███████▏  | 58/81 [02:31<00:58,  2.56s/it][A

4119624 4190652



 73%|███████▎  | 59/81 [02:34<00:56,  2.57s/it][A

4190652 4261680



 74%|███████▍  | 60/81 [02:36<00:53,  2.54s/it][A

4261680 4332708



 75%|███████▌  | 61/81 [02:39<00:50,  2.55s/it][A

4332708 4403736



 77%|███████▋  | 62/81 [02:41<00:47,  2.52s/it][A

4403736 4474764



 78%|███████▊  | 63/81 [02:44<00:46,  2.58s/it][A

4474764 4545792



 79%|███████▉  | 64/81 [02:47<00:43,  2.58s/it][A

4545792 4616820



 80%|████████  | 65/81 [02:49<00:42,  2.68s/it][A

4616820 4687848



 81%|████████▏ | 66/81 [02:52<00:40,  2.67s/it][A

4687848 4758876



 83%|████████▎ | 67/81 [02:55<00:37,  2.71s/it][A

4758876 4829904



 84%|████████▍ | 68/81 [02:57<00:34,  2.66s/it][A

4829904 4900932



 85%|████████▌ | 69/81 [03:00<00:31,  2.66s/it][A

4900932 4971960



 86%|████████▋ | 70/81 [03:03<00:29,  2.70s/it][A

4971960 5042988



 88%|████████▊ | 71/81 [03:06<00:26,  2.69s/it][A

5042988 5114016



 89%|████████▉ | 72/81 [03:08<00:23,  2.63s/it][A

5114016 5185044



 90%|█████████ | 73/81 [03:11<00:20,  2.60s/it][A

5185044 5256072



 91%|█████████▏| 74/81 [03:13<00:17,  2.57s/it][A

5256072 5327100



 93%|█████████▎| 75/81 [03:16<00:15,  2.62s/it][A

5327100 5398128



 94%|█████████▍| 76/81 [03:18<00:13,  2.62s/it][A

5398128 5469156



 95%|█████████▌| 77/81 [03:21<00:10,  2.62s/it][A

5469156 5540184



 96%|█████████▋| 78/81 [03:24<00:07,  2.59s/it][A

5540184 5611212



 98%|█████████▊| 79/81 [03:26<00:05,  2.60s/it][A

5611212 5682240



 99%|█████████▉| 80/81 [03:29<00:02,  2.56s/it][A

5682240 5753268



100%|██████████| 81/81 [03:31<00:00,  2.61s/it][A


In [23]:
# save shards, with valid shard 0.5 of train_shard size

from pathlib import Path
import random, math

shards_path = '/home/kuratov/data/ThePile/Wikipedia/preprocessed_shards_train'
SHUFFLE_SEED = 42  # fixed so that re-running the cell reproduces the same shards

# exist_ok=True replaces the separate exists() check and is safe to re-run.
Path(shards_path).mkdir(parents=True, exist_ok=True)

# Shuffle in place with a private, seeded generator: the shard assignment is
# reproducible and the global `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(articles)
n_train_shards = 80  # (we possibly have to take n_gpus shards for validation, so we need more shards)
n_valid_shards = 1 #  add +1 for validation, only 1 works currently
valid_shard_size = 0.5 # half of the train shard

# Step 1: size of a shard if all (train + valid) shards were equal.
equal_shard_size = math.ceil(len(articles) / (n_train_shards + n_valid_shards))
# Step 2: set aside `valid_shard_size` of such a shard and spread the rest
# over the train shards. The leftover tail of `articles` then becomes the
# final, shorter shard written by the loop below.
# max(1, ...) keeps the range() step valid when `articles` is empty.
shard_size = max(1, math.ceil((len(articles) - equal_shard_size * valid_shard_size) / n_train_shards))

for i, start in enumerate(tqdm(range(0, len(articles), shard_size))):
    # Clamp the end so the last (shorter) shard reports its real bounds.
    end = min(len(articles), start + shard_size)
    with (Path(shards_path) / f'shard_{i:02d}.jsonl').open('w', encoding='utf8') as fout:
        print(start, end)
        for j in range(start, end):
            jsoned = json.dumps(articles[j])
            try:
                # Round-trip check: only lines that parse back are written.
                json.loads(jsoned)
            except json.JSONDecodeError:
                print(f'skipping article {j} in shard {i} cause of JSONDecodeError')
                continue
            fout.write(jsoned+'\n')


  0%|          | 0/81 [00:00<?, ?it/s][A

0 71472



  1%|          | 1/81 [00:02<03:25,  2.57s/it][A

71472 142944



  2%|▏         | 2/81 [00:05<03:21,  2.56s/it][A

142944 214416



  4%|▎         | 3/81 [00:07<03:21,  2.59s/it][A

214416 285888



  5%|▍         | 4/81 [00:10<03:16,  2.55s/it][A

285888 357360



  6%|▌         | 5/81 [00:12<03:12,  2.54s/it][A

357360 428832



  7%|▋         | 6/81 [00:15<03:08,  2.52s/it][A

428832 500304



  9%|▊         | 7/81 [00:17<03:05,  2.50s/it][A

500304 571776



 10%|▉         | 8/81 [00:20<03:00,  2.47s/it][A

571776 643248



 11%|█         | 9/81 [00:22<02:58,  2.47s/it][A

643248 714720



 12%|█▏        | 10/81 [00:24<02:54,  2.46s/it][A

714720 786192



 14%|█▎        | 11/81 [00:27<02:51,  2.45s/it][A

786192 857664



 15%|█▍        | 12/81 [00:29<02:51,  2.48s/it][A

857664 929136



 16%|█▌        | 13/81 [00:32<02:47,  2.47s/it][A

929136 1000608



 17%|█▋        | 14/81 [00:34<02:43,  2.44s/it][A

1000608 1072080



 19%|█▊        | 15/81 [00:37<02:40,  2.43s/it][A

1072080 1143552



 20%|█▉        | 16/81 [00:39<02:38,  2.43s/it][A

1143552 1215024



 21%|██        | 17/81 [00:42<02:37,  2.46s/it][A

1215024 1286496



 22%|██▏       | 18/81 [00:44<02:33,  2.43s/it][A

1286496 1357968



 23%|██▎       | 19/81 [00:47<02:33,  2.48s/it][A

1357968 1429440



 25%|██▍       | 20/81 [00:49<02:29,  2.45s/it][A

1429440 1500912



 26%|██▌       | 21/81 [00:51<02:28,  2.47s/it][A

1500912 1572384



 27%|██▋       | 22/81 [00:54<02:27,  2.50s/it][A

1572384 1643856



 28%|██▊       | 23/81 [00:57<02:27,  2.55s/it][A

1643856 1715328



 30%|██▉       | 24/81 [00:59<02:25,  2.56s/it][A

1715328 1786800



 31%|███       | 25/81 [01:02<02:25,  2.60s/it][A

1786800 1858272



 32%|███▏      | 26/81 [01:04<02:20,  2.56s/it][A

1858272 1929744



 33%|███▎      | 27/81 [01:07<02:17,  2.54s/it][A

1929744 2001216



 35%|███▍      | 28/81 [01:09<02:13,  2.52s/it][A

2001216 2072688



 36%|███▌      | 29/81 [01:12<02:10,  2.52s/it][A

2072688 2144160



 37%|███▋      | 30/81 [01:14<02:07,  2.50s/it][A

2144160 2215632



 38%|███▊      | 31/81 [01:17<02:06,  2.53s/it][A

2215632 2287104



 40%|███▉      | 32/81 [01:19<02:01,  2.48s/it][A

2287104 2358576



 41%|████      | 33/81 [01:22<02:02,  2.55s/it][A

2358576 2430048



 42%|████▏     | 34/81 [01:25<01:58,  2.51s/it][A

2430048 2501520



 43%|████▎     | 35/81 [01:27<01:54,  2.50s/it][A

2501520 2572992



 44%|████▍     | 36/81 [01:29<01:51,  2.47s/it][A

2572992 2644464



 46%|████▌     | 37/81 [01:32<01:48,  2.47s/it][A

2644464 2715936



 47%|████▋     | 38/81 [01:34<01:46,  2.48s/it][A

2715936 2787408



 48%|████▊     | 39/81 [01:37<01:44,  2.49s/it][A

2787408 2858880



 49%|████▉     | 40/81 [01:39<01:41,  2.47s/it][A

2858880 2930352



 51%|█████     | 41/81 [01:42<01:38,  2.46s/it][A

2930352 3001824



 52%|█████▏    | 42/81 [01:44<01:36,  2.46s/it][A

3001824 3073296



 53%|█████▎    | 43/81 [01:47<01:32,  2.43s/it][A

3073296 3144768



 54%|█████▍    | 44/81 [01:49<01:31,  2.47s/it][A

3144768 3216240



 56%|█████▌    | 45/81 [01:52<01:30,  2.50s/it][A

3216240 3287712



 57%|█████▋    | 46/81 [01:55<01:31,  2.61s/it][A

3287712 3359184



 58%|█████▊    | 47/81 [01:57<01:27,  2.59s/it][A

3359184 3430656



 59%|█████▉    | 48/81 [02:00<01:25,  2.59s/it][A

3430656 3502128



 60%|██████    | 49/81 [02:02<01:21,  2.54s/it][A

3502128 3573600



 62%|██████▏   | 50/81 [02:05<01:19,  2.56s/it][A

3573600 3645072



 63%|██████▎   | 51/81 [02:07<01:15,  2.52s/it][A

3645072 3716544



 64%|██████▍   | 52/81 [02:10<01:12,  2.51s/it][A

3716544 3788016



 65%|██████▌   | 53/81 [02:12<01:10,  2.53s/it][A

3788016 3859488



 67%|██████▋   | 54/81 [02:15<01:09,  2.58s/it][A

3859488 3930960



 68%|██████▊   | 55/81 [02:18<01:07,  2.60s/it][A

3930960 4002432



 69%|██████▉   | 56/81 [02:20<01:04,  2.57s/it][A

4002432 4073904



 70%|███████   | 57/81 [02:22<01:00,  2.52s/it][A

4073904 4145376



 72%|███████▏  | 58/81 [02:25<00:57,  2.49s/it][A

4145376 4216848



 73%|███████▎  | 59/81 [02:27<00:54,  2.47s/it][A

4216848 4288320



 74%|███████▍  | 60/81 [02:30<00:52,  2.49s/it][A

4288320 4359792



 75%|███████▌  | 61/81 [02:32<00:49,  2.46s/it][A

4359792 4431264



 77%|███████▋  | 62/81 [02:35<00:46,  2.47s/it][A

4431264 4502736



 78%|███████▊  | 63/81 [02:37<00:43,  2.43s/it][A

4502736 4574208



 79%|███████▉  | 64/81 [02:40<00:41,  2.44s/it][A

4574208 4645680



 80%|████████  | 65/81 [02:42<00:38,  2.43s/it][A

4645680 4717152



 81%|████████▏ | 66/81 [02:44<00:36,  2.44s/it][A

4717152 4788624



 83%|████████▎ | 67/81 [02:47<00:34,  2.47s/it][A

4788624 4860096



 84%|████████▍ | 68/81 [02:50<00:33,  2.54s/it][A

4860096 4931568



 85%|████████▌ | 69/81 [02:52<00:30,  2.53s/it][A

4931568 5003040



 86%|████████▋ | 70/81 [02:55<00:27,  2.54s/it][A

5003040 5074512



 88%|████████▊ | 71/81 [02:57<00:25,  2.54s/it][A

5074512 5145984



 89%|████████▉ | 72/81 [03:00<00:22,  2.53s/it][A

5145984 5217456



 90%|█████████ | 73/81 [03:02<00:20,  2.53s/it][A

5217456 5288928



 91%|█████████▏| 74/81 [03:05<00:17,  2.54s/it][A

5288928 5360400



 93%|█████████▎| 75/81 [03:07<00:15,  2.52s/it][A

5360400 5431872



 94%|█████████▍| 76/81 [03:10<00:12,  2.54s/it][A

5431872 5503344



 95%|█████████▌| 77/81 [03:12<00:09,  2.49s/it][A

5503344 5574816



 96%|█████████▋| 78/81 [03:15<00:07,  2.49s/it][A

5574816 5646288



 98%|█████████▊| 79/81 [03:18<00:05,  2.58s/it][A

5646288 5717760



 99%|█████████▉| 80/81 [03:20<00:02,  2.67s/it][A

5717760 5789232



100%|██████████| 81/81 [03:22<00:00,  2.50s/it][A
