In [7]:
# Enable the autoreload extension in mode 2 for this kernel session.
%load_ext autoreload
%autoreload 2

import os
# Set an empty CUDA device list.
# NOTE(review): presumably meant to hide all GPUs from CUDA-based libraries during
# this preprocessing; it has to be set before any such library is imported — confirm.
os.environ['CUDA_VISIBLE_DEVICES']=''

import sys;
# Make '..' (resolved against the current working directory) part of the module search path.
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import json

# Load the first raw Wikipedia shard for inspection. The context manager closes the
# file handle as soon as parsing is done instead of leaving it to the garbage collector.
# NOTE(review): later cells iterate over this object and treat each item as one
# article string — confirm against the data files.
with open('/home/kuratov/data/ThePile/Wikipedia/wikipedia-en-0.json', 'r', encoding='utf8') as fin:
    wiki = json.load(fin)

In [9]:
# Sketch of one preprocessed article record: a title plus its body text.
dict(title='', text='')

{'title': '', 'text': ''}

In [10]:
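# Articles to skip entirely:
#   - list pages: the title starts with "List of ..."
#   - disambiguation pages: the text contains "{title} may refer to:"
# Wiki boilerplate to clean out of the article text:
#   - lines starting with "Category:" (i.e. "\nCategory:...")
#   - the "\n\nSee also\n\n" section
#   - the "\n\nReferences\n\n" section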

In [11]:
import re

# A whole line of the form "Category:<name>"; applied to one line at a time.
category_regexp = re.compile(r'\ACategory:[\S ]+\Z')
# Headings of the trailing sections, each on its own line. Wikipedia headings are written in
# sentence case ("External links"), so both capitalisations of "links" are accepted.
keywords_regexp = re.compile(r'\n(References|See also|External [Ll]inks)\s?\n')
# NOTE(review): not used by any visible cell. Inside a raw string `\\n` matches a literal
# backslash followed by "n", not a newline — confirm the intent before relying on this pattern.
title_regexp = re.compile(r'\A[\S ]+?\\n\\n')
# Titles that start with "List of".
list_of_regexp = re.compile(r'\AList of')

def clean_article(text):
    """Split a raw article into its title and a cleaned body.

    The first line of ``text`` is taken as the title. From the remaining lines,
    every line that is entirely a ``Category:...`` entry is dropped. The body is
    then cut at the first "References", "See also" or "External links" heading
    (the heading and everything after it are removed) and stripped of
    surrounding whitespace.

    Args:
        text: raw article string with lines separated by ``'\\n'``.

    Returns:
        A ``(title, body)`` tuple of strings; ``body`` is ``''`` when nothing
        is left after cleaning.
    """
    # str.split always yields at least one element, so the title is always defined.
    title, *body_lines = text.split('\n')
    kept = [line for line in body_lines if not category_regexp.match(line)]
    body = '\n'.join(kept)
    cut = keywords_regexp.search(body)
    if cut:
        # Keep only what precedes the first trailing-section heading.
        body = body[:cut.start()]
    return title, body.strip()

In [12]:
# preprocess wiki
# Reads the ten raw shards, cleans every article with clean_article() and collects
# the kept ones as {'title', 'text'} records in `articles`.
from tqdm import tqdm as tqdm
import json

path = '/home/kuratov/data/ThePile/Wikipedia/wikipedia-en-{}.json'

total_articles = 0     # every article seen across all shards
filtered_articles = 0  # articles skipped as list or disambiguation pages

articles = []

for i in tqdm(range(0, 10)):
    # Close each shard's file handle right after parsing rather than leaking it.
    with open(path.format(i), 'r', encoding='utf8') as fin:
        wiki = json.load(fin)
    for text in wiki:
        total_articles += 1
        title, text = clean_article(text)
        # Skip "List of ..." pages and disambiguation pages ("<title> may refer to:").
        if list_of_regexp.search(title) or f'{title} may refer to:' in text:
            filtered_articles += 1
            continue
        articles.append({'title': title, 'text': text})
print(f'{filtered_articles}/{total_articles}')

100%|██████████| 10/10 [04:54<00:00, 29.43s/it]

279906/6033151





In [14]:
# Report skipped / kept / total article counts.
n_kept = len(articles)
print(f'{filtered_articles}/{n_kept}/{total_articles}')

279906/5753245/6033151


In [15]:
# save shards
# Shuffles `articles` in place and writes them as JSON Lines files
# shard_00.jsonl, shard_01.jsonl, ... under `shards_path`.

from pathlib import Path

shards_path = '/home/kuratov/data/ThePile/Wikipedia/preprocessed_shards'

# exist_ok=True makes a re-run of this cell a no-op for an existing directory.
Path(shards_path).mkdir(parents=True, exist_ok=True)

import random, math
# A dedicated, seeded generator gives the same shard contents on every fresh
# Restart & Run All, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(articles)
n_shards = 80  # (we possibly have to take n_gpus shards for validation, so we need more shards)

# max(1, ...) keeps the range() step non-zero when `articles` is empty.
shard_size = max(1, math.ceil(len(articles) / n_shards))

for i, start in enumerate(tqdm(range(0, len(articles), shard_size))):
    with (Path(shards_path) / f'shard_{i:02d}.jsonl').open('w', encoding='utf8') as fout:
        print(start, start + shard_size)
        # Slicing clamps at the end of the list, so the last shard may be shorter.
        for j, article in enumerate(articles[start:start + shard_size], start):
            jsoned = json.dumps(article)
            try:
                # Round-trip check: only write lines that parse back as JSON.
                json.loads(jsoned)
            except json.JSONDecodeError:
                print(f'skipping article {j} in shard {i} cause of JSONDecodeError')
                continue
            fout.write(jsoned+'\n')

  0%|          | 0/80 [00:00<?, ?it/s]

0 71916


  1%|▏         | 1/80 [00:02<03:16,  2.48s/it]

71916 143832


  2%|▎         | 2/80 [00:04<03:13,  2.48s/it]

143832 215748


  4%|▍         | 3/80 [00:07<03:15,  2.54s/it]

215748 287664


  5%|▌         | 4/80 [00:10<03:11,  2.52s/it]

287664 359580


  6%|▋         | 5/80 [00:12<03:10,  2.54s/it]

359580 431496


  8%|▊         | 6/80 [00:15<03:06,  2.52s/it]

431496 503412


  9%|▉         | 7/80 [00:17<03:00,  2.47s/it]

503412 575328


 10%|█         | 8/80 [00:20<03:09,  2.63s/it]

575328 647244


 11%|█▏        | 9/80 [00:25<03:48,  3.22s/it]

647244 719160


 12%|█▎        | 10/80 [00:27<03:30,  3.01s/it]

719160 791076


 14%|█▍        | 11/80 [00:31<03:46,  3.28s/it]

791076 862992


 15%|█▌        | 12/80 [00:38<05:00,  4.41s/it]

862992 934908


 16%|█▋        | 13/80 [00:45<05:47,  5.19s/it]

934908 1006824


 18%|█▊        | 14/80 [00:49<05:22,  4.89s/it]

1006824 1078740


 19%|█▉        | 15/80 [01:06<09:10,  8.47s/it]

1078740 1150656


 20%|██        | 16/80 [01:20<10:51, 10.18s/it]

1150656 1222572


 21%|██▏       | 17/80 [01:24<08:31,  8.12s/it]

1222572 1294488


 22%|██▎       | 18/80 [01:26<06:36,  6.40s/it]

1294488 1366404


 24%|██▍       | 19/80 [01:28<05:18,  5.22s/it]

1366404 1438320


 25%|██▌       | 20/80 [01:31<04:23,  4.40s/it]

1438320 1510236


 26%|██▋       | 21/80 [01:33<03:43,  3.79s/it]

1510236 1582152


 28%|██▊       | 22/80 [01:36<03:17,  3.40s/it]

1582152 1654068


 29%|██▉       | 23/80 [01:38<02:57,  3.11s/it]

1654068 1725984


 30%|███       | 24/80 [01:41<02:43,  2.92s/it]

1725984 1797900


 31%|███▏      | 25/80 [01:43<02:32,  2.77s/it]

1797900 1869816


 32%|███▎      | 26/80 [01:46<02:29,  2.77s/it]

1869816 1941732


 34%|███▍      | 27/80 [01:48<02:19,  2.62s/it]

1941732 2013648


 35%|███▌      | 28/80 [01:51<02:13,  2.57s/it]

2013648 2085564


 36%|███▋      | 29/80 [01:55<02:34,  3.03s/it]

2085564 2157480


 38%|███▊      | 30/80 [01:57<02:23,  2.87s/it]

2157480 2229396


 39%|███▉      | 31/80 [02:11<05:06,  6.25s/it]

2229396 2301312


 40%|████      | 32/80 [02:22<06:02,  7.55s/it]

2301312 2373228


 41%|████▏     | 33/80 [02:33<06:47,  8.67s/it]

2373228 2445144


 42%|████▎     | 34/80 [02:46<07:35,  9.90s/it]

2445144 2517060


 44%|████▍     | 35/80 [02:53<06:52,  9.16s/it]

2517060 2588976


 45%|████▌     | 36/80 [02:56<05:14,  7.15s/it]

2588976 2660892


 46%|████▋     | 37/80 [02:58<04:06,  5.73s/it]

2660892 2732808


 48%|████▊     | 38/80 [03:01<03:17,  4.71s/it]

2732808 2804724


 49%|████▉     | 39/80 [03:03<02:44,  4.00s/it]

2804724 2876640


 50%|█████     | 40/80 [03:05<02:20,  3.50s/it]

2876640 2948556


 51%|█████▏    | 41/80 [03:08<02:03,  3.16s/it]

2948556 3020472


 52%|█████▎    | 42/80 [03:10<01:53,  3.00s/it]

3020472 3092388


 54%|█████▍    | 43/80 [03:13<01:44,  2.81s/it]

3092388 3164304


 55%|█████▌    | 44/80 [03:15<01:37,  2.70s/it]

3164304 3236220


 56%|█████▋    | 45/80 [03:18<01:32,  2.63s/it]

3236220 3308136


 57%|█████▊    | 46/80 [03:20<01:27,  2.57s/it]

3308136 3380052


 59%|█████▉    | 47/80 [03:22<01:23,  2.52s/it]

3380052 3451968


 60%|██████    | 48/80 [03:25<01:19,  2.47s/it]

3451968 3523884


 61%|██████▏   | 49/80 [03:27<01:16,  2.45s/it]

3523884 3595800


 62%|██████▎   | 50/80 [03:30<01:15,  2.53s/it]

3595800 3667716


 64%|██████▍   | 51/80 [03:32<01:12,  2.51s/it]

3667716 3739632


 65%|██████▌   | 52/80 [03:35<01:09,  2.50s/it]

3739632 3811548


 66%|██████▋   | 53/80 [03:37<01:07,  2.49s/it]

3811548 3883464


 68%|██████▊   | 54/80 [03:40<01:05,  2.53s/it]

3883464 3955380


 69%|██████▉   | 55/80 [03:43<01:04,  2.60s/it]

3955380 4027296


 70%|███████   | 56/80 [03:45<01:00,  2.53s/it]

4027296 4099212


 71%|███████▏  | 57/80 [03:47<00:56,  2.47s/it]

4099212 4171128


 72%|███████▎  | 58/80 [03:50<00:55,  2.51s/it]

4171128 4243044


 74%|███████▍  | 59/80 [03:52<00:51,  2.46s/it]

4243044 4314960


 75%|███████▌  | 60/80 [03:55<00:49,  2.46s/it]

4314960 4386876


 76%|███████▋  | 61/80 [03:57<00:46,  2.44s/it]

4386876 4458792


 78%|███████▊  | 62/80 [04:03<01:04,  3.59s/it]

4458792 4530708


 79%|███████▉  | 63/80 [04:16<01:44,  6.15s/it]

4530708 4602624


 80%|████████  | 64/80 [04:20<01:31,  5.72s/it]

4602624 4674540


 81%|████████▏ | 65/80 [04:23<01:12,  4.86s/it]

4674540 4746456


 82%|████████▎ | 66/80 [04:26<01:00,  4.33s/it]

4746456 4818372


 84%|████████▍ | 67/80 [04:29<00:50,  3.89s/it]

4818372 4890288


 85%|████████▌ | 68/80 [04:33<00:44,  3.74s/it]

4890288 4962204


 86%|████████▋ | 69/80 [04:41<00:58,  5.28s/it]

4962204 5034120


 88%|████████▊ | 70/80 [04:58<01:27,  8.74s/it]

5034120 5106036


 89%|████████▉ | 71/80 [05:03<01:08,  7.58s/it]

5106036 5177952


 90%|█████████ | 72/80 [05:06<00:49,  6.15s/it]

5177952 5249868


 91%|█████████▏| 73/80 [05:09<00:35,  5.12s/it]

5249868 5321784


 92%|█████████▎| 74/80 [05:11<00:26,  4.38s/it]

5321784 5393700


 94%|█████████▍| 75/80 [05:14<00:19,  3.92s/it]

5393700 5465616


 95%|█████████▌| 76/80 [05:17<00:14,  3.53s/it]

5465616 5537532


 96%|█████████▋| 77/80 [05:19<00:09,  3.23s/it]

5537532 5609448


 98%|█████████▊| 78/80 [05:22<00:06,  3.03s/it]

5609448 5681364


 99%|█████████▉| 79/80 [05:24<00:02,  2.89s/it]

5681364 5753280


100%|██████████| 80/80 [05:27<00:00,  4.09s/it]
