In [1]:
import os
from pathlib import Path
from PIL import Image, UnidentifiedImageError
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import shutil
import random
import string
import gzip
import json
def path_replace(path, old, new):
    """Return a new ``Path`` built from ``path`` with ``old`` swapped for ``new``.

    All three arguments are converted with ``str()`` and the substitution is a
    plain substring replacement on the textual path, so every occurrence of
    ``old`` is replaced, regardless of path-component boundaries.
    """
    source_text = str(path)
    needle, substitute = str(old), str(new)
    return Path(source_text.replace(needle, substitute))
def crop_and_resize(img, target_size, mode="center"):
    """Crop ``img`` to a square and resize it to ``target_size`` x ``target_size``.

    The side of the square is the shorter side of the image; ``mode`` selects
    where the square sits along the longer side:

      center: center crop
      left: keep left or top most pixels
      right: keep right or bottom most pixels
      random: random crop (offsets drawn with ``random.randint``)

    Args:
        img: image object exposing ``size``, ``crop`` and ``resize`` (a PIL image).
        target_size: side length, in pixels, of the returned square image.
        mode: one of "center", "left", "right", "random".

    Returns:
        ``img`` itself, untouched, when it already is ``target_size`` x
        ``target_size``; otherwise the cropped image resized with Lanczos
        resampling.

    Raises:
        AssertionError: if ``mode`` is not one of the supported values.
    """
    width, height = img.size
    if width == target_size and height == target_size:
        return img

    crop_size = min(width, height)
    # Spare pixels along each axis; at least one of the two is always 0.
    slack_x = width - crop_size
    slack_y = height - crop_size

    # Offsets are kept as integers: float bounds make random.randint raise
    # TypeError on Python >= 3.12, and half-pixel boxes such as (0.5, 5.5) can be
    # rounded to a box one pixel larger than crop_size, giving a non-square crop.
    if mode == "center":
        left, top = slack_x // 2, slack_y // 2
    elif mode == "left":
        left, top = 0, 0
    elif mode == "right":
        left, top = slack_x, slack_y
    elif mode == "random":
        left = random.randint(0, slack_x)
        top = random.randint(0, slack_y)
    else:
        raise AssertionError("Wrong mode")

    img = img.crop((left, top, left + crop_size, top + crop_size))
    img = img.resize((target_size, target_size),resample=Image.Resampling.LANCZOS)
    return img
def random_string(n):
    """Return a random string of ``n`` ASCII letters and digits.

    Characters are drawn independently (with replacement) from
    ``string.ascii_letters + string.digits`` using the ``random`` module, so a
    character may repeat and ``n`` may exceed the 62-character alphabet.

    Raises:
        ValueError: if ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=n))

# C4 text

7126345 train text files, 45576 val text files (one C4 document per file, with newlines replaced by spaces)

In [3]:
# train split: write every document of C4 train shards [START, END) to its own
# numbered .txt file, with newlines inside the document replaced by spaces.
FOLDER_IN = Path("/root/autodl-tmp/c4/en")
FOLDER_OUT = Path("/root/EvoEnc/data/stage2/text-c4/train")
os.makedirs(FOLDER_OUT,exist_ok=True)
START = 100
END =  120
files = sorted(list(FOLDER_IN.glob("c4-train*.json.gz")))
cnt = 0
for i in tqdm(range(START,END)):
    with gzip.open(files[i], "rb") as shard:
        # Iterate the shard lazily: one JSON record per line, never the whole
        # decompressed file in memory at once.
        for line in shard:
            text = json.loads(line)["text"].replace("\n"," ").strip()
            path = FOLDER_OUT / ("%08d.txt"%(cnt))
            # Separate handle name so the shard handle is not shadowed; explicit
            # UTF-8 so the output does not depend on the platform default encoding.
            with open(path, "w", encoding="utf-8") as out_file:
                out_file.write(text)
            cnt += 1
print(cnt)

  0%|          | 0/20 [00:00<?, ?it/s]

7126345


In [4]:
# validation split: write every document of C4 validation shards [START, END)
# to its own numbered .txt file, with newlines inside the document replaced by spaces.
FOLDER_IN = Path("/root/autodl-tmp/c4/en")
FOLDER_OUT = Path("/root/EvoEnc/data/stage2/text-c4/val")
os.makedirs(FOLDER_OUT,exist_ok=True)
START = 2
END =  3
files = sorted(list(FOLDER_IN.glob("c4-val*.json.gz")))
cnt = 0
for i in tqdm(range(START,END)):
    with gzip.open(files[i], "rb") as shard:
        # Iterate the shard lazily: one JSON record per line, never the whole
        # decompressed file in memory at once.
        for line in shard:
            text = json.loads(line)["text"].replace("\n"," ").strip()
            path = FOLDER_OUT / ("%08d.txt"%(cnt))
            # Separate handle name so the shard handle is not shadowed; explicit
            # UTF-8 so the output does not depend on the platform default encoding.
            with open(path, "w", encoding="utf-8") as out_file:
                out_file.write(text)
            cnt += 1
print(cnt)

  0%|          | 0/1 [00:00<?, ?it/s]

45576


In [9]:
# check: list the generated validation text files in sorted order, then show
# the first path and the content of one sample file.
FOLDER_OUT = Path("/root/EvoEnc/data/stage2/text-c4/val")
files = list(FOLDER_OUT.glob("*.txt"))
files.sort()
print(files[0])
# Note: the content shown comes from index 4, not from the path printed above.
print(files[4].read_text())

/root/EvoEnc/data/stage2/text-c4/val/00000000.txt
This was the very first recipe I ever made by myself. I was about 11 years old. Yes I have been cooking pretty much my whole life, remember this picture? You can read about it here. Anyways, I was an interesting child. Normal children ask for toys for Christmas. Not me. I asked for a Mixer. Yeah, like a kitchen aid. And no, I didn't actually get a Kitchen Aid, I got a cheap knock-off. But I didn't care. I was thrilled about my new mixer, and ended up making these cookies to test it out! I grew up eating these cookies, especially during Christmas time. They are basically a shortbread cookie, and they are delicious! I actually made this dough a while back and froze some cookie dough to see if they would turn out. It turns out that it freezes great, and I just had to cook them a little bit longer and they were just as delicious! PS: I still don't own a Kitchen Aid mixer. These are my all time favorite cookie! They're a must on our Christma

# C4 sub

7126346 train sub files, 45576 val sub files (one C4 document per file, one sentence per line)

In [6]:
from nltk.tokenize import sent_tokenize
# train split: write every document of C4 train shards [START, END) to its own
# numbered .txt file, split into sentences with one sentence per line.
FOLDER_IN = Path("/root/autodl-tmp/c4/en")
FOLDER_OUT = Path("/root/EvoEnc/data/stage2/sub-c4/train")
os.makedirs(FOLDER_OUT,exist_ok=True)
START = 120
END =  140
files = sorted(list(FOLDER_IN.glob("c4-train*.json.gz")))
cnt = 0
for i in tqdm(range(START,END)):
    with gzip.open(files[i], "rb") as shard:
        # Iterate the shard lazily: one JSON record per line, never the whole
        # decompressed file in memory at once.
        for line in shard:
            # Newlines are flattened first so that the only line breaks in the
            # output are the sentence boundaries found by sent_tokenize.
            text = json.loads(line)["text"].replace("\n"," ").strip()
            path = FOLDER_OUT / ("%08d.txt"%(cnt))
            # Separate handle name so the shard handle is not shadowed; explicit
            # UTF-8 so the output does not depend on the platform default encoding.
            with open(path, "w", encoding="utf-8") as out_file:
                out_file.write("\n".join(sent_tokenize(text)))
            cnt += 1
print(cnt)

  0%|          | 0/20 [00:00<?, ?it/s]

7126346


In [7]:
# validation split: write every document of C4 validation shards [START, END)
# to its own numbered .txt file, split into sentences with one sentence per line.
FOLDER_IN = Path("/root/autodl-tmp/c4/en")
FOLDER_OUT = Path("/root/EvoEnc/data/stage2/sub-c4/val")
os.makedirs(FOLDER_OUT,exist_ok=True)
START = 3
END =  4
files = sorted(list(FOLDER_IN.glob("c4-val*.json.gz")))
cnt = 0
for i in tqdm(range(START,END)):
    with gzip.open(files[i], "rb") as shard:
        # Iterate the shard lazily: one JSON record per line, never the whole
        # decompressed file in memory at once.
        for line in shard:
            # Newlines are flattened first so that the only line breaks in the
            # output are the sentence boundaries found by sent_tokenize.
            text = json.loads(line)["text"].replace("\n"," ").strip()
            path = FOLDER_OUT / ("%08d.txt"%(cnt))
            # Separate handle name so the shard handle is not shadowed; explicit
            # UTF-8 so the output does not depend on the platform default encoding.
            with open(path, "w", encoding="utf-8") as out_file:
                out_file.write("\n".join(sent_tokenize(text)))
            cnt += 1
print(cnt)

  0%|          | 0/1 [00:00<?, ?it/s]

45576


In [8]:
# check: list the generated validation sentence files in sorted order, then
# show the first path and the content of one sample file.
FOLDER_OUT = Path("/root/EvoEnc/data/stage2/sub-c4/val")
files = list(FOLDER_OUT.glob("*.txt"))
files.sort()
print(files[0])
# Note: the content shown comes from index 4, not from the path printed above.
print(files[4].read_text())

/root/EvoEnc/data/stage2/sub-c4/val/00000000.txt
JAMES RIVER COAL COMPANY BANKRUPTCY RECOVERING INVESTOR LOSSES | Goodman & Nekvasil, P.A.
Goodman & Nekvasil, P.A., a Clearwater, Florida, law firm with a national practice representing victimized investors, continues to investigate brokerage firms that placed elderly retirees and other conservative investors in high-risk oil and gas related investments, including JAMES RIVER COAL COMPANY.
Goodman & Nekvasil, P.A., has filed hundreds of cases against brokerage firms selling high-risk investments, including JAMES RIVER COAL COMPANY.
Goodman & Nekvasil, P.A., has recovered hundreds of millions of dollars on behalf of investors victimized by these investments.
We allege in these cases that these investment recommendations were unsuitable for our clients in view of their financial situation, needs and investment objectives.
All our cases are handled on a purely contingency fee basis.
We would like to discuss the possibility of your retaining