In [None]:
from datasets import load_dataset
from multiprocessing import Pool
import numpy as np
import os
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [133]:
dataset = load_dataset("Stegvean/CakeGenomes")["reduced"]
dataset

Dataset({
    features: ['text'],
    num_rows: 20222
})

In [None]:
# split dataset
split_dataset = dataset.train_test_split(test_size=0.03, shuffle=True)
split_dataset["val"] = split_dataset.pop('test') 
print([split_dataset["val"][i] for i in range(3)])
print("training size:", len(split_dataset["train"]))
print("validation size:", len(split_dataset["val"]))

[{'text': 'trxB arsB lanKC fahA pip nuoK murF ssuE hxlB uppS pfkB ptsP rpmF lnt zwf ltrA dpgA dpgB dpgC mltG xylA xylB asnB ltrA hpnH shc hpnE hpnD hpnC idi rsgA secD ligD rbsD eboE fdhD narH narJ narI ccrA pssA hydA ggt allB aceB ctaD pucL uraH uraD gcl pucD rsgA lpdA mmuM cysC cysD glgX treY treZ msrB murC zapE sigJ hemQ hemG hemE pgsB dxs ngcE acnA mnhG argF ctaD rpmG metE ureG ureC ureA dut sepH nrdR lexA hflX dapF miaA miaB recX recA pgsA rimO dapA thyX dapB rpsO truB rbfA infB nusA rimP ispG dxr gabT rlmN frr pyrH tsf rpsB whiG dprA lepB lepB lepB lepB rplS trmD rimM rpsP ftsH ffh ftsY mutM rnc rpmF coaD rsmD recG rpmB thiD cofC leuD leuC ndgR gltX murJ cimA ureA pruA serA ilvC ilvN gatB gatA gatC ligA mnmA gcvT gcvH glyA kdpF kdpA kdpB glgX glgP treS glgB pta pyk ureG ureC meaB mce scy nucS atpD atpA atpE atpB prmC prfA rpmE rho thrB thrC lysA nrtL ligD lon argG fxsT fxsA sodX sodN nagB def rsrA aroA rsgA hisN nudC moeZ sigE folP dapE dapC fdxA mshB typA ehuB ehuC ehuD ehuA sugE

In [135]:
# seq lengths
get_len = lambda x: len(x.split(" "))
lens = list(map(get_len, dataset["text"]))
lens.sort()
min_len = min(lens)
global max_len
max_len = max(lens)
avg_len = np.mean(lens)
med_len = np.median(lens)
print("min:", min_len)
print("max:", max_len)
print("avg:", avg_len)
print("median:", med_len)
n = 20
print(f"greatest {n}:", lens[-n:])

min: 6
max: 5572
avg: 719.5609237464148
median: 679.0
greatest 20: [2544, 2547, 2560, 2603, 2607, 2712, 2761, 2771, 2832, 2867, 2953, 3052, 3098, 3846, 3873, 4171, 4259, 4309, 4548, 5572]


### tokenizer

In [136]:
# start, end, and padding tokens
reduced_vocabulary = ["[PAD]", "[EOS]", "[SOS]"]

# import vocab info
import json
vocab_info = json.load(open("tokenizer_data_train.json"))
reduced_vocabulary.extend(vocab_info["reduced_vocab"])

n = len(reduced_vocabulary)
reduced_vocabulary

['[PAD]',
 '[EOS]',
 '[SOS]',
 'ytbD',
 'PA3695',
 'glxA',
 'ypdA',
 'SF0708',
 'lmo0792',
 'pqqC',
 'styE',
 'pksA',
 'PA3759',
 'dpaL',
 'psb28',
 'galC',
 'peuA',
 'Rv2640c',
 'cyoB',
 'arsS',
 'mcsB',
 'PA5205',
 'prxU',
 'lmo0963',
 'hslV',
 'kapB',
 'Rv0797',
 'PA3808',
 'pdaB',
 'lmo2840',
 'eptA',
 'Rv2974c',
 'Rv0775',
 'PA0827',
 'zitB',
 'fadD4',
 'plnJ',
 'braC',
 'avs1a',
 'topAI',
 'trpD',
 'SF1302',
 'vioD',
 'Rv1707',
 'PA1442',
 'rosA',
 'lmo2160',
 'ygiH',
 'yozW',
 'lmo0625',
 'padH',
 'lmo1257',
 'lmo1860',
 'senS',
 'scyA',
 'yvbG',
 'Rv3909',
 'lmo2287',
 'oppF',
 'agaI',
 'lmo0517',
 'chuS',
 'hfq',
 'ydgQ',
 'lmo1239',
 'ybjP',
 'rhcF',
 'PA1262',
 'comEB',
 'SF1593',
 'PA3315',
 'secE',
 'mxiC',
 'flhP',
 'PA3023',
 'yifZ',
 'aldY',
 'aroD',
 'yiiR',
 'STM3794',
 'mexY',
 'lmo1526',
 'albC',
 'cagU',
 'ygfK',
 'brz',
 'PA0421',
 'ypbF',
 'ovoA',
 'sugB',
 'leo',
 'lmo1922',
 'apbE',
 'PA4612',
 'lmo0622',
 'PA2662',
 'rpoC',
 'PA1204',
 'Rv1582c',
 'yjcE',
 'pg

In [137]:
id_to_int = { id:i for i,id in enumerate(reduced_vocabulary)}
int_to_id = { i:id for i,id in id_to_int.items()}
encode = lambda id_list: [id_to_int[id] for id in id_list]
decode = lambda int_list: [int_to_id[i] for i in int_list]

In [142]:
# padding and special tokens
def process(example):
    id_str = example#["text"]
    char_arr = ["[SOS]"]
    char_arr.extend(id_str.split(" "))
    char_arr.append("[EOS]")
    char_arr.extend(["[PAD]"] * (max_len - len(char_arr)))
    tokenized = encode(char_arr)
    return tokenized

In [155]:
def process(example):
    char_arr = ["[SOS]"] + example["text"].split(" ") + ["[EOS]"]
    char_arr += ["[PAD]"] * (max_len - len(char_arr))
    tokenized = encode(char_arr)
    return {'ids': tokenized, 'len': len(tokenized)}

if __name__ == "__main__":
    tokenized_dataset = dataset.map(
        process, 
        remove_columns=['text'], 
        num_proc=os.cpu_count(),  # Use all available CPU cores
        desc="Tokenizing dataset"
    )
    tokenized_dataset

Tokenizing dataset (num_proc=20):   0%|          | 0/20222 [00:04<?, ? examples/s]


NameError: name 'max_len' is not defined

In [143]:
# tokenized = dataset_fake.map(
#     process,
#     remove_columns=['text'],
#     desc="tokenizing the splits",
#     num_proc=os.cpu_count(),
# )
tokenized = list(map(process, dataset["text"]))

print(dataset["text"][0])
tokenized[0]

pgk pgk pgk pgk glnR glnA psrA glnA glnR rpoE trkA lacG pflB dinB ilvA ilvC ilvN rpmB gatC gatA gatB efp nusB briC accD accC fabZ accB fabF fabG fabD fabK serS zapA rnhC lepB tig cbpG liaF fni mvaD mvk cbpC gndA mapZ gpsB recU pbp1a rfbD rfbB rfbA gla cpsD cpsC cps4B mraY pbp2X ftsL rsmH yajC tnpA celB rpsI rplM folK folE folP adhP pepC fusA rpsG rpsL spuA glmS bglA rseP ruvB leuS rplQ rpsK rpsM rpmJ infA secY rplO rpmD rpsE rplR rplF rpsH rpsN rplE rplX rplN rpsQ rpmC rplP rpsC rplV rpsS rplB rplD rplC rpsJ nrdG nrdD cls ruvX spx uvrA ruvA ribD ribE ribE mutL nrdI tsaD rimI tsaB cibA cibB rnjA mnmG mnmA pspA sdaAB sdaAA rpsD pavB strH purB purK purE purD purH purN purM purF purC comB comA blpU plsX recO polA radA tadA comW ftsH hpt tilS mfd pth ychF dnaN dnaA rlmH comC comD comE trpS guaB recF yaaA yfmF yfmH rodZ pgsA mreC mreD pcsB rpsB tsf tnpB cysK hpf rplI dnaB cbpD rr06 hk06 pspC dusB hslO glpK glpO ltrA dltA dltB dltC dltD adcR adcA arcC argF arcA rpmG rpmF ilvD rgg hisS aspS ma

[2,
 18237,
 18237,
 18237,
 18237,
 15321,
 16671,
 2785,
 16671,
 15321,
 2063,
 24031,
 7533,
 17001,
 12417,
 17526,
 9189,
 18494,
 27025,
 9847,
 18456,
 13002,
 8432,
 24462,
 6047,
 18992,
 13386,
 1783,
 664,
 364,
 20835,
 10848,
 9308,
 26375,
 26353,
 16085,
 27664,
 13055,
 10826,
 12057,
 17584,
 27142,
 17892,
 15136,
 20985,
 12546,
 17582,
 10964,
 947,
 22283,
 1675,
 27811,
 9921,
 15232,
 14198,
 26673,
 24750,
 1716,
 16792,
 168,
 22534,
 24045,
 19574,
 23989,
 24521,
 18288,
 17857,
 6749,
 5121,
 4707,
 3198,
 20041,
 12751,
 14060,
 10230,
 16767,
 23982,
 3111,
 19186,
 10965,
 18021,
 9097,
 4577,
 11861,
 11595,
 11659,
 15627,
 9813,
 8731,
 24843,
 15633,
 8408,
 3664,
 4314,
 1289,
 1206,
 5981,
 27416,
 10363,
 6653,
 2497,
 8589,
 17172,
 8102,
 3896,
 3238,
 18711,
 13607,
 9762,
 2449,
 3006,
 16236,
 24438,
 18253,
 18253,
 3059,
 9885,
 7670,
 12249,
 26605,
 26440,
 3808,
 11436,
 28049,
 19462,
 19114,
 11239,
 11306,
 7444,
 919,
 3336,
 23722,


In [None]:
# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in tokenized.items():
    arr_len = np.sum(dset['len'], dtype=np.uint64)
    filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    total_batches = 1024

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write into mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()
