In [None]:
#@title Setup & Imports

!pip install transformers torch datasets sentencepiece zstandard
!git clone https://github.com/EleutherAI/openwebtext2.git
!pip install -r openwebtext2/requirements.txt

import glob
import os
import math
import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import zstandard
import openwebtext2
import gc
import sentencepiece as spm

from os import listdir
from os.path import isfile, join
from datasets import load_dataset
from openwebtext2.utils import archiver
from transformers import GPT2Tokenizer, BertTokenizer


# Report the attached GPU, driver and CUDA version.
!nvidia-smi # Run this to see what GPU you have

# Show the /dev/shm mount, remount it with a 32G size limit, then show it
# again so the before/after values appear in the cell output.
!df | grep shm
!sudo mount -o remount,size=32G /dev/shm
!df | grep shm

# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", DEVICE)

fatal: destination path 'openwebtext2' already exists and is not an empty directory.
Fri Apr  5 05:04:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+-------

In [None]:
%%script echo skipping

#@title Prepare dataset

# NOTE: the `%%script echo skipping` cell magic on the first line hands this
# cell's body to `echo skipping` instead of the Python kernel, so nothing
# below is executed (the recorded output is just "skipping"). Remove that
# first line to rebuild the dataset from scratch.

# Citation for the dataset's source:
# @article{pile,
#     title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
#     author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
#     journal={arXiv preprint arXiv:2101.00027},
#     year={2020}
# }

# Download the archive, create the raw-archive and converted-text
# directories, and unpack the tarball into the raw-archive directory.
!wget https://huggingface.co/datasets/ccss4/openwebtext2/resolve/main/openwebtext2.jsonl.zst.tar
!mkdir -p /content/dataset/openwebtext
!mkdir -p /content/dataset/openwebtextraw
!tar -xf /content/openwebtext2.jsonl.zst.tar -C /content/dataset/openwebtextraw/

# Convert jsonl.zst into text files
document_count = 0
total_text_size = 0

dataset_directory = "/content/dataset/openwebtextraw"
text_file_directory = "/content/dataset/openwebtext"

files = glob.glob(os.path.join(dataset_directory, "*jsonl.zst"))
for file_path in tqdm.tqdm(files, dynamic_ncols=True):
    reader = archiver.Reader()
    text_file_name = file_path.replace(".jsonl.zst", ".txt")
    text_file_name = text_file_name.replace(dataset_directory, text_file_directory)
    text_file = open(text_file_name, "a")
    for document, metadata in reader.read_jsonl(file_path, get_meta=True):
        document_count += 1
        total_text_size += len(document)

        text_file.write(document)
    text_file.close()


print(document[:1000])
billion = math.pow(10, 9)
print(f"Total Document Count: {document_count:,}")
print(f"Total Uncompressed Text Size: {(total_text_size / billion):.2f} GB")

!rm -R /content/dataset/openwebtextraw/

skipping


In [None]:
#@title Global Variables

# Directory of the raw archives and directory of the converted text files.
dataset_directory = "/content/dataset/openwebtextraw"
text_file_directory = "/content/dataset/openwebtext"

# Names (not full paths) of the regular files directly inside the text
# directory, in ascending order; anything that is not a file is skipped.
text_file_names = sorted(
    entry
    for entry in listdir(text_file_directory)
    if isfile(join(text_file_directory, entry))
)

In [None]:
#@title Hyper Parameters
wandb_name = 'test1'

# Run settings collected in a single dict.
# NOTE(review): "tokenizer" is "gpt2", but the Tokenizer cell instantiates
# BertTokenizer("bert-base-uncased") -- confirm which one is intended.
config = dict(
    lr=0.1,
    epochs=120,
    batch_size=16,
    block_size=128,
    drop_out=0.2,
    tokenizer="gpt2",
)

In [None]:
#@title Tokenizer
# Active tokenizer: the pretrained "bert-base-uncased" BertTokenizer.
# A GPT-2 alternative is left disabled:
#   tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Round-trip a sample sentence: show the vocabulary size, the ids produced by
# encode() and the text decode() gives back. The recorded output shows that
# encode() wraps the ids in [CLS] (101) and [SEP] (102).
sample_ids = tokenizer.encode('This is a test')
print(tokenizer.vocab_size)
print(sample_ids)
print(tokenizer.decode(sample_ids))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

30522
[101, 2023, 2003, 1037, 3231, 102]
[CLS] this is a test [SEP]


In [None]:
#@title Load Dataset
def create_tokenized_dataset(dataset, encode=None):
  """Tokenize every text in ``dataset`` and concatenate the ids into one tensor.

  Each text is encoded on its own, so whatever the encoder emits per call
  (with the notebook's BERT tokenizer the recorded output shows a leading 101
  and trailing 102, even for empty lines) appears once per text in the result.

  Args:
    dataset: Iterable of strings, e.g. the "text" column of a split.
    encode: Optional callable mapping one string to a sequence of int token
      ids. When omitted, the notebook-level ``tokenizer.encode`` is used, which
      is the original behaviour.

  Returns:
    A 1-D ``torch.long`` tensor holding the ids of all texts in input order
    (empty when ``dataset`` is empty).
  """
  if encode is None:
    # Looked up at call time so re-running the Tokenizer cell takes effect.
    encode = tokenizer.encode

  token_ids = []
  for text in dataset:
    token_ids.extend(encode(text))

  return torch.tensor(token_ids, dtype=torch.long)

# Assign every text file to a split by its file name: names containing "-01"
# go to test, names containing "-02" go to validation, everything else to
# train. (Presumably these are month suffixes -- confirm against the files.)
train_files = []
val_files = []
test_files = []
for text_file in text_file_names:
  full_path = join(text_file_directory, text_file)

  # Match on the file name only, so a "-01"/"-02" inside a parent directory
  # name can never influence the split.
  if "-01" in text_file:
    test_files.append(full_path)
  elif "-02" in text_file:
    val_files.append(full_path)
  else:
    train_files.append(full_path)


dataset = load_dataset('text', data_files={'train': train_files, 'validate': val_files, 'test': test_files})


def _tokenize_and_save(split, save_path):
  """Preview, tokenize and persist one split; returns the token tensor."""
  # Slice the rows first and pick the column afterwards, so the preview does
  # not need to read the whole text column just to show 100 lines.
  print(split[:100]["text"])
  tokens = create_tokenized_dataset(split["text"])
  torch.save(tokens, save_path)
  return tokens


# Same processing order as before: validation, test, then train (unshuffled).
val_data = _tokenize_and_save(dataset["validate"], 'val_data_all.pt')
test_data = _tokenize_and_save(dataset["test"], 'test_data_all.pt')
train_data = _tokenize_and_save(dataset["train"], 'train_data_all.pt')


# Sanity check: first 100 token ids of each split, then each split's shape.
for split_tokens in (train_data, val_data, test_data):
  print(split_tokens[:100])
for split_tokens in (train_data, val_data, test_data):
  print(split_tokens.shape)

Resolving data files:   0%|          | 0/149 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/113 [00:00<?, ?it/s]

['A grave where skeletons of Africans were found in the cemetery in Campeche, Mexico.', '', 'Archaeologists have found what they think are the oldest remains of slaves brought from Africa to the New World.', '', 'The remains, in a colonial era graveyard in one of the oldest European cities in Mexico, date between the late-16th century and the mid-17th century, not long after Columbus first set foot in the Americas.', '', 'The African origin of the slaves was determined by studying a chemical in their tooth enamel that reveals plant and rock types of their native land. The chemical enters the body through the food chain as nutrients pass from bedrock through soil and water to plants and animals. It is an indelible signature of birthplace, the researchers said, because it can be directly linked to the bedrock of specific locales.', '', 'Researchers examined remains of four individuals from among 180 burials found in a multiethnic burial ground associated with the ruins of a colonial chur

Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors


['Caucho Resin adds PHP Go to top ] Posted by: Emil Kirschner', '', 'Posted on: December 19 2005 09:26 EST', '', "in response to Mark N But won't that (Java specific PHP) take away from the attraction of PHP? One of the biggest reasons people use PHP is because it can run on just Apache (+ mod).", '', "No, it doesn't take away any abstraction. It's just yet another platform on which you php code runs. (6x faster :-) ). Its just like running PHP on top of IIS.", '', '', '', "It doesn't mean you cannot run you php code on apache /mod_php any more.", '', '', '', 'cheers,', '', "Emil No, it doesn't take away any abstraction. It's just yet another platform on which you php code runs. (6x faster :-) ). Its just like running PHP on top of IIS.It doesn't mean you cannot run you php code on apache /mod_php any more.cheers, Reply to this Reply to original", '', 'Caucho Resin adds PHP Go to top ] Posted by: Juozas Baliuka', '', 'Posted on: December 19 2005 10:10 EST', '', 'in response to Emil Kir

In [None]:
# Reload the tokenized splits from disk.
# NOTE(review): the Load Dataset cell saves 'train_data_all.pt',
# 'val_data_all.pt' and 'test_data_all.pt', while this cell reads the names
# without the '_all' suffix -- confirm which files are meant to be used.
train_data, val_data, test_data = [
  torch.load(saved_path)
  for saved_path in ('train_data.pt', 'val_data.pt', 'test_data.pt')
]

# Show the first 100 token ids of every split, then every split's shape.
for loaded_split in (train_data, val_data, test_data):
  print(loaded_split[:100])
for loaded_split in (train_data, val_data, test_data):
  print(loaded_split.shape)

tensor([  101,  3648,  4727,  1999, 12098, 19761,  2553,  3587,  8343,  1999,
         9968,  2044,  1057,  1012,  1055,  1012,  9458,  1005,  1055, 13406,
         2703,  3158,  4315, 22889, 17206,  2001,  4727,  2044,  2108,  8781,
         2011,  2610,  2004,  1037,  7409,  2058,  1996,  5353,  1012,  3141,
         3916,  1024,  4394, 12098, 19761,  9458,  3146,  2136,  2000,  4681,
         3945, 11845,  1037,  5142,  2005,  6813,  2001,  2679,  1037,  5387,
         1999, 17615,  1029,  9123,  1024,  3808, 10247,  2005, 15183,  2115,
         1041,  1011,  5653,  9499,  2015, 12098, 19761,  6041,  4126,  1010,
         2375,  1998,  3425,  2030,  2030,  3443,  2115,  2219,  2030,  2319,
         6460, 16917,  1010, 12098, 19761,  1006, 13229,  1007,  1011,  1011])
tensor([  101,  1037,  6542,  2073, 24365,  1997, 18076,  2020,  2179,  1999,
         1996,  4528,  1999,  3409, 27635,  1010,  3290,  1012,   102,   101,
          102,   101, 19254,  2031,  2179,  2054,  2027,  2228,

In [None]:
# Preview the training tokens, then write them out as five separate files
# named train_all_0.pt ... train_all_4.pt.
print(train_data[:100])
chunks = torch.chunk(train_data, 5)
for part_index, part in enumerate(chunks):
  # torch.chunk returns views of the input; .clone() gives each part its own
  # storage before saving (presumably so a file holds only that part's data
  # -- confirm).
  torch.save(part.clone(), f"train_all_{part_index}.pt")

tensor([  101,  3648,  4727,  1999, 12098, 19761,  2553,  3587,  8343,  1999,
         9968,  2044,  1057,  1012,  1055,  1012,  9458,  1005,  1055, 13406,
         2703,  3158,  4315, 22889, 17206,  2001,  4727,  2044,  2108,  8781,
         2011,  2610,  2004,  1037,  7409,  2058,  1996,  5353,  1012,  3141,
         3916,  1024,  4394, 12098, 19761,  9458,  3146,  2136,  2000,  4681,
         3945, 11845,  1037,  5142,  2005,  6813,  2001,  2679,  1037,  5387,
         1999, 17615,  1029,  9123,  1024,  3808, 10247,  2005, 15183,  2115,
         1041,  1011,  5653,  9499,  2015, 12098, 19761,  6041,  4126,  1010,
         2375,  1998,  3425,  2030,  2030,  3443,  2115,  2219,  2030,  2319,
         6460, 16917,  1010, 12098, 19761,  1006, 13229,  1007,  1011,  1011])
