In [2]:
# from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
# import torch
# from torch.utils.data import DataLoader, Dataset
# from glob import glob
# import os
# from tqdm import tqdm
# import time

# # Initialize the tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# # Retrieve and print the correct vocabulary size from the tokenizer
# correct_vocab_size = tokenizer.vocab_size

# # Update your GPT2 configuration with the correct vocabulary size
# config = GPT2Config(
#     vocab_size=correct_vocab_size,  # Updated size
#     n_positions=1024,
#     n_ctx=1024,
#     n_embd=768,
#     n_layer=12,
#     n_head=12
# )

# # Initialize the model with the updated configuration
# model = GPT2LMHeadModel(config)

# # Define your TextFolderDataset
# class TextFolderDataset(Dataset):
#     def __init__(self, file_directory, file_pattern, max_length):
#         self.filepaths = glob(os.path.join(file_directory, file_pattern))
#         self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#         self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad_token
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.filepaths)

#     def __getitem__(self, idx):
#         with open(self.filepaths[idx], 'r', encoding='utf-8') as file:
#             text = file.read()

#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True, 
#             max_length=self.max_length, 
#             truncation=True, 
#             padding='max_length', 
#             return_tensors='pt'
#         )

#         input_ids = encoding['input_ids'][0]
#         attention_mask = encoding['attention_mask'][0]
#         labels = input_ids.clone()

#         return {
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'labels': labels
#         }

# # Create your dataset and data loader
# file_directory = "openwebtext/"
# file_pattern = "urlsf_subset01-32*"
# max_length = 512
# dataset = TextFolderDataset(file_directory, file_pattern, max_length)
# train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# # Training Loop
# from transformers import AdamW
# from tqdm import tqdm

# optimizer = AdamW(model.parameters(), lr=5e-5)
# accumulation_steps = 2
# num_epochs = 10

# start_time = time.time()

# for epoch in range(tqdm(num_epochs)):
#     total_loss = 0
#     optimizer.zero_grad()
#     for step, batch in enumerate(train_dataloader):
#         inputs = batch['input_ids']
#         labels = batch['labels']
#         outputs = model(inputs, labels=labels)
#         loss = outputs.loss

#         loss = loss / accumulation_steps
#         loss.backward()

#         if (step + 1) % accumulation_steps == 0:
#             optimizer.step()
#             optimizer.zero_grad()

#         total_loss += loss.item()
    
#     avg_loss = total_loss / (len(train_dataloader) / accumulation_steps)
#     print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# end_time = time.time()
# print(f"The time needed for training is {end_time - start_time}")

In [1]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader, Dataset
from glob import glob
import os
from tqdm import tqdm
import time

torch.manual_seed(0)
# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Retrieve the correct vocabulary size from the tokenizer
correct_vocab_size = tokenizer.vocab_size

# Update your GPT2 configuration with the correct vocabulary size
config = GPT2Config(
    vocab_size=correct_vocab_size,  # Updated size
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12
)

# Initialize the model with the updated configuration
model = GPT2LMHeadModel(config)

# Check for CUDA GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Transfer the model to the GPU (if available)
model.to(device)

# Define your TextFolderDataset
# Define your TextFolderDataset
class TextFolderDataset(Dataset):
    def __init__(self, file_directory, file_pattern, max_length):
        self.filepaths = glob(os.path.join(file_directory, file_pattern))
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad_token
        self.max_length = max_length

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        with open(self.filepaths[idx], 'r', encoding='utf-8') as file:
            text = file.read()

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True, 
            max_length=self.max_length, 
            truncation=True, 
            padding='max_length', 
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        labels = input_ids.clone()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Create your dataset and data loader
file_directory = "openwebtext/"
file_pattern = "urlsf_subset01-3[2,3]*"
max_length = 512
dataset = TextFolderDataset(file_directory, file_pattern, max_length)
train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Training Loop
from transformers import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)
accumulation_steps = 2
num_epochs = 50

start_time = time.time()

for epoch in tqdm(range(num_epochs)):
    total_loss = 0
    optimizer.zero_grad()
    for step, batch in enumerate(train_dataloader):
        # Transfer the data to the GPU
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(inputs, labels=labels)
        loss = outputs.loss

        # Normalize loss to account for accumulation
        loss = loss / accumulation_steps
        loss.backward()

        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
    
    avg_loss = total_loss / (len(train_dataloader) / accumulation_steps)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    if avg_loss < 1.5:
        break

end_time = time.time()
print(f"The time needed for training is {end_time - start_time} seconds.")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


  2%|█▋                                                                                  | 1/50 [00:26<21:23, 26.20s/it]

Epoch 1/50, Average Loss: 5.9198


  4%|███▎                                                                                | 2/50 [00:48<19:20, 24.18s/it]

Epoch 2/50, Average Loss: 2.9534


  6%|█████                                                                               | 3/50 [01:11<18:24, 23.51s/it]

Epoch 3/50, Average Loss: 2.8821


  8%|██████▋                                                                             | 4/50 [01:34<17:50, 23.26s/it]

Epoch 4/50, Average Loss: 2.8183


 10%|████████▍                                                                           | 5/50 [01:57<17:17, 23.06s/it]

Epoch 5/50, Average Loss: 2.7591


 12%|██████████                                                                          | 6/50 [02:20<16:52, 23.01s/it]

Epoch 6/50, Average Loss: 2.7092


 14%|███████████▊                                                                        | 7/50 [02:43<16:32, 23.07s/it]

Epoch 7/50, Average Loss: 2.6508


 16%|█████████████▍                                                                      | 8/50 [03:06<16:08, 23.05s/it]

Epoch 8/50, Average Loss: 2.6099


 18%|███████████████                                                                     | 9/50 [03:29<15:44, 23.03s/it]

Epoch 9/50, Average Loss: 2.5546


 20%|████████████████▌                                                                  | 10/50 [03:52<15:20, 23.02s/it]

Epoch 10/50, Average Loss: 2.5002


 22%|██████████████████▎                                                                | 11/50 [04:15<14:54, 22.94s/it]

Epoch 11/50, Average Loss: 2.4358


 24%|███████████████████▉                                                               | 12/50 [04:37<14:28, 22.85s/it]

Epoch 12/50, Average Loss: 2.3464


 26%|█████████████████████▌                                                             | 13/50 [05:00<14:06, 22.89s/it]

Epoch 13/50, Average Loss: 2.2394


 28%|███████████████████████▏                                                           | 14/50 [05:23<13:42, 22.86s/it]

Epoch 14/50, Average Loss: 2.2473


 30%|████████████████████████▉                                                          | 15/50 [05:46<13:16, 22.76s/it]

Epoch 15/50, Average Loss: 2.1698


 32%|██████████████████████████▌                                                        | 16/50 [06:09<12:57, 22.87s/it]

Epoch 16/50, Average Loss: 2.1125


 34%|████████████████████████████▏                                                      | 17/50 [06:31<12:33, 22.85s/it]

Epoch 17/50, Average Loss: 2.0875


 36%|█████████████████████████████▉                                                     | 18/50 [06:54<12:11, 22.87s/it]

Epoch 18/50, Average Loss: 2.0462


 38%|███████████████████████████████▌                                                   | 19/50 [07:17<11:49, 22.89s/it]

Epoch 19/50, Average Loss: 2.0101


 40%|█████████████████████████████████▏                                                 | 20/50 [07:40<11:27, 22.92s/it]

Epoch 20/50, Average Loss: 1.9825


 42%|██████████████████████████████████▊                                                | 21/50 [08:04<11:08, 23.05s/it]

Epoch 21/50, Average Loss: 1.9551


 44%|████████████████████████████████████▌                                              | 22/50 [08:27<10:47, 23.11s/it]

Epoch 22/50, Average Loss: 1.9291


 46%|██████████████████████████████████████▏                                            | 23/50 [08:50<10:22, 23.07s/it]

Epoch 23/50, Average Loss: 1.8969


 48%|███████████████████████████████████████▊                                           | 24/50 [09:13<09:59, 23.07s/it]

Epoch 24/50, Average Loss: 1.8754


 50%|█████████████████████████████████████████▌                                         | 25/50 [09:36<09:36, 23.07s/it]

Epoch 25/50, Average Loss: 1.8461


 52%|███████████████████████████████████████████▏                                       | 26/50 [09:59<09:12, 23.02s/it]

Epoch 26/50, Average Loss: 1.8243


 54%|████████████████████████████████████████████▊                                      | 27/50 [10:22<08:49, 23.01s/it]

Epoch 27/50, Average Loss: 1.8013


 56%|██████████████████████████████████████████████▍                                    | 28/50 [10:45<08:25, 22.96s/it]

Epoch 28/50, Average Loss: 1.7788


 58%|████████████████████████████████████████████████▏                                  | 29/50 [11:08<08:02, 22.98s/it]

Epoch 29/50, Average Loss: 1.7579


 60%|█████████████████████████████████████████████████▊                                 | 30/50 [11:31<07:39, 22.96s/it]

Epoch 30/50, Average Loss: 1.7373


 62%|███████████████████████████████████████████████████▍                               | 31/50 [11:54<07:16, 23.00s/it]

Epoch 31/50, Average Loss: 1.7169


 64%|█████████████████████████████████████████████████████                              | 32/50 [12:17<06:54, 23.03s/it]

Epoch 32/50, Average Loss: 1.6976


 66%|██████████████████████████████████████████████████████▊                            | 33/50 [12:40<06:30, 22.96s/it]

Epoch 33/50, Average Loss: 1.6797


 68%|████████████████████████████████████████████████████████▍                          | 34/50 [13:03<06:07, 22.94s/it]

Epoch 34/50, Average Loss: 1.6573


 70%|██████████████████████████████████████████████████████████                         | 35/50 [13:26<05:44, 22.97s/it]

Epoch 35/50, Average Loss: 1.6408


 72%|███████████████████████████████████████████████████████████▊                       | 36/50 [13:48<05:20, 22.92s/it]

Epoch 36/50, Average Loss: 1.6234


 74%|█████████████████████████████████████████████████████████████▍                     | 37/50 [14:11<04:58, 22.95s/it]

Epoch 37/50, Average Loss: 1.6079


 76%|███████████████████████████████████████████████████████████████                    | 38/50 [14:35<04:35, 22.99s/it]

Epoch 38/50, Average Loss: 1.5859


 78%|████████████████████████████████████████████████████████████████▋                  | 39/50 [14:57<04:12, 22.92s/it]

Epoch 39/50, Average Loss: 1.5707


 80%|██████████████████████████████████████████████████████████████████▍                | 40/50 [15:20<03:49, 22.91s/it]

Epoch 40/50, Average Loss: 1.5580


 82%|████████████████████████████████████████████████████████████████████               | 41/50 [15:43<03:26, 22.92s/it]

Epoch 41/50, Average Loss: 1.5363


 84%|█████████████████████████████████████████████████████████████████████▋             | 42/50 [16:06<03:03, 22.99s/it]

Epoch 42/50, Average Loss: 1.5203


 86%|███████████████████████████████████████████████████████████████████████▍           | 43/50 [16:29<02:40, 22.99s/it]

Epoch 43/50, Average Loss: 1.5094


 88%|█████████████████████████████████████████████████████████████████████████          | 44/50 [16:52<02:17, 22.96s/it]

Epoch 44/50, Average Loss: 1.4916


 90%|██████████████████████████████████████████████████████████████████████████▋        | 45/50 [17:15<01:54, 22.98s/it]

Epoch 45/50, Average Loss: 1.4781


 92%|████████████████████████████████████████████████████████████████████████████▎      | 46/50 [17:38<01:31, 22.94s/it]

Epoch 46/50, Average Loss: 1.4580


 94%|██████████████████████████████████████████████████████████████████████████████     | 47/50 [18:01<01:08, 22.88s/it]

Epoch 47/50, Average Loss: 1.4490


 96%|███████████████████████████████████████████████████████████████████████████████▋   | 48/50 [18:23<00:45, 22.69s/it]

Epoch 48/50, Average Loss: 1.4349


 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [18:45<00:22, 22.57s/it]

Epoch 49/50, Average Loss: 1.4168


100%|███████████████████████████████████████████████████████████████████████████████████| 50/50 [19:07<00:00, 22.96s/it]

Epoch 50/50, Average Loss: 1.4053
The time needed for training is 1147.9099595546722 seconds.



