# Dependencies

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json

# Check GPU availability

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Paths to the openwebtext_100k dataset files

In [None]:
# Arrow file that holds the training text itself.
text_data_file = "openwebtext_100k/openwebtext-100k-train.arrow"
# JSON file with the dataset's metadata.
dataset_file = "openwebtext_100k/dataset_info.json"


# Load dataset metadata

In [None]:
# Parse the dataset's JSON metadata and echo it as a quick sanity check.
with open(dataset_file, mode="r", encoding="utf-8") as info_handle:
    dataset_info = json.loads(info_handle.read())
print(f"Dataset Info: {dataset_info}")

# Load Tokenizer and Model

In [None]:
# Load the pretrained GPT-2 tokenizer and causal language model.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2's tokenizer defines no padding token, so any call that pads
# (e.g. padding="max_length") raises. Reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
# Keep the model config in agreement with the tokenizer's padding id.
model.config.pad_token_id = tokenizer.pad_token_id

# Define Dataset Class

In [None]:
class OpenWebTextDataset(Dataset):
    """Map-style dataset that tokenizes raw text samples on access.

    Every item is a dict with ``input_ids`` and ``attention_mask`` tensors,
    padded/truncated to ``max_length`` tokens.

    Args:
        text_file: Location of the text data; forwarded to ``load_data``.
        tokenizer: Hugging Face style tokenizer, called once per sample.
            If it has no ``pad_token`` but does have an ``eos_token``, the
            EOS token is installed as the pad token (this mutates the
            tokenizer object that was passed in).
        max_length: Fixed token length each sample is padded/truncated to.
    """

    def __init__(self, text_file, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        # Set before load_data() so a real loader may rely on it.
        self.max_length = max_length
        # Tokenizers such as GPT-2's define no pad token, and
        # padding="max_length" raises without one. Fall back to EOS, but
        # only when nothing is configured, so an explicit choice is kept.
        if (
            getattr(tokenizer, "pad_token", None) is None
            and getattr(tokenizer, "eos_token", None) is not None
        ):
            tokenizer.pad_token = tokenizer.eos_token
        self.data = self.load_data(text_file)

    def load_data(self, text_file):
        """Return the list of text samples.

        NOTE: still a placeholder. ``text_file`` is ignored and two
        hard-coded strings are returned; replace with real loading logic
        for the dataset's on-disk format.
        """
        # Placeholder for loading text data (modify based on actual format)
        return ["Sample text 1", "Sample text 2"]  # Replace with actual dataset loading logic

    def __len__(self):
        """Return the number of loaded text samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"``; each has its
            leading dimension removed with ``squeeze(0)``.
        """
        text = self.data[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the leading dimension so the DataLoader can stack samples.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


# Create Dataset

In [None]:
# Build the full dataset, then carve it into an 80/20 train/validation split.
dataset = OpenWebTextDataset(text_data_file, tokenizer)
num_samples = len(dataset)
train_size = int(0.8 * num_samples)
val_size = num_samples - train_size  # whatever the training share leaves over
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Dataloaders


In [None]:
batch_size = 8
# Only the training loader shuffles; validation keeps the default order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
)


# Training Setup

In [None]:
# AdamW over every model parameter, with a step decay of the learning rate:
# multiplied by 0.7 after every 3 calls to scheduler.step().
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5)
scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=3,
    gamma=0.7,
)
# NOTE: the training and evaluation cells read the loss from the model
# output, so this criterion is not referenced by them.
criterion = nn.CrossEntropyLoss()

# Training Loop

In [None]:
# Fine-tune the model for a fixed number of epochs, stepping the LR scheduler
# once per epoch and overwriting a single checkpoint file after each epoch.
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Samples are padded to a fixed length, so passing input_ids straight
        # through as labels would train the model to predict padding. The
        # loss ignores label -100, so mask every padded position with it.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    scheduler.step()
    # max(..., 1) avoids a ZeroDivisionError if the dataloader is empty.
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / max(len(train_dataloader), 1)}")
    torch.save(model.state_dict(), "openwebtext_checkpoint.pth")


# Evaluation

In [None]:
# Compute the mean per-batch language-modeling loss on the validation split,
# with gradients disabled and the model in eval mode.
total_loss = 0.0
model.eval()
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Exclude padded positions from the loss: label -100 is ignored, so
        # the reported value reflects real tokens only.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

# max(..., 1) avoids a ZeroDivisionError if the validation split is empty.
print(f"Validation Loss: {total_loss / max(len(val_dataloader), 1):.4f}")