In [1]:
import torch
from train_model import *
from model import BigramLanguageModel
# Create DataLoader
from torch.utils.data import DataLoader
import json

In [2]:
# Seed PyTorch's random number generator.
torch.manual_seed(1337)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Two hyperparameter presets: a smaller configuration for CPU runs and a
# larger one for CUDA runs.
if device == 'cpu':
    # Model shape
    n_embed, n_head, n_layer = 128, 4, 4
    block_size = 128
    dropout = 0.1
    # Optimisation
    batch_size = 32
    learning_rate = 2e-4
    max_iteration = 3000
    # Evaluation
    eval_interval, eval_iters = 300, 100
else:
    # Model shape
    n_embed, n_head, n_layer = 384, 6, 6
    block_size = 192
    dropout = 0.2
    # Optimisation
    batch_size = 64
    learning_rate = 1e-4
    max_iteration = 5000
    # Evaluation
    eval_interval, eval_iters = 500, 200

Using device: cpu


In [3]:
# Location of the chat dataset; the file is read line by line and each line is
# parsed as one JSON object.
file_path = "reddit_chat.jsonl"

Read the JSONL file and format each conversation as Human/Assistant turns

In [4]:
# Load the chat dataset: one JSON object per line, each carrying a list of
# role-tagged messages under the 'message' key. Every conversation becomes a
# single string with one "Human: ..." / "Assistant: ..." line per message.
conversations = []
# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would reject with a JSONDecodeError.
        if not line.strip():
            continue
        data = json.loads(line)
        messages = data['message']

        # Any role other than 'user' is rendered as an Assistant turn.
        turns = []
        for msg in messages:
            speaker = "Human" if msg['role'] == 'user' else "Assistant"
            turns.append(f"{speaker}: {msg['content']}")
        # Joining with newlines and stripping gives the same text as appending
        # "...\n" per turn and stripping the result.
        formatted = "\n".join(turns).strip()
        conversations.append(formatted)

Split the conversations into train and test sets

In [5]:
# Partition the formatted conversations into a training and a test split.
train_data, test_data = split_conversations(conversations)

# Print each split, with a 20-underscore rule between the two.
rule = "_" * 20
print("Train --> ", train_data)
print(rule)
print("Test --> ", test_data)

Train -->  Dataset({
    features: ['text'],
    num_rows: 47979
})
____________________
Test -->  Dataset({
    features: ['text'],
    num_rows: 5331
})


Tokenizer: encode each conversation as a sequence of integer token IDs

In [6]:
# The encoder is constructed from the text of every conversation (train and
# test together), joined with newlines.
all_text = "\n".join(conversations)
data_encoder = EncodingDecoding(all_text)
buildVocabulary = data_encoder.stoi

# Encode every conversation of each split.
train_encoded = list(map(data_encoder.encode, train_data['text']))
test_encoded = list(map(data_encoder.encode, test_data['text']))

# Summarise the encoded training lengths and compare them with block_size.
train_lengths = list(map(len, train_encoded))
shortest, longest = min(train_lengths), max(train_lengths)
mean_length = sum(train_lengths) / len(train_lengths)
over_block = sum(length > block_size for length in train_lengths)
print(f"Train sequences - Min: {shortest}, Max: {longest}, Avg: {mean_length:.1f}")
print(f"Block size: {block_size}")
print(f"Sequences longer than block_size: {over_block}")

Train sequences - Min: 47, Max: 478, Avg: 159.5
Block size: 128
Sequences longer than block_size: 33786


Sanity check: encode/decode round trip on one training sample (currently commented out)

In [7]:
# sample_text = train_data['text'][0]
# print("Original text:\n", sample_text)

# sample_encoded = data_encoder.encode(sample_text)
# print("\nEncoded (as integers):\n", sample_encoded)

# sample_decoded = data_encoder.decode(sample_encoded)
# print("\nDecoded (as text):\n", sample_decoded)

Fine-tune the model

In [8]:
from train_model import CustomDataset, collate_fn

# Convert each encoded conversation into a 1-D int64 tensor.
train_tensors = [torch.tensor(ids, dtype=torch.long) for ids in train_encoded]
test_tensors = [torch.tensor(ids, dtype=torch.long) for ids in test_encoded]

# Wrap the tensors in the project's dataset class, passing the model's block size.
train_dataset = CustomDataset(train_tensors, block_size=block_size)
test_dataset = CustomDataset(test_tensors, block_size=block_size)

# Both loaders share the batch size and collate function; only the training
# loader shuffles.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [9]:
# Vocabulary size is the number of entries in the encoder's string-to-index map.
vocab_size = len(data_encoder.stoi)
print(f"Vocabulary size: {vocab_size}")

# Collect the constructor arguments in one place, then build the model.
model_config = dict(
    vocab_size=vocab_size,
    n_embed=n_embed,
    block_size=block_size,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    device=device,
)
model = BigramLanguageModel(**model_config)
model.to(device)

# Total number of elements across all model parameters.
param_count = sum(weight.numel() for weight in model.parameters())
print(f"Model initialized with {param_count} parameters")

Vocabulary size: 973
Model initialized with 1058253 parameters


In [10]:
# Train with AdamW for exactly `max_iteration` optimizer steps, re-iterating
# over `train_loader` as often as needed. Every `eval_interval` steps, print
# the latest training-batch loss and the mean loss over `test_loader`.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# With an empty loader the inner `for` never runs, `iteration` never advances
# and the outer `while` would spin forever -- fail fast instead.
if max_iteration > 0 and len(train_loader) == 0:
    raise ValueError("train_loader yields no batches; nothing to train on")

model.train()
iteration = 0
while iteration < max_iteration:
    for batch in train_loader:
        # Stop mid-epoch once the step budget is used up.
        if iteration >= max_iteration:
            break

        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        logits, loss = model(input_ids, targets=labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        iteration += 1

        # `iteration` is already >= 1 here, so a single modulo test decides
        # whether this step reports and evaluates.
        if iteration % eval_interval != 0:
            continue

        # This is the loss of the most recent training batch only.
        print(f"Iteration {iteration}: loss = {loss.item():.4f}")

        # NOTE(review): `eval_iters` is set in the config cell but is not used
        # here; each evaluation runs over the whole test_loader -- confirm
        # that this is intended.
        model.eval()
        test_losses = []
        with torch.no_grad():
            for test_batch in test_loader:
                test_input = test_batch['input_ids'].to(device)
                test_labels = test_batch['labels'].to(device)
                _, test_loss = model(test_input, targets=test_labels)
                test_losses.append(test_loss.item())
        if test_losses:
            # Unweighted mean of the per-batch losses.
            avg_test_loss = sum(test_losses) / len(test_losses)
            print(f"  Test loss: {avg_test_loss:.4f}")
        else:
            # Avoid dividing by zero when the test loader has no batches.
            print("  Test loss: n/a (test_loader yielded no batches)")
        model.train()
print("Training complete!")


KeyboardInterrupt: 