In [1]:
import importlib
import utils
importlib.reload(utils)

  from .autonotebook import tqdm as notebook_tqdm


<module 'utils' from '/home/zhaox/ProcessGPT/utils.py'>

In [2]:
import argparse
import logging
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import wandb
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GPTNeoConfig, 
    GPTNeoForCausalLM
)

from model import GPT
from utils import *  
from huggingface_hub import login
from sklearn.model_selection import train_test_split

login(token="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")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/zhaox/.cache/huggingface/token
Login successful


In [3]:
seed = 3407
epochs = 2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cfg_param = "33M"
cfg = load_config(f"configs/config-{cfg_param}.json")
batch_size = 8
window_size = cfg["window_size"]
lr = cfg["learning_rate"]

In [4]:
# set up logger

current_time = datetime.now().strftime("%m%d_%H%M%S")
log_filename = f"logs/training_{cfg_param}_{current_time}.log"
logging.basicConfig(filename=log_filename, level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

In [5]:
# load dataset and tokenizer

model_name = 'roneneldan/TinyStories'
dataset = load_dataset('2Xm7/25PofTinyStories')
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token



In [6]:
# instantiate dataloaders

train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset['validation'], batch_size=batch_size, shuffle=True)

In [7]:
# instantiate model and optimizer

config = GPTNeoConfig(
    vocab_size=cfg.get('vocab_size', tokenizer.vocab_size),
    max_position_embeddings=cfg.get('max_position_embeddings', 2048),
    hidden_size=cfg.get('hidden_size', 1024),
    num_layers=cfg.get('num_layers', 8),
    num_heads=cfg.get('num_heads', 16),
    activation_function=cfg.get('activation_function', 'gelu_new'),
    attention_types=cfg.get('attention_types', [['global', 'local'], 4]),
    attention_layers=cfg.get('attention_layers', ['global', 'local'] * 4),
    bos_token_id=cfg.get('bos_token_id', tokenizer.bos_token_id),
    eos_token_id=cfg.get('eos_token_id', tokenizer.eos_token_id),
    layer_norm_epsilon=cfg.get('layer_norm_epsilon', 1e-5),
    initializer_range=cfg.get('initializer_range', 0.02),
    use_cache=cfg.get('use_cache', True),
    attention_dropout=cfg.get('attention_dropout', 0.0),
    resid_dropout=cfg.get('resid_dropout', 0.0),
    embed_dropout=cfg.get('embed_dropout', 0.0),
    pad_token_id=tokenizer.pad_token_id,
)

# Instantiate model
model = GPTNeoForCausalLM(config)

In [8]:
# adjust tokenizer and embeddings 

if tokenizer.vocab_size != config.vocab_size:
    tokenizer.add_tokens(['<|endoftext|>'])
    model.resize_token_embeddings(len(tokenizer))

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model.to(device)
optim = torch.optim.AdamW(
    model.parameters(),
    lr=cfg["learning_rate"],
    betas=(cfg["adam_beta1"], cfg["adam_beta2"]),
    weight_decay=cfg["weight_decay"]
)

gradient_accumulation_steps = cfg["gradient_accumulation_steps"]

updates, start_epoch, start_step = 0, 0, 0
model_dir = f"models/model_{cfg_param}_{current_time}"
resume_training = False
if resume_training:
    logging.info(f"Resuming training from {model_dir}")
    model, tokenizer, updates, start_epoch, start_step = load_checkpoint(model_dir, optimizer=optim)
    model.to(device)

In [9]:
# setup Weights & Biases
run = wandb.init(
    project="gpt-tinystories-25P",
    name=f"gpt-tinystories-25P-{cfg_param}-{current_time}",
    config={
        "cfg_param": cfg_param,
        "learning_rate": lr,
        "batch_size": batch_size,
        "model_dir": model_dir,
        "log_filename": log_filename,
        "seed": seed,
        "epochs": epochs
    },
)
logging.info(f"cfg_param: {cfg_param}, lr: {lr}, batch_size: {batch_size}, "
             f"model_dir: {model_dir}, log_filename: {log_filename}, "
             f"seed: {seed}, epochs: {epochs}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mxm7zhao[0m ([33mxm7zhao-lafayette-college[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
for epoch in range(start_epoch, epochs):
    logging.info(f"Epoch: {epoch + 1}")
    model.train()

    if epoch > start_epoch:
        start_step = 0

    for step, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"), start=start_step):
        optim.zero_grad()
        inputs = tokenizer(batch['text'], padding=True, return_tensors='pt',
                           max_length=window_size, truncation=True)['input_ids'].to(device)
        outputs = model(input_ids=inputs, labels=inputs)
        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        loss.backward()
        optim.step()
        updates += 1

        if updates % 200 == 0:
            validation_loss = estimate_loss(model, tokenizer, valid_loader, device)
            tqdm.write(f"Epoch {epoch + 1}, Update {updates}, Validation Loss: {validation_loss:.4f}")
            logging.info(f"Epoch {epoch + 1}, Update {updates}, Validation Loss: {validation_loss:.4f}")
            wandb.log({"train_loss": loss.item(), "val_loss": validation_loss}, step=updates)

        if updates % 2000 == 0:
            save_checkpoint(model, tokenizer, optim, updates, model_dir, epoch, step)
            logging.info(f"Model checkpoint saved at update {updates}")

    start_step = 0

    logging.info("Epoch training complete")
    logging.info("Computing final validation loss...")

    validation_loss = estimate_loss(model, tokenizer, valid_loader, device)
    logging.info(f"Final validation loss after epoch {epoch + 1}: {validation_loss:.4f}")
    wandb.log({"final_val_loss": validation_loss}, step=updates)

    save_checkpoint(model, tokenizer, optim, updates, model_dir, epoch, step=0)
    logging.info(f"Model saved after epoch {epoch + 1}")

    # log the model to wandb
    model_artifact = wandb.Artifact(f'model_{cfg_param}_{current_time}', type='model')
    model_artifact.add_dir(model_dir)
    wandb.log_artifact(model_artifact)
    logging.info("Model artifact logged to wandb")

wandb.finish()

Training Epoch 1:   0%|                                       | 90/62159 [00:55<10:55:56,  1.58it/s]

In [2]:
from utils import *  


In [7]:
# test with trained model

model_dir = "models/model_28M_1029_161840"  

prompt = "What color do you like"

test_language_modeling(model_dir, prompt, device='cuda')

Generated Text:
--------------------------------------------------------------------------------
What color do you like the park old man, "Why are you, Tim, and said, I am a big, but he was very happy. He had aummy.
Once upon a time, there was a little girl named Tim. Tim loved to play with his mom. One day, he would run and saw a small, so he could not find a tree. The bird was sad. She was not know what was so happy and the bird. They had to be friends. It was too. But, she was happy to the cat. 



In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model = AutoModelForCausalLM.from_pretrained('roneneldan/TinyStories-33M')
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
prompt = "what school do you like"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate completion
output = model.generate(input_ids, max_length = 1000, num_beams=1)

# Decode the completion
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


what school do you like?"

The teacher smiled and said, "I like to learn new things every day. I like to write and draw pictures. I also like to teach other children about the world."

The little girl was excited. She wanted to learn more about the world. She asked the teacher, "Can I learn more about the world?"

The teacher said, "Of course! You can learn anything you want. Just remember to be kind and curious and explore the world."

The little girl smiled and said, "I will! I want to learn more!"

And so, the little girl went off to learn more about the world. She was excited to learn more and explore the world with her new teacher.

