In [None]:
import yaml
import json
from typing import Any, Dict
import sys
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def load_yaml(file_path: str) -> Dict[str, Any]:
    """
    Read a YAML document from disk and return its parsed content.

    The file is opened as UTF-8 text and parsed with ``yaml.safe_load``.
    Failures are logged through the module logger and re-raised unchanged.

    Args:
        file_path (str): Location of the YAML file to read.

    Returns:
        Dict[str, Any]: The value produced by ``yaml.safe_load`` for the document.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        yaml.YAMLError: If the document cannot be parsed.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as stream:
            parsed = yaml.safe_load(stream)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except yaml.YAMLError as parse_error:
        logger.error(f"Error parsing YAML file: {parse_error}")
        raise
    return parsed


def save_json(data: Dict[str, Any], file_path: str) -> None:
    """
    Save JSON data to a file.

    The data is serialized to a string *before* the target file is opened.
    Opening with mode ``"w"`` truncates the file immediately, so serializing
    first guarantees that a value JSON cannot encode (for example a date
    parsed from YAML) does not wipe an existing file or leave a partial one.

    Args:
        data (Dict[str, Any]): Data to be saved as JSON.
        file_path (str): Path to the output JSON file.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized to JSON
            (raised before the file is touched).
        IOError: If there is an error writing to the file.
    """
    # Serialization errors propagate to the caller untouched, as before.
    serialized = json.dumps(data, indent=2)

    try:
        with open(file_path, "w", encoding="utf-8") as json_file:
            json_file.write(serialized)
        logger.info(f"JSON data successfully saved to {file_path}")
    except IOError as e:
        logger.error(f"Error writing JSON file: {e}")
        raise


def convert_yaml_to_json(yaml_file_path: str, json_file_path: str) -> None:
    """
    Read a YAML file and write its content out as JSON.

    Args:
        yaml_file_path (str): Source YAML file.
        json_file_path (str): Destination JSON file.

    Raises:
        Exception: Any error raised while loading or saving is logged and
            then re-raised unchanged.
    """
    try:
        parsed_config = load_yaml(yaml_file_path)
        logger.info("YAML data loaded successfully.")
        save_json(parsed_config, json_file_path)
    except Exception as conversion_error:
        logger.error(f"An error occurred during conversion: {conversion_error}")
        raise


if __name__ == "__main__":
    # Source YAML config and the JSON file derived from it
    input_yaml_file, output_json_file = (
        "config_smollm2_135M.yaml",
        "config_smollm2_135M.json",
    )

    # Run the conversion; any failure (already logged) becomes exit status 1
    try:
        convert_yaml_to_json(input_yaml_file, output_json_file)
    except Exception:
        raise SystemExit(1)

In [None]:
import json

def load_json(file_path: str) -> dict:
    """
    Read a JSON document from disk and return the parsed result.

    Errors are reported on stdout and then re-raised unchanged.

    Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        dict: The value produced by ``json.load`` for the document.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        raise
    except json.JSONDecodeError as decode_error:
        print(f"Error: Invalid JSON format in {file_path} - {decode_error}")
        raise
    return parsed

def print_model_architecture(model_config: dict) -> None:
    """
    Print a fixed list of architecture settings from a model configuration.

    Each setting is printed as ``<label>: <value>``; keys absent from
    ``model_config`` are shown as ``None``.

    Args:
        model_config (dict): The model configuration dictionary.
    """
    # (label, configuration key) pairs in the order they are displayed
    fields = (
        ("Hidden Size", "hidden_size"),
        ("Number of Hidden Layers", "num_hidden_layers"),
        ("Number of Attention Heads", "num_attention_heads"),
        ("Number of Key-Value Heads", "num_key_value_heads"),
        ("Intermediate Size", "intermediate_size"),
        ("Hidden Activation", "hidden_act"),
        ("Maximum Position Embeddings", "max_position_embeddings"),
        ("RMS Norm Epsilon", "rms_norm_eps"),
        ("Vocabulary Size", "vocab_size"),
        ("Use Cache", "use_cache"),
        ("Tie Word Embeddings", "tie_word_embeddings"),
        ("Initializer Range", "initializer_range"),
        ("BOS Token ID", "bos_token_id"),
        ("EOS Token ID", "eos_token_id"),
        ("Pad Token ID", "pad_token_id"),
        ("Pretraining TP", "pretraining_tp"),
        ("Rope Theta", "rope_theta"),
        ("Rope Interleaved", "rope_interleaved"),
        ("Rope Scaling", "rope_scaling"),
        ("Is LLaMA Config", "is_llama_config"),
    )

    print("Model Architecture:")
    print("-------------------")
    for label, key in fields:
        print(f"{label}: {model_config.get(key)}")

def main():
    """
    Load the converted configuration file and print its model architecture.

    Prints an error message instead of raising when the file cannot be
    loaded or contains no ``model.model_config`` section.
    """
    json_file_path = "config_smollm2_135M.json"

    # Bail out early if the file cannot be read or parsed
    try:
        json_data = load_json(json_file_path)
    except Exception as e:
        print(f"Failed to load JSON file: {e}")
        return

    # The architecture settings live under model -> model_config
    model_config = json_data.get("model", {}).get("model_config", {})
    if not model_config:
        print("Error: Model configuration not found in the JSON file.")
        return

    print_model_architecture(model_config)

if __name__ == "__main__":
    main()

Model Architecture:
-------------------
Hidden Size: 576
Number of Hidden Layers: 30
Number of Attention Heads: 9
Number of Key-Value Heads: 3
Intermediate Size: 1536
Hidden Activation: silu
Maximum Position Embeddings: 2048
RMS Norm Epsilon: 1e-05
Vocabulary Size: 49152
Use Cache: True
Tie Word Embeddings: True
Initializer Range: 0.041666666666666664
BOS Token ID: 0
EOS Token ID: 0
Pad Token ID: None
Pretraining TP: 1
Rope Theta: 10000.0
Rope Interleaved: False
Rope Scaling: None
Is LLaMA Config: True


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

class RMSNorm(nn.Module):
    """
    Root Mean Square Layer Normalization (RMSNorm).

    Scales the input by the reciprocal root-mean-square of its last
    dimension and multiplies by a learnable per-feature weight
    (initialised to ones). No mean subtraction and no bias are applied.
    """
    def __init__(self, hidden_size: int, eps: float = 1e-5):
        super().__init__()
        # eps is added to the mean square before the reciprocal square root
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` over its last dimension and apply the learned scale."""
        mean_square = torch.mean(torch.pow(x, 2), dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalized = x * inv_rms
        return self.weight * normalized

class RotaryPositionalEmbedding(nn.Module):
    """
    Rotary Positional Embedding (RoPE) for transformers.

    Rotates each (even, odd) feature pair of the last dimension by an angle
    that depends on the token position and the pair index.
    """
    def __init__(self, dim: int, theta: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.theta = theta

    def forward(self, x: torch.Tensor, seq_len: int) -> torch.Tensor:
        """
        Return a copy of ``x`` with rotary positional embeddings applied.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, num_heads, head_dim).
            seq_len (int): Sequence length. The value actually used is read
                from ``x.shape``; this argument is overridden by it.

        Returns:
            torch.Tensor: Tensor of the same shape as ``x``; ``x`` itself is not modified.
        """
        _, seq_len, _, head_dim = x.shape

        # Column vector of positions 0 .. seq_len - 1
        positions = torch.arange(seq_len, dtype=torch.float32, device=x.device)[:, None]

        # One frequency per feature pair: theta ** (-pair_offset / head_dim)
        exponent_scale = -(torch.log(torch.tensor(self.theta)) / head_dim)
        pair_offsets = torch.arange(0, head_dim, 2, dtype=torch.float32, device=x.device)
        inv_freq = torch.exp(pair_offsets * exponent_scale)

        # Rotation angles, broadcastable as (1, seq_len, 1, head_dim // 2)
        angles = positions * inv_freq
        sin = torch.sin(angles)[None, :, None, :]
        cos = torch.cos(angles)[None, :, None, :]

        # 2-D rotation of every (even, odd) pair, written into a copy of x
        even, odd = x[..., 0::2], x[..., 1::2]
        rotated = x.clone()
        rotated[..., 0::2] = even * cos - odd * sin
        rotated[..., 1::2] = odd * cos + even * sin

        return rotated

from torch.utils.checkpoint import checkpoint

class TransformerBlock(nn.Module):
    """
    A single pre-norm decoder block: grouped-query self-attention followed by
    a gated feed-forward network, each wrapped in a residual connection.

    Attention is computed over the sequence dimension with a causal mask by
    default. Parameter names and shapes are unchanged from the previous
    implementation, so existing state dicts still load; weights trained with
    the earlier attention layout should be retrained, because the attention
    computation itself is different.
    """
    def __init__(
        self,
        hidden_size: int,
        num_attention_heads: int,
        intermediate_size: int,
        num_key_value_heads: int,
        rms_norm_eps: float,
        hidden_act: str = "silu",
    ):
        super().__init__()

        # Validate the head layout before deriving any sizes from it
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_attention_heads ({num_attention_heads})"
            )
        if num_attention_heads % num_key_value_heads != 0:
            raise ValueError(
                f"num_attention_heads ({num_attention_heads}) must be divisible by "
                f"num_key_value_heads ({num_key_value_heads})"
            )

        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = hidden_size // num_attention_heads

        # Self-attention layers (keys/values use fewer heads than queries)
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim)
        self.v_proj = nn.Linear(hidden_size, num_key_value_heads * self.head_dim)
        self.o_proj = nn.Linear(hidden_size, hidden_size)

        # Feed-forward layers
        self.gate_proj = nn.Linear(hidden_size, intermediate_size)
        self.up_proj = nn.Linear(hidden_size, intermediate_size)
        self.down_proj = nn.Linear(intermediate_size, hidden_size)

        # Normalization layers
        self.input_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
        self.post_attention_norm = RMSNorm(hidden_size, eps=rms_norm_eps)

        # Activation function (anything other than "silu" falls back to GELU)
        self.act = nn.SiLU() if hidden_act == "silu" else nn.GELU()

        # Rotary positional embedding
        self.rope = RotaryPositionalEmbedding(self.head_dim)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Run the block, using gradient checkpointing whenever gradients are recorded.

        Args:
            x (torch.Tensor): Hidden states of shape (batch_size, seq_len, hidden_size).
            attention_mask (Optional[torch.Tensor]): See ``_forward``.

        Returns:
            torch.Tensor: Hidden states with the same shape as ``x``.
        """
        if torch.is_grad_enabled():
            # Activations are recomputed during backward to save memory.
            # The function is resolved through its module path so that a
            # notebook-level variable named `checkpoint` cannot shadow it.
            return torch.utils.checkpoint.checkpoint(
                self._forward, x, attention_mask, use_reentrant=False
            )
        # Nothing to recompute without gradients (e.g. during generation)
        return self._forward(x, attention_mask)

    def _forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Compute self-attention and the feed-forward network with residuals.

        Args:
            x (torch.Tensor): Hidden states of shape (batch_size, seq_len, hidden_size).
            attention_mask (Optional[torch.Tensor]): Mask forwarded unchanged to
                ``F.scaled_dot_product_attention``; it must be broadcastable to
                (batch_size, num_heads, seq_len, seq_len) and already encode
                causality. When ``None``, a causal mask is applied.

        Returns:
            torch.Tensor: Hidden states with the same shape as ``x``.
        """
        # Self-attention
        residual = x
        x = self.input_norm(x)

        batch_size, seq_len, _ = x.shape

        # Split the projections into heads: (batch, seq, heads, head_dim)
        q = self.q_proj(x).view(batch_size, seq_len, self.num_attention_heads, self.head_dim)
        k = self.k_proj(x).view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        v = self.v_proj(x).view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)

        # The rotary module works on the (batch, seq, heads, head_dim) layout
        q = self.rope(q, seq_len)
        k = self.rope(k, seq_len)

        # scaled_dot_product_attention attends over the second-to-last axis,
        # so move heads in front of the sequence: (batch, heads, seq, head_dim)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Grouped-query attention: each key/value head serves a group of query heads
        kv_groups = self.num_attention_heads // self.num_key_value_heads
        if kv_groups > 1:
            k = k.repeat_interleave(kv_groups, dim=1)
            v = v.repeat_interleave(kv_groups, dim=1)

        # Without an explicit mask, positions may only attend to themselves and the past
        attn_output = F.scaled_dot_product_attention(
            q, k, v, attn_mask=attention_mask, is_causal=attention_mask is None
        )

        # Back to (batch, seq, hidden): heads are merged per position
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        # Add residual connection
        x = residual + attn_output

        # Feed-forward network: activation(gate) * up, projected back down
        residual = x
        x = self.post_attention_norm(x)
        gate = self.act(self.gate_proj(x))
        up = self.up_proj(x)
        ff_output = self.down_proj(gate * up)

        # Add residual connection
        x = residual + ff_output

        return x

class TransformerModel(nn.Module):
    """
    The full transformer language model: token and learned position
    embeddings, a stack of ``TransformerBlock`` layers, a final RMSNorm and a
    linear head producing vocabulary logits.
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_size: int,
        num_hidden_layers: int,
        num_attention_heads: int,
        intermediate_size: int,
        num_key_value_heads: int,
        max_position_embeddings: int,
        rms_norm_eps: float,
        hidden_act: str = "silu",
        tie_word_embeddings: bool = True,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.max_position_embeddings = max_position_embeddings

        # Embedding layers
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.embed_positions = nn.Embedding(max_position_embeddings, hidden_size)

        # Transformer blocks
        self.layers = nn.ModuleList([
            TransformerBlock(
                hidden_size=hidden_size,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                num_key_value_heads=num_key_value_heads,
                rms_norm_eps=rms_norm_eps,
                hidden_act=hidden_act,
            )
            for _ in range(num_hidden_layers)
        ])

        # Final normalization layer
        self.final_norm = RMSNorm(hidden_size, eps=rms_norm_eps)

        # Output layer (shares its weight with the input embeddings if requested)
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        if tie_word_embeddings:
            self.lm_head.weight = self.embed_tokens.weight

    def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Compute next-token logits for every position.

        Args:
            input_ids (torch.Tensor): Token IDs of shape (batch_size, seq_len).
            attention_mask (Optional[torch.Tensor]): Passed unchanged to every layer.

        Returns:
            torch.Tensor: Logits of shape (batch_size, seq_len, vocab_size).

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_position_embeddings``.
        """
        seq_len = input_ids.size(1)
        # Fail with a readable message instead of an out-of-range embedding
        # lookup (which surfaces as an opaque device-side error on GPU).
        if seq_len > self.max_position_embeddings:
            raise ValueError(
                f"Sequence length ({seq_len}) exceeds max_position_embeddings "
                f"({self.max_position_embeddings})"
            )

        # Embed tokens and positions; position embeddings broadcast over the batch
        position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
        token_embeddings = self.embed_tokens(input_ids)
        position_embeddings = self.embed_positions(position_ids)
        x = token_embeddings + position_embeddings

        # Pass through transformer layers
        for layer in self.layers:
            x = layer(x, attention_mask)

        # Final normalization
        x = self.final_norm(x)

        # Output logits
        logits = self.lm_head(x)
        return logits

    def generate(
        self,
        input_ids: torch.Tensor,
        max_length: int = 50,
        temperature: float = 1.0,
        top_k: int = 50,
        do_sample: bool = True,
    ) -> torch.Tensor:
        """
        Generate text autoregressively.

        The model is switched to eval mode while generating and restored to
        its previous train/eval mode afterwards.

        Args:
            input_ids (torch.Tensor): Input token IDs of shape (batch_size, seq_len).
            max_length (int): Maximum total length (prompt plus generated tokens).
            temperature (float): Sampling temperature, must be > 0. Higher values mean more random sampling.
            top_k (int): Only the top-k tokens are considered; values above the
                vocabulary size are clamped, and ``top_k <= 0`` disables the filter.
            do_sample (bool): Whether to sample from the distribution or take the argmax.

        Returns:
            torch.Tensor: Token IDs of shape (batch_size, max_length); if the
            prompt is already at least ``max_length`` long it is returned unchanged.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_length - input_ids.size(1)):
                    # Only the most recent tokens that fit the position table are fed back
                    context = input_ids[:, -self.max_position_embeddings:]

                    # Logits for the last token, rescaled by the temperature
                    logits = self(context)[:, -1, :] / temperature

                    # Top-k filtering: everything below the k-th largest logit is excluded
                    if top_k > 0:
                        k = min(top_k, logits.size(-1))
                        kth_value = torch.topk(logits, k).values[:, -1].unsqueeze(-1)
                        logits = logits.masked_fill(logits < kth_value, -float("Inf"))

                    # Convert logits to probabilities
                    probs = F.softmax(logits, dim=-1)

                    # Sample or take the argmax
                    if do_sample:
                        next_token = torch.multinomial(probs, num_samples=1)
                    else:
                        next_token = torch.argmax(probs, dim=-1, keepdim=True)

                    # Append the next token to the running sequence
                    input_ids = torch.cat([input_ids, next_token], dim=-1)
        finally:
            # Do not leave a model that was training stuck in eval mode
            self.train(was_training)

        return input_ids

# Create the model based on the configuration
def create_model_from_config(config: dict) -> TransformerModel:
    """
    Build a ``TransformerModel`` from a loaded configuration dictionary.

    Args:
        config (dict): Configuration whose ``config["model"]["model_config"]``
            section holds the architecture settings.

    Returns:
        TransformerModel: A newly constructed model.

    Raises:
        KeyError: If a required setting is missing from the configuration.
    """
    model_config = config["model"]["model_config"]

    # Settings forwarded one-to-one as constructor keyword arguments
    forwarded_keys = (
        "vocab_size",
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "intermediate_size",
        "num_key_value_heads",
        "max_position_embeddings",
        "rms_norm_eps",
        "hidden_act",
        "tie_word_embeddings",
    )
    constructor_kwargs = {key: model_config[key] for key in forwarded_keys}
    return TransformerModel(**constructor_kwargs)

# Example usage
if __name__ == "__main__":
    import json

    # Read the converted configuration from disk
    with open("config_smollm2_135M.json", mode="r") as f:
        config = json.loads(f.read())

    # Build the network described by the configuration and show its layout
    model = create_model_from_config(config)
    print(model)

TransformerModel(
  (embed_tokens): Embedding(49152, 576)
  (embed_positions): Embedding(2048, 576)
  (layers): ModuleList(
    (0-29): 30 x TransformerBlock(
      (q_proj): Linear(in_features=576, out_features=576, bias=True)
      (k_proj): Linear(in_features=576, out_features=192, bias=True)
      (v_proj): Linear(in_features=576, out_features=192, bias=True)
      (o_proj): Linear(in_features=576, out_features=576, bias=True)
      (gate_proj): Linear(in_features=576, out_features=1536, bias=True)
      (up_proj): Linear(in_features=576, out_features=1536, bias=True)
      (down_proj): Linear(in_features=1536, out_features=576, bias=True)
      (input_norm): RMSNorm()
      (post_attention_norm): RMSNorm()
      (act): SiLU()
      (rope): RotaryPositionalEmbedding()
    )
  )
  (final_norm): RMSNorm()
  (lm_head): Linear(in_features=576, out_features=49152, bias=False)
)


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
# from model import create_model_from_config
import json
import os
from torch.cuda.amp import GradScaler, autocast

# Run configuration
CONFIG_FILE = "config_smollm2_135M.json"
CHECKPOINT_DIR = "checkpoints"
BATCH_SIZE = 4  # Reduced batch size
SEQ_LENGTH = 1024  # Reduced sequence length
GRADIENT_ACCUMULATION_STEPS = 1
PREDICTION_INTERVAL = 500
TRAIN_STEPS_PHASE_1 = 5000
TRAIN_STEPS_PHASE_2 = 50

# Read the JSON configuration produced by the conversion cell
with open(CONFIG_FILE, mode="r") as f:
    config = json.loads(f.read())

# Prefer the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer named in the configuration
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"]["tokenizer_name_or_path"])

# Model built from the configuration and moved to the chosen device
model = create_model_from_config(config).to(device)

# AdamW with hyper-parameters taken from the "optimizer" section of the configuration
optimizer_section = config["optimizer"]
adam_settings = optimizer_section["optimizer_factory"]
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=optimizer_section["learning_rate_scheduler"]["learning_rate"],
    betas=(adam_settings["adam_beta1"], adam_settings["adam_beta2"]),
    eps=adam_settings["adam_eps"],
    weight_decay=optimizer_section["weight_decay"],
)

# Token-level cross-entropy loss
loss_fn = torch.nn.CrossEntropyLoss()

# Loss scaler used for mixed-precision training
scaler = GradScaler()

# Dataset class
class TextDataset(Dataset):
    """
    Next-token-prediction dataset built from a single text file.

    The whole file is tokenized once and cut into consecutive windows of
    ``seq_length`` tokens. Each sample is ``(input_ids, labels)`` where
    ``labels`` is the same window shifted one token to the right.
    """
    def __init__(self, file_path, tokenizer, seq_length):
        """
        Args:
            file_path: Path to a UTF-8 text file.
            tokenizer: Object whose ``encode(text)`` returns a sequence of token IDs.
            seq_length (int): Number of tokens per sample.
        """
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        with open(file_path, "r", encoding="utf-8") as f:
            self.text = f.read()
        self.tokens = self.tokenizer.encode(self.text)

    def __len__(self):
        # A window needs one extra token after it for the shifted labels, so
        # only len(tokens) - 1 tokens can start full-length samples. Without
        # the "- 1", a token count that is an exact multiple of seq_length
        # yields a final sample whose labels are one token short.
        return max(0, (len(self.tokens) - 1) // self.seq_length)

    def __getitem__(self, idx):
        start = idx * self.seq_length
        end = start + self.seq_length
        input_ids = self.tokens[start:end]
        # Labels are the inputs shifted by one position
        labels = self.tokens[start + 1 : end + 1]
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

# Tokenize "input.txt" into SEQ_LENGTH-token samples and serve them in
# shuffled batches of BATCH_SIZE
dataset = TextDataset("input.txt", tokenizer, SEQ_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Function to generate text
def generate_text(model, tokenizer, prompt="", max_length=50):
    """
    Generate text using the model.

    The prompt is placed on the same device as the model's parameters. An
    empty prompt is replaced by a single start token (BOS, falling back to
    EOS) because the model needs at least one token to predict from. The
    model's train/eval mode is restored before returning.

    Args:
        model (TransformerModel): The trained model.
        tokenizer: The tokenizer used to encode/decode text.
        prompt (str): The initial prompt for text generation.
        max_length (int): Maximum length of the generated sequence in tokens
            (prompt included).

    Returns:
        str: The generated text.

    Raises:
        ValueError: If the prompt is empty and the tokenizer defines neither
            a BOS nor an EOS token.
    """
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # An empty prompt yields a zero-length sequence; seed it with a start token
    if input_ids.numel() == 0:
        start_id = tokenizer.bos_token_id
        if start_id is None:
            start_id = tokenizer.eos_token_id
        if start_id is None:
            raise ValueError("Cannot generate from an empty prompt: tokenizer has no BOS/EOS token")
        input_ids = torch.tensor([[start_id]], dtype=torch.long)

    # Follow the model's own device instead of relying on a global
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    # Generate text, then put the model back into the mode it was in
    was_training = model.training
    try:
        output_ids = model.generate(input_ids, max_length=max_length, do_sample=True)
    finally:
        model.train(was_training)

    # Decode the generated text
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Training loop
def train(initial_step=0, total_steps=5000, checkpoint_path=None):
    """
    Train the global ``model`` until ``total_steps`` is reached, then save a checkpoint.

    Uses the notebook-level ``model``, ``optimizer``, ``scaler``, ``loss_fn``,
    ``dataloader``, ``tokenizer`` and ``device`` objects.

    Args:
        initial_step (int): Step counter to start from when no checkpoint is given.
        total_steps (int): Step count at which training stops.
        checkpoint_path (str | None): Optional checkpoint to resume from; its
            stored ``global_step`` replaces ``initial_step``.

    Raises:
        ValueError: If training is requested but the dataloader has no batches.
    """
    model.train()
    global_step = initial_step

    # Load checkpoint if provided (tensors are mapped onto the active device)
    if checkpoint_path:
        state = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        global_step = state["global_step"]
        print(f"Loaded checkpoint from {checkpoint_path} at step {global_step}")

    # An empty dataloader would make the outer loop below spin forever
    if global_step < total_steps and len(dataloader) == 0:
        raise ValueError("dataloader yields no batches; cannot train")

    loss = None  # stays None when no training step runs
    optimizer.zero_grad()  # discard gradients left over from an earlier run

    while global_step < total_steps:
        for batch_idx, (input_ids, labels) in enumerate(dataloader):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # Forward pass with mixed precision
            with autocast():
                outputs = model(input_ids)
                loss = loss_fn(outputs.view(-1, outputs.size(-1)), labels.view(-1))

            # Backward pass with gradient scaling; dividing keeps the summed
            # gradient an average over the accumulated micro-batches
            scaler.scale(loss / GRADIENT_ACCUMULATION_STEPS).backward()

            # Gradient accumulation
            if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Logging
            if global_step % 10 == 0:
                print(f"Step {global_step}, Loss: {loss.item()}")

            # Generate text every PREDICTION_INTERVAL steps
            if global_step % PREDICTION_INTERVAL == 0:
                generated_text = generate_text(model, tokenizer, prompt="Once upon a time")
                print(f"Step {global_step}, Generated Text: {generated_text}")
                # Generation may switch the model to eval mode; switch back
                model.train()

            global_step += 1

            # Stop after the specified number of training steps
            if global_step >= total_steps:
                break

    # Save checkpoint at the end of training
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{global_step}.pt")
    torch.save(
        {
            "global_step": global_step,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": loss.item() if loss is not None else None,
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved at {checkpoint_path}")

# Create the checkpoint directory if it doesn't exist
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Phase 1: train from step 0 up to TRAIN_STEPS_PHASE_1; train() saves
# checkpoint_<step>.pt into CHECKPOINT_DIR when it finishes
print("Starting Phase 1: Training for 5000 steps")
train(initial_step=0, total_steps=TRAIN_STEPS_PHASE_1)

# Phase 2: resume from the Phase 1 checkpoint and train for
# TRAIN_STEPS_PHASE_2 additional steps
print("Starting Phase 2: Loading checkpoint and training for 50 more steps")
checkpoint_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{TRAIN_STEPS_PHASE_1}.pt")
train(initial_step=TRAIN_STEPS_PHASE_1, total_steps=TRAIN_STEPS_PHASE_1 + TRAIN_STEPS_PHASE_2, checkpoint_path=checkpoint_path)

  scaler = GradScaler()


Starting Phase 1: Training for 5000 steps


  with autocast():


Step 0, Loss: 300.6046142578125




Step 0, Generated Text: Once upon a time time time time time time time time time time time time spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect spect
Step 10, Loss: 79.6866455078125
Step 20, Loss: 51.16653823852539
Step 30, Loss: 44.327701568603516
Step 40, Loss: 27.66715431213379
Step 50, Loss: 20.721342086791992
Step 60, Loss: 15.905719757080078
Step 70, Loss: 11.343408584594727
Step 80, Loss: 10.688121795654297
Step 90, Loss: 9.59188461303711
Step 100, Loss: 10.082836151123047
Step 110, Loss: 10.10753345489502
Step 120, Loss: 9.449414253234863
Step 130, Loss: 8.042485237121582
Step 140, Loss: 7.5388641357421875
Step 150, Loss: 9.793695449829102
Step 160, Loss: 9.59386920928955
Step 170, Loss: 9.547626495361328
Step 180, Loss: 9.427801132202148
Step 190, Loss: 7.332798957824707
Step 200, Loss: 8.496773719787598
Step 210, Loss: 8.8

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 1536 n 5 k 576 mat1_ld 576 mat2_ld 576 result_ld 1536 abcType 0 computeType 68 scaleType 0

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
# from model import create_model_from_config
import json
import os
from torch.cuda.amp import GradScaler, autocast

# Configuration
CONFIG_FILE = "config_smollm2_135M.json"
CHECKPOINT_DIR = "checkpoints"
BATCH_SIZE = 2  # Further reduced batch size
SEQ_LENGTH = 512  # Further reduced sequence length
GRADIENT_ACCUMULATION_STEPS = 1
PREDICTION_INTERVAL = 500
TRAIN_STEPS_PHASE_1 = 5000
TRAIN_STEPS_PHASE_2 = 50

# Load the configuration
with open(CONFIG_FILE, "r") as f:
    config = json.load(f)

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"]["tokenizer_name_or_path"])
print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Model vocab size:", config["model"]["model_config"]["vocab_size"])

# Ensure tokenizer vocab size matches model vocab size
assert tokenizer.vocab_size == config["model"]["model_config"]["vocab_size"], "Tokenizer vocab size does not match model vocab size"

# Fill in pad_token_id when the config has no usable value. A key that is
# present but null counts as missing too, which a plain `not in` test would
# not detect.
if config["model"]["model_config"].get("pad_token_id") is None:
    config["model"]["model_config"]["pad_token_id"] = tokenizer.pad_token_id

# Create the model
model = create_model_from_config(config).to(device)

# Define the optimizer with a lower learning rate than the configured one
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,  # Reduced learning rate
    betas=(
        config["optimizer"]["optimizer_factory"]["adam_beta1"],
        config["optimizer"]["optimizer_factory"]["adam_beta2"],
    ),
    eps=config["optimizer"]["optimizer_factory"]["adam_eps"],
    weight_decay=config["optimizer"]["weight_decay"],
)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Initialize GradScaler for mixed precision
scaler = GradScaler()

# Dataset class
class TextDataset(Dataset):
    """
    Next-token-prediction dataset built from a single text file.

    The whole file is tokenized once and cut into consecutive windows of
    ``seq_length`` tokens. Each sample is ``(input_ids, labels)`` where
    ``labels`` is the same window shifted one token to the right.
    """
    def __init__(self, file_path, tokenizer, seq_length):
        """
        Args:
            file_path: Path to a UTF-8 text file.
            tokenizer: Object whose ``encode(text)`` returns a sequence of token IDs.
            seq_length (int): Number of tokens per sample.
        """
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        with open(file_path, "r", encoding="utf-8") as f:
            self.text = f.read()
        self.tokens = self.tokenizer.encode(self.text)

    def __len__(self):
        # A window needs one extra token after it for the shifted labels, so
        # only len(tokens) - 1 tokens can start full-length samples. Without
        # the "- 1", a token count that is an exact multiple of seq_length
        # yields a final sample whose labels are one token short.
        return max(0, (len(self.tokens) - 1) // self.seq_length)

    def __getitem__(self, idx):
        start = idx * self.seq_length
        end = start + self.seq_length
        input_ids = self.tokens[start:end]
        # Labels are the inputs shifted by one position
        labels = self.tokens[start + 1 : end + 1]
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

# Tokenize "input.txt" into SEQ_LENGTH-token samples and serve them in
# shuffled batches of BATCH_SIZE
dataset = TextDataset("input.txt", tokenizer, SEQ_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Function to generate text
def generate_text(model, tokenizer, prompt="", max_length=50):
    """
    Generate text using the model.

    The prompt is placed on the same device as the model's parameters. An
    empty prompt is replaced by a single start token (BOS, falling back to
    EOS) because the model needs at least one token to predict from. The
    model's train/eval mode is restored before returning.

    Args:
        model (TransformerModel): The trained model.
        tokenizer: The tokenizer used to encode/decode text.
        prompt (str): The initial prompt for text generation.
        max_length (int): Maximum length of the generated sequence in tokens
            (prompt included).

    Returns:
        str: The generated text.

    Raises:
        ValueError: If the prompt is empty and the tokenizer defines neither
            a BOS nor an EOS token.
    """
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # An empty prompt yields a zero-length sequence; seed it with a start token
    if input_ids.numel() == 0:
        start_id = tokenizer.bos_token_id
        if start_id is None:
            start_id = tokenizer.eos_token_id
        if start_id is None:
            raise ValueError("Cannot generate from an empty prompt: tokenizer has no BOS/EOS token")
        input_ids = torch.tensor([[start_id]], dtype=torch.long)

    # Follow the model's own device instead of relying on a global
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    # Generate text, then put the model back into the mode it was in
    was_training = model.training
    try:
        output_ids = model.generate(input_ids, max_length=max_length, do_sample=True)
    finally:
        model.train(was_training)

    # Decode the generated text
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Training loop
def train(initial_step=0, total_steps=5000, checkpoint_path=None):
    """
    Train the global ``model`` until ``total_steps`` is reached, then save a checkpoint.

    Uses the notebook-level ``model``, ``optimizer``, ``scaler``, ``loss_fn``,
    ``dataloader``, ``tokenizer`` and ``device`` objects. Training stops
    early, without saving a checkpoint, if the loss becomes NaN or infinite.

    Args:
        initial_step (int): Step counter to start from when no checkpoint is given.
        total_steps (int): Step count at which training stops.
        checkpoint_path (str | None): Optional checkpoint to resume from; its
            stored ``global_step`` replaces ``initial_step``.

    Raises:
        ValueError: If training is requested but the dataloader has no
            batches, or if a batch contains token IDs outside the vocabulary.
    """
    model.train()
    global_step = initial_step

    # Load checkpoint if provided (tensors are mapped onto the active device)
    if checkpoint_path:
        state = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        global_step = state["global_step"]
        print(f"Loaded checkpoint from {checkpoint_path} at step {global_step}")

    # An empty dataloader would make the outer loop below spin forever
    if global_step < total_steps and len(dataloader) == 0:
        raise ValueError("dataloader yields no batches; cannot train")

    loss = None  # stays None when no training step runs
    optimizer.zero_grad()  # discard gradients left over from an earlier run

    while global_step < total_steps:
        for batch_idx, (input_ids, labels) in enumerate(dataloader):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # Debugging: Check token IDs
            if torch.any(input_ids < 0) or torch.any(input_ids >= model.vocab_size):
                raise ValueError(f"Invalid token IDs detected: {input_ids}")

            # Debugging: Check shapes
            assert input_ids.shape == labels.shape, "Mismatched shapes between input_ids and labels"

            # Forward pass with mixed precision
            with autocast():
                outputs = model(input_ids)
                loss = loss_fn(outputs.view(-1, outputs.size(-1)), labels.view(-1))

            # Check for nan or inf in loss
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print("Loss is nan or inf. Stopping training.")
                return

            # Backward pass with gradient scaling; dividing keeps the summed
            # gradient an average over the accumulated micro-batches
            scaler.scale(loss / GRADIENT_ACCUMULATION_STEPS).backward()

            # Optimizer step once per accumulation window
            if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                # Unscale exactly once per optimizer step, right before
                # clipping: the scaler rejects a second unscale_() call
                # before update(), which per-micro-batch unscaling would
                # trigger as soon as GRADIENT_ACCUMULATION_STEPS > 1
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Logging
            if global_step % 10 == 0:
                print(f"Step {global_step}, Loss: {loss.item()}")

            # Generate text every PREDICTION_INTERVAL steps
            if global_step % PREDICTION_INTERVAL == 0:
                generated_text = generate_text(model, tokenizer, prompt="Once upon a time")
                print(f"Step {global_step}, Generated Text: {generated_text}")
                # Generation may switch the model to eval mode; switch back
                model.train()

            global_step += 1

            # Stop after the specified number of training steps
            if global_step >= total_steps:
                break

    # Save checkpoint at the end of training
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{global_step}.pt")
    torch.save(
        {
            "global_step": global_step,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": loss.item() if loss is not None else None,
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved at {checkpoint_path}")

# Create the checkpoint directory if it doesn't exist
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Phase 1: train from step 0 up to TRAIN_STEPS_PHASE_1; train() saves
# checkpoint_<step>.pt into CHECKPOINT_DIR when it runs to completion
print("Starting Phase 1: Training for 5000 steps")
train(initial_step=0, total_steps=TRAIN_STEPS_PHASE_1)

# Phase 2: resume from the Phase 1 checkpoint and train for
# TRAIN_STEPS_PHASE_2 additional steps.
# NOTE(review): train() returns without saving when the loss is NaN/inf, in
# which case the checkpoint file expected here will not exist.
print("Starting Phase 2: Loading checkpoint and training for 50 more steps")
checkpoint_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{TRAIN_STEPS_PHASE_1}.pt")
train(initial_step=TRAIN_STEPS_PHASE_1, total_steps=TRAIN_STEPS_PHASE_1 + TRAIN_STEPS_PHASE_2, checkpoint_path=checkpoint_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

Tokenizer vocab size: 49152
Model vocab size: 49152


  scaler = GradScaler()


Starting Phase 1: Training for 5000 steps


  with autocast():
  return fn(*args, **kwargs)


Step 0, Loss: 298.673095703125




Step 0, Generated Text: Once upon a time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time
Step 10, Loss: 161.7198028564453
Step 20, Loss: 53.27239227294922
Step 30, Loss: 48.80675506591797
Step 40, Loss: 40.90314483642578
Step 50, Loss: 36.19815444946289
Step 60, Loss: 39.507850646972656
Step 70, Loss: 34.20205307006836
Step 80, Loss: 33.852439880371094
Step 90, Loss: 32.16545104980469
Step 100, Loss: 29.291702270507812
Step 110, Loss: 29.522321701049805
Step 120, Loss: 29.088640213012695
Step 130, Loss: 27.521713256835938
Step 140, Loss: 24.323755264282227
Step 150, Loss: 26.249013900756836
Step 160, Loss: 26.47113609313965
Step 170, Loss: 25.80221939086914
Step 180, Loss: 26.118669509887695
Step 190, Loss: 26.004623413085938
Step 200, Loss: 24.777692794799805
Step 210, Loss: 24.28512954711914
Step 220, Los

  checkpoint = torch.load(checkpoint_path)


Loaded checkpoint from checkpoints/checkpoint_5000.pt at step 5000
Step 5000, Loss: 0.07633372396230698
Step 5000, Generated Text: Once upon a time,
 loved theeylon, my their superintendent?
As kept aly and a provided, and have leave, for the concerns,
A Boundaries, a sw swungung justlyed by the state state state state state
Step 5010, Loss: 0.06234195455908775
Step 5020, Loss: 0.1018715649843216
Step 5030, Loss: 0.06651565432548523
Step 5040, Loss: 0.17842015624046326
Checkpoint saved at checkpoints/checkpoint_5050.pt


In [None]:
from google.colab import files


In [None]:
# Ask Colab to download the step-5000 checkpoint file to the local machine.
# Relies on `files` having been imported from google.colab in the previous cell.
files.download('/content/checkpoints/checkpoint_5000.pt')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Ask Colab to download the step-5050 checkpoint file to the local machine.
# Relies on `files` having been imported from google.colab in an earlier cell.
files.download('/content/checkpoints/checkpoint_5050.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bundle the whole checkpoints directory into a single archive in the working
# directory. As the output below shows, entries are stored with the
# "content/checkpoints/" prefix, so that prefix reappears on extraction.
!zip -r checkpoint_files.zip /content/checkpoints

  adding: content/checkpoints/ (stored 0%)
  adding: content/checkpoints/checkpoint_5050.pt (deflated 10%)
  adding: content/checkpoints/checkpoint_5000.pt (deflated 10%)


In [None]:
# Download the archive created by the zip cell; the relative path is resolved
# against the notebook's current working directory.
from google.colab import files
files.download('checkpoint_files.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Mount Google Drive at /content/drive so the checkpoint archive can be copied
# into it by the next cell.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Copy the checkpoint archive into the root of the mounted Drive (MyDrive).
# Requires the drive.mount cell above to have run in this session.
!cp /content/checkpoint_files.zip /content/drive/MyDrive/


# Getting the checkpoint archive from Google Drive for quantization

In [None]:
# Mount Google Drive at /content/drive so checkpoint_files.zip can be read
# back from MyDrive by the extraction cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unpack the checkpoint archive stored on Google Drive into the Colab VM.
import zipfile
import os

# Path to the zip file in Google Drive
zip_path = "/content/drive/MyDrive/checkpoint_files.zip"

# Directory to extract the contents
# The archive entries carry a "content/checkpoints/" prefix (see the zip cell's
# output), so the .pt files end up under /content/checkpoint/content/checkpoints.
extract_dir = "/content/checkpoint"

# Create the extraction directory
os.makedirs(extract_dir, exist_ok=True)

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction complete!")

Extraction complete!


# Now load the model from the extracted checkpoint

In [None]:
import torch
# from model import TransformerModel  # Replace with your model class

# Path to the checkpoint file
# The nested "content/checkpoints" part comes from the path prefix stored in
# the zip archive. `os` is not imported in this cell; it comes from the
# extraction cell above.
checkpoint_path = os.path.join("/content/checkpoint/content/checkpoints", "checkpoint_5050.pt")

# Load the model architecture
# model = TransformerModel(...)  # Initialize with the same architecture
# NOTE(review): the construction above is commented out, so `model` must
# already exist in the kernel (created by an earlier cell) with a matching
# architecture. Confirm this cell survives Restart & Run All.

# Load the checkpoint
# map_location="cpu" loads the saved tensors onto the CPU.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
# Only the "model_state_dict" entry is used here; other entries are ignored.
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()  # Set the model to evaluation mode

  checkpoint = torch.load(checkpoint_path, map_location="cpu")


TransformerModel(
  (embed_tokens): Embedding(49152, 576)
  (embed_positions): Embedding(2048, 576)
  (layers): ModuleList(
    (0-29): 30 x TransformerBlock(
      (q_proj): Linear(in_features=576, out_features=576, bias=True)
      (k_proj): Linear(in_features=576, out_features=192, bias=True)
      (v_proj): Linear(in_features=576, out_features=192, bias=True)
      (o_proj): Linear(in_features=576, out_features=576, bias=True)
      (gate_proj): Linear(in_features=576, out_features=1536, bias=True)
      (up_proj): Linear(in_features=576, out_features=1536, bias=True)
      (down_proj): Linear(in_features=1536, out_features=576, bias=True)
      (input_norm): RMSNorm()
      (post_attention_norm): RMSNorm()
      (act): SiLU()
      (rope): RotaryPositionalEmbedding()
    )
  )
  (final_norm): RMSNorm()
  (lm_head): Linear(in_features=576, out_features=49152, bias=False)
)

# New: dynamic INT8 quantization of the linear layers

In [None]:
import torch.nn as nn
import torch.quantization

# Dynamically quantize `model`, which must already be loaded by an earlier
# cell; the result is bound to a new name, `quantized_model`.
# Specify which layers to quantize. "nn.Linear" is common.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {nn.Linear},  # This can be a set of layer types to quantize
    dtype=torch.qint8  # 8-bit integer quantization
)

# Now `quantized_model` is quantized
# torch.save is given the module object itself (not a state_dict), so the whole
# model is pickled. NOTE(review): loading this file elsewhere presumably needs
# the model's class definitions importable - confirm for the deployment target.
torch.save(quantized_model, "quantized_model.pt")
print("Quantized model saved to 'quantized_model.pt'")

Quantized model saved to 'quantized_model.pt'


# Move the INT8-quantized model to Google Drive

In [None]:
# Copy the pickled INT8 model from the Colab VM to Google Drive.
# Requires Drive to be mounted at /content/drive in this session.
import shutil

# Define the source and destination paths
source_path = "/content/quantized_model.pt"
destination_path = "/content/drive/MyDrive/quantized_model.pt"  # Save it to the root of your Google Drive

# Copy the file
# shutil.copy replaces any existing file at the destination path.
shutil.copy(source_path, destination_path)

print(f"Model checkpoint copied to Google Drive: {destination_path}")

Model checkpoint copied to Google Drive: /content/drive/MyDrive/quantized_model.pt


# OLD: float16 conversion of the model weights

In [None]:
# Quantize the model to float16
# NOTE: .half() is called on the shared `model` object without rebinding, so
# the conversion happens in place and every later cell that uses `model` sees
# float16 weights. This is a precision cast, not integer quantization.
model.half()  # Convert model weights to float16

# Save the quantized model
# Only the state_dict is written, next to the extracted checkpoints.
# `os` and `torch` are expected to be imported by earlier cells.
quantized_checkpoint_path = os.path.join("/content/checkpoint/content/checkpoints", "checkpoint_quantized.pt")
torch.save(model.state_dict(), quantized_checkpoint_path)

print("Quantization complete! Quantized model saved at:", quantized_checkpoint_path)

Quantization complete! Quantized model saved at: /content/checkpoint/content/checkpoints/checkpoint_quantized.pt


# Move the quantized model from Google Colab to Google Drive

In [None]:
# Copy the float16 state_dict checkpoint from the Colab VM to Google Drive.
# Requires Drive to be mounted at /content/drive in this session.
import shutil

# Define the source and destination paths
source_path = "/content/checkpoint/content/checkpoints/checkpoint_quantized.pt"
destination_path = "/content/drive/MyDrive/checkpoint_quantized.pt"  # Save it to the root of your Google Drive

# Copy the file
# shutil.copy replaces any existing file at the destination path.
shutil.copy(source_path, destination_path)

print(f"Model checkpoint copied to Google Drive: {destination_path}")

Model checkpoint copied to Google Drive: /content/drive/MyDrive/checkpoint_quantized.pt


# Updated code: two-phase training with FP16 inference checkpoint export

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import json
import os
from torch.cuda.amp import GradScaler, autocast
import torch.nn.functional as F

# --------------------------------------------------------------------------
# You must ensure that create_model_from_config is imported from your model.py
# or defined here. For example:
#
# from model import create_model_from_config
#
# If you prefer to define it inline, uncomment below (and ensure model.py is not conflicting):
#
# from model import TransformerModel
# def create_model_from_config(config):
#     model_config = config["model"]["model_config"]
#     return TransformerModel(
#         vocab_size=model_config["vocab_size"],
#         hidden_size=model_config["hidden_size"],
#         num_hidden_layers=model_config["num_hidden_layers"],
#         num_attention_heads=model_config["num_attention_heads"],
#         intermediate_size=model_config["intermediate_size"],
#         num_key_value_heads=model_config["num_key_value_heads"],
#         max_position_embeddings=model_config["max_position_embeddings"],
#         rms_norm_eps=model_config["rms_norm_eps"],
#         hidden_act=model_config["hidden_act"],
#         tie_word_embeddings=model_config["tie_word_embeddings"],
#     )
#
# --------------------------------------------------------------------------

# Configuration
# All tunable values for this training cell live here.
CONFIG_FILE = "config_smollm2_135M.json"
CHECKPOINT_DIR = "checkpoints"
BATCH_SIZE = 2         # Reduced batch size
SEQ_LENGTH = 512       # Reduced sequence length
# Number of micro-batches between optimizer steps (1 = step on every batch).
GRADIENT_ACCUMULATION_STEPS = 1
# A sample generation is printed every PREDICTION_INTERVAL training steps.
PREDICTION_INTERVAL = 500
# Phase 1 trains from scratch; phase 2 resumes from the phase 1 checkpoint.
TRAIN_STEPS_PHASE_1 = 5000
TRAIN_STEPS_PHASE_2 = 50

# Load the configuration
# The JSON file is resolved relative to the current working directory.
with open(CONFIG_FILE, "r") as f:
    config = json.load(f)

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"]["tokenizer_name_or_path"])
print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Model vocab size:", config["model"]["model_config"]["vocab_size"])

# Ensure tokenizer vocab size matches model vocab size
# Fails fast here rather than with an indexing error inside the model.
assert tokenizer.vocab_size == config["model"]["model_config"]["vocab_size"], \
    "Tokenizer vocab size does not match model vocab size"

# Add pad_token_id to model config if missing
# NOTE(review): tokenizer.pad_token_id may be None if the tokenizer defines no
# pad token - confirm the model factory accepts that.
if "pad_token_id" not in config["model"]["model_config"]:
    config["model"]["model_config"]["pad_token_id"] = tokenizer.pad_token_id

# --------------------------------------------------------------------------
# Create/Load your model from config
# (Uncomment or replace with your own import)
# --------------------------------------------------------------------------
# from model import create_model_from_config
# NOTE(review): the import above is commented out, so create_model_from_config
# must already be defined in the kernel by an earlier cell.
model = create_model_from_config(config).to(device)
# --------------------------------------------------------------------------

# Define the optimizer with a lower learning rate
# The learning rate is hard-coded here; betas, eps and weight decay are read
# from the loaded config.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,  # Reduced learning rate
    betas=(
        config["optimizer"]["optimizer_factory"]["adam_beta1"],
        config["optimizer"]["optimizer_factory"]["adam_beta2"],
    ),
    eps=config["optimizer"]["optimizer_factory"]["adam_eps"],
    weight_decay=config["optimizer"]["weight_decay"],
)

# Define the loss function
# Default CrossEntropyLoss: mean over all target positions, no ignore_index set.
loss_fn = torch.nn.CrossEntropyLoss()

# Initialize GradScaler for mixed precision
# NOTE(review): the cell output echoes this line as a warning source -
# presumably the torch.cuda.amp deprecation FutureWarning; confirm against the
# installed torch version.
scaler = GradScaler()

# Simple text dataset
class TextDataset(Dataset):
    """Fixed-length next-token-prediction samples built from one text file.

    The whole file is read and tokenized once at construction time. Sample
    ``idx`` is a pair ``(input_ids, labels)`` of ``torch.long`` tensors, both
    of length ``seq_length``, where ``labels`` is the input window shifted
    one token to the right.

    Args:
        file_path: Path of the UTF-8 text file to read.
        tokenizer: Object whose ``encode(text)`` returns a sequence of token ids.
        seq_length: Number of tokens per sample; must be positive.

    Raises:
        ValueError: If ``seq_length`` is not positive.
    """

    def __init__(self, file_path, tokenizer, seq_length):
        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        with open(file_path, "r", encoding="utf-8") as f:
            self.text = f.read()
        self.tokens = self.tokenizer.encode(self.text)

    def __len__(self):
        # A sample needs seq_length + 1 tokens: seq_length inputs plus the
        # label for the final input position. Counting with len(tokens) alone
        # yields a last sample whose labels are one token short whenever the
        # token count is an exact multiple of seq_length, which breaks batch
        # collation. Reserving one token for that final label avoids it.
        return max(0, (len(self.tokens) - 1) // self.seq_length)

    def __getitem__(self, idx):
        start = idx * self.seq_length
        end = start + self.seq_length
        input_ids = self.tokens[start:end]
        # Labels are the same window shifted right by one token.
        labels = self.tokens[start + 1 : end + 1]
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

# Load the dataset
# "input.txt" is resolved relative to the current working directory and is
# tokenized in full when the dataset is constructed.
dataset = TextDataset("input.txt", tokenizer, SEQ_LENGTH)
# shuffle=True with no seed set in this cell, so batch order is not fixed
# between runs.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Function to generate text for logging/preview
def generate_text(model, tokenizer, prompt="", max_length=50):
    """
    Generate text using the model.

    The model is switched to eval mode for the duration of the call and then
    returned to whatever mode it was in before, so calling this from inside a
    training loop does not leave the model stuck in eval mode.

    Args:
        model: Module exposing ``generate(input_ids, max_length=..., do_sample=...)``.
        tokenizer: Tokenizer exposing ``encode(prompt, return_tensors="pt")`` and
            ``decode(ids, skip_special_tokens=True)``.
        prompt (str): Text to condition the generation on.
        max_length (int): Forwarded unchanged to ``model.generate``.

    Returns:
        str: The decoded first sequence returned by ``model.generate``.
    """
    was_training = model.training
    model.eval()
    try:
        # Put the prompt on the device the model lives on; fall back to the
        # notebook-level `device` only for a model without parameters.
        first_param = next(model.parameters(), None)
        target_device = first_param.device if first_param is not None else device

        # Encode the prompt
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(target_device)

        # Generate text
        with torch.no_grad():
            output_ids = model.generate(input_ids, max_length=max_length, do_sample=True)
    finally:
        # Restore the caller's mode even if generation raised.
        if was_training:
            model.train()

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Training loop
def train(initial_step=0, total_steps=5000, checkpoint_path=None):
    """
    Train the notebook-level ``model`` until ``total_steps`` and save a checkpoint.

    Uses the module-level ``model``, ``optimizer``, ``scaler``, ``loss_fn``,
    ``dataloader``, ``tokenizer`` and ``device``. One "step" is one batch from
    the dataloader; the dataloader is re-iterated as many times as needed.

    Args:
        initial_step (int): Step counter to start from. Ignored when a
            checkpoint is loaded, whose stored ``global_step`` takes precedence.
        total_steps (int): Training stops once the step counter reaches this value.
        checkpoint_path (str | None): Optional checkpoint to resume from; must
            contain ``model_state_dict``, ``optimizer_state_dict`` and
            ``global_step``.

    Returns:
        None. On a NaN/inf loss the function returns early without saving.

    Raises:
        RuntimeError: If the dataloader yields no batches (the loop could
            otherwise never reach ``total_steps``).
    """
    model.train()
    global_step = initial_step

    # Load checkpoint if provided
    if checkpoint_path:
        # map_location keeps resuming possible on a machine whose devices
        # differ from the one that wrote the checkpoint.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        print(f"Loaded checkpoint from {checkpoint_path} at step {global_step}")

    last_loss = None   # most recent loss tensor; stays None if no step runs
    micro_step = 0     # micro-batches seen; persists across epochs
    optimizer.zero_grad()

    while global_step < total_steps:
        batches_seen = 0
        for input_ids, labels in dataloader:
            batches_seen += 1
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # Forward pass with mixed precision
            with autocast():
                outputs = model(input_ids)
                loss = loss_fn(outputs.view(-1, outputs.size(-1)), labels.view(-1))

            # Check for NaN or inf
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print("Loss is nan or inf. Stopping training.")
                return

            last_loss = loss.detach()

            # Backward pass with gradient scaling. Dividing by the accumulation
            # count makes the accumulated gradient the mean over the window
            # (a no-op when GRADIENT_ACCUMULATION_STEPS == 1).
            scaler.scale(loss / GRADIENT_ACCUMULATION_STEPS).backward()

            # Gradient accumulation: unscale, clip and step only once per
            # window. GradScaler.unscale_ may be called at most once per
            # optimizer step, so it cannot run on every micro-batch.
            micro_step += 1
            if micro_step % GRADIENT_ACCUMULATION_STEPS == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Logging
            if global_step % 10 == 0:
                print(f"Step {global_step}, Loss: {last_loss.item()}")

            # Generate text every PREDICTION_INTERVAL steps
            if global_step % PREDICTION_INTERVAL == 0:
                gen_txt = generate_text(model, tokenizer, prompt="Once upon a time")
                print(f"Step {global_step}, Generated Text: {gen_txt}")
                # Generation switches the model to eval mode; go back to
                # training mode before the next batch.
                model.train()

            global_step += 1
            if global_step >= total_steps:
                break

        if batches_seen == 0:
            # Without this guard an empty dataloader spins the while-loop forever.
            raise RuntimeError("dataloader produced no batches; cannot reach total_steps")

    # Save a full checkpoint (includes optimizer) for possible further training
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{global_step}.pt")
    torch.save(
        {
            "global_step": global_step,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            # None when the loop body never ran (e.g. resumed at/after total_steps).
            "loss": last_loss.item() if last_loss is not None else None,
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved at {checkpoint_path}")

# Create the checkpoint directory if it doesn't exist
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Phase 1: Train for 5000 steps
# train() writes checkpoint_<global_step>.pt on completion, i.e.
# checkpoint_5000.pt after this call.
print("Starting Phase 1: Training for 5000 steps")
train(initial_step=0, total_steps=TRAIN_STEPS_PHASE_1)

# Phase 2: Load the checkpoint from Phase 1 and train 50 more steps
# When a checkpoint is given, train() takes the step counter from the
# checkpoint's "global_step"; initial_step is passed here for clarity only.
print("Starting Phase 2: Loading checkpoint and training for 50 more steps")
phase1_ckpt = os.path.join(CHECKPOINT_DIR, f"checkpoint_{TRAIN_STEPS_PHASE_1}.pt")
train(
    initial_step=TRAIN_STEPS_PHASE_1,
    total_steps=TRAIN_STEPS_PHASE_1 + TRAIN_STEPS_PHASE_2,
    checkpoint_path=phase1_ckpt
)

# ------------------------------------------------------------------------------
# FINAL STEP: Create a smaller inference checkpoint in FP16, omitting optimizer
# ------------------------------------------------------------------------------
# The final training checkpoint is re-read from disk (onto the CPU) rather than
# taken from the live `model`, so this step does not touch the model in memory.
final_ckpt_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{TRAIN_STEPS_PHASE_1 + TRAIN_STEPS_PHASE_2}.pt")
print(f"\nLoading final training checkpoint: {final_ckpt_path}")
checkpoint = torch.load(final_ckpt_path, map_location="cpu")

fp32_state_dict = checkpoint["model_state_dict"]

# Convert all float32 params to float16
# Entries of any other dtype are copied through unchanged.
half_state_dict = {}
for name, param in fp32_state_dict.items():
    if param.dtype == torch.float32:
        half_state_dict[name] = param.half()
    else:
        half_state_dict[name] = param  # keep other dtypes as is

# Build the smaller checkpoint for inference
# Same "global_step" / "model_state_dict" keys as the training checkpoint, so
# loaders that read only those keys keep working.
inference_checkpoint = {
    "global_step": checkpoint["global_step"],
    "model_state_dict": half_state_dict
    # optimizer_state_dict is omitted
}

inference_ckpt_path = os.path.join(CHECKPOINT_DIR, "model_weights_fp16.pt")
torch.save(inference_checkpoint, inference_ckpt_path)
print(f"Inference checkpoint (FP16) saved at {inference_ckpt_path}")

# NOTE(review): the size quoted in the message below is hard-coded, not measured.
print("Done! You can upload 'model_weights_fp16.pt' (well under 1 GB) to Spaces.")


Tokenizer vocab size: 49152
Model vocab size: 49152


  scaler = GradScaler()


Starting Phase 1: Training for 5000 steps


  with autocast():
  return fn(*args, **kwargs)


Step 0, Loss: 297.957763671875




Step 0, Generated Text: Once upon a time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time time
Step 10, Loss: 157.36114501953125
Step 20, Loss: 58.85614776611328
Step 30, Loss: 46.871978759765625
Step 40, Loss: 39.91062927246094
Step 50, Loss: 39.27462387084961
Step 60, Loss: 35.252784729003906
Step 70, Loss: 34.14326858520508
Step 80, Loss: 33.6908073425293
Step 90, Loss: 30.927204132080078
Step 100, Loss: 29.71237564086914
Step 110, Loss: 30.40058135986328
Step 120, Loss: 29.440732955932617
Step 130, Loss: 29.94432258605957
Step 140, Loss: 27.920412063598633
Step 150, Loss: 27.681842803955078
Step 160, Loss: 27.285734176635742
Step 170, Loss: 25.474994659423828
Step 180, Loss: 25.483720779418945
Step 190, Loss: 28.43486785888672
Step 200, Loss: 24.9284725189209
Step 210, Loss: 26.502050399780273
Step 220, Loss:

  checkpoint = torch.load(checkpoint_path)


Loaded checkpoint from checkpoints/checkpoint_5000.pt at step 5000
Step 5000, Loss: 0.1158963069319725
Step 5000, Generated Text: Once upon a time time of mine mine; set him the
PRINI will hence, or brother:
Onlinglingierconfirmed must the foul depcchSamuel,
 concerning concerningallomile,awd thinking thinking thinking!


Step 5010, Loss: 0.12078849971294403
Step 5020, Loss: 0.09788670390844345
Step 5030, Loss: 0.11753468960523605
Step 5040, Loss: 0.14231787621974945
Checkpoint saved at checkpoints/checkpoint_5050.pt

Loading final training checkpoint: checkpoints/checkpoint_5050.pt


  checkpoint = torch.load(final_ckpt_path, map_location="cpu")


Inference checkpoint (FP16) saved at checkpoints/model_weights_fp16.pt
Done! You can upload 'model_weights_fp16.pt' (well under 1 GB) to Spaces.


# Move the FP16 inference checkpoint to Google Drive

In [None]:
# Mount Google Drive at /content/drive so the FP16 checkpoint can be copied
# into it by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the FP16 inference checkpoint written by the training cell to Google Drive.
# Requires Drive to be mounted at /content/drive in this session.
import shutil

# Define the source and destination paths
source_path = "/content/checkpoints/model_weights_fp16.pt"
destination_path = "/content/drive/MyDrive/model_weights_fp16.pt"  # Save it to the root of your Google Drive

# Copy the file
# shutil.copy replaces any existing file at the destination path.
shutil.copy(source_path, destination_path)

print(f"Model checkpoint copied to Google Drive: {destination_path}")

Model checkpoint copied to Google Drive: /content/drive/MyDrive/model_weights_fp16.pt
