In [1]:
!pip install transformers datasets accelerate


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset

class ExpertTransformer(nn.Module):
    def __init__(self, config):
        super(ExpertTransformer, self).__init__()
        self.transformer = GPT2LMHeadModel(config)

    def forward(self, input_ids, attention_mask=None, labels=None):
        return self.transformer(input_ids, attention_mask=attention_mask, labels=labels).logits

class GatingNetwork(nn.Module):
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        self.gate = nn.Linear(input_dim, num_experts)

    def forward(self, inputs):
        return torch.softmax(self.gate(inputs), dim=-1)

class MixtureOfExpertsLLM(nn.Module):
    def __init__(self, config, num_experts):
        super(MixtureOfExpertsLLM, self).__init__()
        self.num_experts = num_experts
        self.experts = nn.ModuleList([ExpertTransformer(config) for _ in range(num_experts)])
        self.gating_network = GatingNetwork(config.n_embd, num_experts)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get embeddings from the first layer of the first expert (assuming all experts share the same embeddings)
        embeddings = self.experts[0].transformer.transformer.wte(input_ids)
        gate_scores = self.gating_network(embeddings.mean(dim=1))  # Average embeddings across the sequence length
        expert_outputs = torch.stack([expert(input_ids, attention_mask, labels) for expert in self.experts], dim=1)
        weighted_output = torch.einsum('bij,bjkl->bikl', gate_scores.unsqueeze(-1), expert_outputs)
        logits = weighted_output.sum(dim=1)  # Combine the outputs of the experts

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            return loss, logits

        return logits

# Define model configuration
config = GPT2Config(
    vocab_size=50257,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,  # Hidden size for GPT-2 Small (124M parameters)
    n_layer=12,  # Number of layers for GPT-2 Small
    n_head=12    # Number of attention heads for GPT-2 Small
)

# Initialize Mixture of Experts model
num_experts = 4  # Define number of experts
model = MixtureOfExpertsLLM(config, num_experts)

# Verify the model
input_ids = torch.tensor([model.tokenizer.encode("Hello, how are you?", add_special_tokens=True)])
attention_mask = torch.ones_like(input_ids)
output = model(input_ids, attention_mask=attention_mask)
print(output)


tensor([[[-1.2751, -0.5574,  0.6394,  ..., -0.5980, -0.2913, -0.0681],
         [-0.9400, -0.7675, -0.6831,  ...,  0.4926,  0.6107,  0.1921],
         [-0.6049, -0.8733, -0.3433,  ...,  0.7908, -0.5653,  0.7723],
         [-1.2577, -0.9058,  0.1342,  ...,  0.2771, -1.4552,  0.7181],
         [-0.9952, -0.5265, -1.8748,  ...,  0.1093, -1.1553,  0.7278],
         [-1.2309, -0.6475,  0.8482,  ..., -0.6022, -0.0141, -0.5603]]],
       grad_fn=<SumBackward1>)


In [9]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from huggingface_hub import login

# Authenticate using Hugging Face token
login(token='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')  # Replace with your actual Hugging Face access token

# Load dataset
dataset = load_dataset("nvidia/HelpSteer")

# Tokenize dataset
def concatenate_and_tokenize(examples):
    concatenated_texts = [prompt + " " + response for prompt, response in zip(examples['prompt'], examples['response'])]
    return model.tokenizer(concatenated_texts, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(concatenate_and_tokenize, batched=True, remove_columns=['prompt', 'response', 'helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity'])
tokenized_dataset.set_format("torch")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Set to a low number for demonstration purposes
    per_device_train_batch_size=8,  # Increase batch size if possible
    gradient_accumulation_steps=16,  # Accumulate gradients to simulate a larger batch size
    fp16=True,  # Enable mixed precision training
    save_steps=10_000,
    save_total_limit=2,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=model.tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the pretrained model and tokenizer
torch.save(model.state_dict(), "./gpt2-custom-124m-moe/pytorch_model.bin")
model.tokenizer.save_pretrained("./gpt2-custom-124m-moe")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful




Map:   0%|          | 0/35331 [00:00<?, ? examples/s]

Map:   0%|          | 0/1789 [00:00<?, ? examples/s]

Step,Training Loss


RuntimeError: Parent directory ./gpt2-custom-124m-moe does not exist.

In [10]:

# Save the pretrained model and tokenizer
torch.save(model.state_dict(), "./gpt2-custom-124m-moe/pytorch_model.bin")
model.tokenizer.save_pretrained("./gpt2-custom-124m-moe")


RuntimeError: Parent directory ./gpt2-custom-124m-moe does not exist.