In [1]:
!pip install datasets
!pip install trl
!pip install accelerate
!pip install transformers[torch]
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [2]:
import torch
from transformers import TrainingArguments, MistralForCausalLM, MistralConfig, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer

# Single seed shared by weight initialisation and dataset shuffling.
SEED = 42

# Seed torch before the model is built: the weights are randomly initialised
# right here, and the trainer is only constructed further down, so any seeding
# it performs would come too late to make the initial weights reproducible.
torch.manual_seed(SEED)

# Scaled-down Mistral-style architecture sized for a free Colab GPU.
# Back-of-envelope count from the values below is on the order of 0.2B
# parameters -- TODO confirm with model.num_parameters().
configuration = MistralConfig(
    vocab_size=32000,  # must cover every id the tokenizer can emit (checked below)
    hidden_size=1024,  # Reduced from 2048 to 1024 to work on free colab
    intermediate_size=3584,  # Reduced from 7168 to 3584 to work on free colab
    num_hidden_layers=12,  # Reduced from 24 to 12 to work on free colab
    num_attention_heads=32,
    num_key_value_heads=8,  # fewer KV heads than attention heads (4 query heads per KV head)
    hidden_act="silu",
    max_position_embeddings=4096,
    pad_token_id=2,  # same id as eos; the tokenizer below is told to pad with eos as well
    bos_token_id=1,
    eos_token_id=2,
)

# Fresh, untrained model built from the config (no pretrained weights are loaded).
model = MistralForCausalLM(configuration)

# Only the tokenizer is borrowed from the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token

# Fail early with a readable message instead of an out-of-range embedding
# lookup in the middle of training.
assert len(tokenizer) <= configuration.vocab_size, (
    f"tokenizer has {len(tokenizer)} tokens but the model embeds only "
    f"{configuration.vocab_size}"
)

dataset = load_dataset('HuggingFaceTB/cosmopedia-20k', split="train").shuffle(seed=SEED)
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

def create_prompt_formats(sample):
    """Return the ``text`` entries of a batch as a new list of training strings.

    No prompt template is applied: each entry is passed through unchanged.

    Args:
        sample: Mapping with a ``'text'`` entry holding a sequence of strings.
            Presumably this is a batch handed over by the trainer's dataset
            mapping -- TODO confirm against the installed trl version.

    Returns:
        A new list with the same items, in the same order, as
        ``sample['text']``.
    """
    # list() copies the sequence directly; no index bookkeeping is needed.
    return list(sample['text'])

# Directory that receives the final model weights and tokenizer files.
FINAL_DIR = "M-final"

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    max_seq_length=2048,
    formatting_func=create_prompt_formats,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        warmup_steps=2,
        max_steps=1000,  # Reduced from 10000 to 1000 to work on free colab
        learning_rate=1e-4,
        logging_steps=1,  # log the loss at every step
        output_dir="M_outputs",
        overwrite_output_dir=True,
        save_steps=1000,  # equals max_steps, so a checkpoint is due only at the last step
        optim="paged_adamw_32bit",  # presumably the reason bitsandbytes is installed -- verify
        report_to="none",
    ),
)

trainer.train()

# save_pretrained() has no `dtype` parameter: an extra keyword is swallowed by
# **kwargs and does not change the precision of the saved weights, so it is
# not passed here. The weights are written in whatever dtype the model holds.
# To store half-precision weights, cast the model first (e.g. call
# trainer.model.half()) and then save.
trainer.model.save_pretrained(FINAL_DIR)
trainer.tokenizer.save_pretrained(FINAL_DIR)


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/470 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Number of prompts: 20000
Column names are: ['prompt', 'text_token_length', 'text', 'seed_data', 'format', 'audience']


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,10.5552
2,10.5955
3,10.3024
4,9.8721
5,9.4902
6,9.4157
7,9.414
8,9.0233
9,8.9977
10,8.7832


('M-final/tokenizer_config.json',
 'M-final/special_tokens_map.json',
 'M-final/tokenizer.model',
 'M-final/added_tokens.json',
 'M-final/tokenizer.json')