In [1]:
%pip install -q -U datasets trl transformers bitsandbytes wandb accelerate peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset
from typing import Dict, List
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

## Load Dataset and Process Dataset


In [3]:
dataset = load_dataset("mlabonne/orpo-dpo-mix-40k", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/127M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44245 [00:00<?, ? examples/s]

In [4]:
def return_prompt_and_responses(samples):
    return {
        "prompt": [
            "Question: " + question + "\n\nAnswer: "
            for question in samples["prompt"]
        ],
        "chosen": samples["chosen"],   # rated better than k
        "rejected": samples["rejected"], # rated worse than j
    }

original_columns = dataset.column_names

processed_dataset = dataset.map(
    return_prompt_and_responses,
    batched=True,
    remove_columns=original_columns
)


Map:   0%|          | 0/44245 [00:00<?, ? examples/s]

In [None]:
# raw
print(dataset)

# In ppo format
print(processed_dataset)

Dataset({
    features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
    num_rows: 44245
})
Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 44245
})


## Load Model

In [5]:
# login into hugging face with a token (READ_ACCESS permissions) if the model to be used requires permissions
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `fsfdf` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `fsfdf`


In [6]:
## constants and configurations

# Hardcoded parameters
MODEL_NAME_1 = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


In [7]:
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME_1,
    quantization_config=None
)


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# disable cache if you want to use gradient checking when training
base_model.config.use_cache = False

In [9]:
# load and pad tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_1, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '<PAD>'})


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

1

In [None]:
# add lora configuration
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

base_model.add_adapter(lora_config, adapter_name="adapter_lora")

## Training Setup and Conifg


In [10]:
# formatting function
def formatting_func(example):
    text = f"### USER: {example['prompt'][0]}\n### ASSISTANT: {example['chosen'][1]}"
    return text

In [11]:
# login into hugging face with a token (WRITE_ACCESS permissions) if you're going to push your model to hugging face
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `some` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `some`


In [13]:
# configurations for training

# configurations are made for consumer hardware
# taken from: https://pytorch.org/blog/finetune-llms/
# this makes use of QLora optmization

YOUR_HF_USERNAME = "ShivMoh"

output_dir = f"{YOUR_HF_USERNAME}/TinyLlama-1.1B-Chat-v1.0_cpu"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 32 # modify the number of steps to like increase accuracy. There's a accuracy - speed tradeoff here
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True, # if True, disable cache on the model
    push_to_hub=True, # if you want to push to hugging face
)


In [14]:
# configurate SFT trainer for supervised fine tuning
trainer = SFTTrainer(
    model=base_model,
    args=training_arguments,
    train_dataset=processed_dataset,
    packing=True,
    tokenizer=tokenizer,
    max_seq_length=1024,
    formatting_func=formatting_func
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2279 > 2048). Running this sequence through the model will result in indexing errors
max_steps is given, it will override any value given in num_train_epochs


In [1]:
# train
trainer.train()

NameError: name 'trainer' is not defined

# Inference

In [None]:
# login into hugging face with a token (READ_ACCESS permissions) if the model to be used requires permissions
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `fsfdf` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `fsfdf`


In [None]:
model_id = "ShivMoh/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    adapter_kwargs={"revision": "09487e6ffdcc75838b10b6138b6149c36183164e"} # what is revision an how do I specify what to use
)

text = "### USER: Can you explain contrastive learning in machine learning in simple terms for someone new to the field of ML?### Assistant:"

inputs = tokenizer(text, return_tensors="pt").to(0)

ValueError: Unrecognized model in ShivMoh/TinyLlama-1.1B-Chat-v1.0. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, idefics3, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, time_series_transformer, timesformer, timm_backbone, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zoedepth

In [None]:
!pip install -q transformers peft accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "ShivMoh/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True, # Trust remote code for custom models
    torch_dtype=torch.float16, # Set torch dtype for better compatibility
    # revision="main" # Optional: Specify the revision if needed
)

text = "### USER: Can you explain contrastive learning in machine learning in simple terms for someone new to the field of ML?### Assistant:"

inputs = tokenizer(text, return_tensors="pt")

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/25.3M [00:00<?, ?B/s]

{'input_ids': tensor([[    1,   835,  3148,  1001, 29901,  1815,   366,  5649, 12814,   573,
          6509,   297,  4933,  6509,   297,  2560,  4958,   363,  4856,   716,
           304,   278,  1746,   310, 23158, 29973,  2277, 29937,  4007, 22137,
         29901]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}


In [None]:
outputs = model.generate(inputs.input_ids, max_new_tokens=250, do_sample=False)

print("After attaching Lora adapters:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

After attaching Lora adapters:
<s> ### USER: Can you explain contrastive learning in machine learning in simple terms for someone new to the field of ML?### Assistant: Sure! Contrastive learning is a type of machine learning that involves training a model to learn a new task by comparing it to a previously learned task. In contrastive learning, the model is trained to learn a new task by comparing it to a previously learned task. This is done by comparing the input data to the previously learned data, and then comparing the output data to the previously learned data. The goal is to find patterns in the input data that are similar to the previously learned data, and then use these patterns to learn the new task. This process is repeated until the model has learned the new task, and can then be used to make predictions or classify data. ### USER: Can you explain how contrastive learning can be used in real-world applications, such as image recognition or text classification?### Assistant

In [None]:
model.disable_adapters()
outputs = model.generate(inputs.input_ids, max_new_tokens=250, do_sample=False)

print("Before Lora:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

Before Lora:
<s> ### USER: Can you explain contrastive learning in machine learning in simple terms for someone new to the field of ML?### Assistant: Sure, I'd be happy to explain it in simpler terms. Contrastive learning is a technique used in machine learning to improve the performance of a model by learning from examples that are similar to the ones it has already seen. In contrastive learning, the model is trained to compare two examples, one from the training set and one from the test set, and predict which one is more similar to the other. This process is repeated for a large number of examples, and the model learns to identify patterns in the data that are similar to those in the training set. By doing this, the model can learn to identify patterns that are similar to those in the training set, which can improve its performance on new data. In summary, contrastive learning is a technique used in machine learning to improve the performance of a model by learning from examples tha