In [None]:
!pip install datasets trl bitsandbytes accelerate

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset, Dataset
import os
import argparse
import re
import sys
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
import torch

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bharatkumar0925/tmdb-movies-clean-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/bharatkumar0925/tmdb-movies-clean-dataset?dataset_version_number=1...


100%|██████████| 255M/255M [00:09<00:00, 27.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/bharatkumar0925/tmdb-movies-clean-dataset/versions/1


In [4]:
mv /root/.cache/kagglehub/datasets/bharatkumar0925/tmdb-movies-clean-dataset/versions/1 ./

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [6]:
model_name = "EleutherAI/pythia-1.4b"

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  trust_remote_code=True,
  cache_dir="",
  quantization_config=bnb_config,
  use_cache=False,
)
model.gradient_checkpointing_disable()

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [7]:
dataset = load_dataset("csv", data_files="/content/1/recommendation-movies/large_movies_data.csv", split="train").shuffle(seed=42).select(range(20))

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
def tokenize_function(examples):
    examples = tokenizer(
        examples["overview"],
        truncation=True,
        padding="max_length",
        max_length=2048,
    )
    examples["labels"] = examples["input_ids"]
    return examples

In [9]:
tokenizer.pad_token = tokenizer.eos_token

EOS_TOKEN = tokenizer.eos_token

In [10]:
dataset = dataset.map(
    tokenize_function,
    remove_columns=[item for item in dataset.column_names if item != "overview"],
    batched=True,
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
pet_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    use_dora = True,
)

In [12]:
PER_DEVICE_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
EPOCHS = 2
WARMUP_STEPS = 0
LEARNING_RATE = 2e-5
OUTPUT_DIRECTORY = "output"
WEIGHT_DECAY = 0.001
SCHEDLER = "linear"
LOGGING_STEPS = 10

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    # dataset_num_proc = 2,
    peft_config=pet_config,
    args = SFTConfig(
        report_to="none",
        packing = False,
        dataset_text_field = "overview",
        max_seq_length = 2048,
        per_device_train_batch_size = PER_DEVICE_BATCH_SIZE,
        gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": True},
        warmup_steps = WARMUP_STEPS,
        num_train_epochs = EPOCHS,
        save_strategy="epoch",
        learning_rate = LEARNING_RATE,
        logging_steps = LOGGING_STEPS,
        optim = 'adamw_torch',
        # fp16=True,
        weight_decay = WEIGHT_DECAY,
        lr_scheduler_type = SCHEDLER,
        seed = 3407,
        output_dir = OUTPUT_DIRECTORY,
    ),
)

  trainer = SFTTrainer(


In [13]:
trainer_stats = trainer.train()

Step,Training Loss


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
trainer.model.save_pretrained(OUTPUT_DIRECTORY)
tokenizer.save_pretrained(OUTPUT_DIRECTORY)

('output/tokenizer_config.json',
 'output/special_tokens_map.json',
 'output/tokenizer.json')

In [None]:
from tqdm import tqdm
import json

vocab_tokens = tokenizer.get_vocab().keys()

# Convert vocabulary tokens to a list
vocab_tokens_list = list(vocab_tokens)

# Get the embeddings for each vocabulary token
embeddings_list = {}
for i in tqdm(range(len(vocab_tokens_list))):
    tokens = tokenizer(vocab_tokens_list[i], return_tensors="pt")
    embeddings = model(**tokens.to('cuda'), output_hidden_states=True).hidden_states[0][0].squeeze().tolist()
    embeddings_list[vocab_tokens_list[i]] = embeddings


with open('embeddings_list.json', 'w') as fp:
    json.dump(embeddings_list, fp, sort_keys=True, indent=4)

err


  0%|          | 52/50277 [00:25<6:40:57,  2.09it/s]