In [None]:
# ✅ Step 0: Install necessary libraries
# Notebook shell escape: each line starting with `!` is run as a shell command,
# not as Python. `-q` keeps pip's output quiet.
!pip install -q transformers datasets accelerate

# ✅ Step 1: Add swap memory to avoid RAM crash (optional but recommended)
# Sequence: allocate a 4G file, restrict it to owner read/write, format it as
# swap space, then enable it.
# NOTE(review): the output captured for this cell shows `swapon` failing with
# "Invalid argument", so no swap was actually enabled in that run — presumably
# the hosted runtime does not accept this swap file; TODO confirm.
!fallocate -l 4G /swapfile
!chmod 600 /swapfile
!mkswap /swapfile
!swapon /swapfile

# ✅ Step 2: Import libraries
import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Step 3: Load and preprocess CSV data
# Only the "text" column is used downstream, so parse just that column instead
# of materialising the whole CSV in memory.
df = pd.read_csv("/content/sample_data/Copy of train.csv", usecols=["text"])
df = df.dropna(subset=["text"])
df["text"] = df["text"].astype(str).str.strip()
# Whitespace-only rows turn into "" after stripping; they carry no training
# signal, so drop them as well and renumber the remaining rows.
df = df[df["text"] != ""].reset_index(drop=True)
# preserve_index=False keeps the pandas index from being stored as an extra
# "__index_level_0__" column in the resulting dataset.
dataset = Dataset.from_pandas(df, preserve_index=False)

# ✅ Step 4: Convert to Zephyr chat format
def to_chat_format(example):
    """Wrap one dataset row in a two-turn chat structure.

    Args:
        example: Mapping with a "text" entry holding the row's text.

    Returns:
        A dict with a single "chat" key: a list containing a user turn whose
        content is ``example["text"]`` followed by an assistant turn whose
        content is the empty string.
    """
    user_turn = {"role": "user", "content": example["text"]}
    assistant_turn = {"role": "assistant", "content": ""}
    return {"chat": [user_turn, assistant_turn]}

# Add a "chat" column (built by to_chat_format) to every row.
dataset = dataset.map(to_chat_format)

# ✅ Step 5: Load model and tokenizer
import torch

model_name = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in half precision, then move the whole model onto the GPU.
# (Alternative loading mode: torch_dtype="auto", device_map="auto".)
# NOTE(review): float16 weights cannot be combined with the Trainer's fp16
# mixed precision (its gradient scaler rejects float16 gradients); keep the
# precision settings used for training consistent with this dtype.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to("cuda")

# ✅ Step 6: Tokenize using apply_chat_template
def tokenize_chat(example):
    """Turn one chat-formatted row into tokenizer output.

    The row's "chat" messages are first rendered to a plain string with the
    module-level tokenizer's chat template (no generation prompt appended),
    and that string is then tokenized.

    Args:
        example: Mapping with a "chat" entry holding the list of messages.

    Returns:
        The tokenizer's output for the rendered chat, truncated to at most
        1024 tokens and left unpadded.
    """
    rendered = tokenizer.apply_chat_template(
        example["chat"], tokenize=False, add_generation_prompt=False
    )
    return tokenizer(rendered, truncation=True, max_length=1024, padding=False)



# ✅ Step 7: Tokenize every row (dropping the original columns) and write the
# result straight to disk so it can be reloaded later.
from datasets import load_from_disk

dataset.map(tokenize_chat, remove_columns=dataset.column_names).save_to_disk("tokenized_dataset")

# ✅ Step 8: (RECOMMENDED) Restart the runtime here to clear memory
# Runtime > Restart Runtime
# Note: a restart clears every Python variable, so the imports and the
# model/tokenizer loading step must be re-run before training.

# ✅ Step 9: Reload the tokenized dataset from disk
tokenized_dataset = load_from_disk("tokenized_dataset")

# ✅ Step 10: Set up training
import torch
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Trade compute for memory: activations are recomputed in the backward pass
# instead of being kept alive.
model.gradient_checkpointing_enable()

# fp16 mixed precision uses a gradient scaler that refuses to unscale float16
# gradients ("Attempting to unscale FP16 gradients"), so it may only be enabled
# when the weights themselves are not already float16.
# NOTE(review): with float16 weights this falls back to plain half-precision
# training; loading the model in float32 (or using bf16=True on a GPU that
# supports it) is numerically safer if memory allows.
use_fp16_amp = model.dtype != torch.float16

training_args = TrainingArguments(
    output_dir="./zephyr-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # effective batch size of 2 per device
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,  # keep only the two most recent checkpoints
    fp16=use_fp16_amp,  # Use bf16=True if you have an A100 or similar
    report_to="none"
)

# mlm=False selects causal-LM batching (labels are derived from input_ids).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Step 11: Train
trainer.train()

# Checkpoints are only written every `save_steps`, so without an explicit save
# the steps after the last checkpoint would be lost. Persist the final model
# to `output_dir`.
trainer.save_model()


Setting up swapspace version 1, size = 4 GiB (4294963200 bytes)
no label, UUID=088174d9-132b-4e26-baaf-9d328309377b
swapon: /swapfile: swapon failed: Invalid argument


  df = pd.read_csv("/content/sample_data/Copy of train.csv")


Map:   0%|          | 0/1381 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]