Import All Necessary Files

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
import pandas as pd
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig,get_peft_model
import bitsandbytes
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb

Check for GPU availability

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Load the pre-trained model and tokenizer

In [None]:
MODEL_NAME = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    trust_remote_code=True  # Allow loading custom tokenizer implementations
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Ensure pad token is set if not present

In [9]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)


Load the model with 4-bit quantization

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_4bit=True
      # Enable 4-bit quantization
)

config.json:   0%|          | 0.00/823 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

Define LoRA configuration for efficient training

In [13]:
lora_config = LoraConfig(
    r=16,                # rank of LoRA updates
    lora_alpha=32,       # scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # adjust target modules as needed
    lora_dropout=0.1,    # dropout for LoRA layers
    bias="none",
    task_type="CAUSAL_LM"
)

Wrap the model with PEFT/LoRA

In [None]:
model = get_peft_model(model, lora_config)
print("LoRA model parameters:")
model.print_trainable_parameters()

LoRA model parameters:
trainable params: 4,358,144 || all params: 1,781,446,144 || trainable%: 0.2446


Data Processing

In [None]:
DATA_FILE = '/content/spider_text_sql.csv'
MAX_SEQ_LENGTH = 1024
def format_example(example):
    # Concatenate the 'intput' and 'output' columns with a newline and EOS token.
    text = example["text_query"].strip() + "\n" + example["sql_command"].strip() + tokenizer.eos_token
    return {"text": text}

# Load the CSV dataset.
dataset = load_dataset("csv", data_files={"train": DATA_FILE})
# Map each example to a unified text field.
dataset = dataset["train"].map(format_example)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/8034 [00:00<?, ? examples/s]

Tokenize the dataset.

In [None]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/8034 [00:00<?, ? examples/s]

Create a data collator for language modeling.

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Run Only If Your Are on Google Colab

In [None]:
training_args = TrainingArguments(
    output_dir="./deepseek_r1_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Simulates a larger batch size
    num_train_epochs=1,
    learning_rate=2e-5,  # Reduced learning rate for stability
    logging_steps=10,  # Log every 10 steps
    save_steps=100,  # Save checkpoint every 100 steps
    fp16=True,  # Enable mixed precision training
    report_to="none",
)


In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,)

In [26]:
print("Starting training...")
trainer.train()
print("Training complete.")

Starting training...


Step,Training Loss
10,1.8801
20,1.9536
30,1.8326
40,1.9695
50,1.8617
60,1.9075
70,1.9125
80,1.698
90,1.7583
100,1.7687


Training complete.


In [None]:
output_dir = '/content/drive/MyDrive/output'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)Save Model

('/content/drive/MyDrive/output/tokenizer_config.json',
 '/content/drive/MyDrive/output/special_tokens_map.json',
 '/content/drive/MyDrive/output/tokenizer.json')

Save Model

In [31]:
save_path = '/content/drive/MyDrive/output1'
torch.save(model.state_dict(), save_path)

