In [1]:
!pip install datasets sympy wandb
!pip install --no-cache-dir bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m295.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [4]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [14]:
import json
with open("/kaggle/input/bhagwad-geeta-data/geeta.json",'r') as f:
    data = json.load(f)

In [17]:
print(len(data))
print(type(data))
print(type(data[0]))

177
<class 'list'>
<class 'dict'>


In [32]:
import wandb

wandb.init(
    project="DeepSeek Fine-Tune on Geeta",
    mode="offline",  # Use "offline" mode to avoid network issues
    config={
        "learning_rate": 5e-5,
        "architecture": "DeepSeek-R1-Distill-Qwen-1.5B",
        "dataset": "/kaggle/input/bhagwad-geeta-data/geeta.json",
        "epochs": 3
    }
)

print("W&B initialized successfully!")

W&B initialized successfully!


In [29]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="/kaggle/input/bhagwad-geeta-data/geeta.json",split='train')
print(len(dataset))

tts = dataset.train_test_split(test_size=0.15)
train_data = tts['train']
test_data = tts['test']

len(train_data),len(test_data)

177


(150, 27)

In [36]:
def tokenize_function(examples):
    combined_texts = [f"{question}\n{answer}" for question, answer in zip(examples["question"], examples["answer"])]
    tokenized = tokenizer(combined_texts, truncation=True, max_length=512,padding="max_length")
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

train_data_tokenized = train_data.map(tokenize_function, batched=True)
test_data_tokenized = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

In [46]:
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")

**# Implementing LORA Config**

In [47]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha = 8, #lower = fast task
    # target_modules = ["q_proj" , "v_proj"],
    # lora_dropout = 0.05,
    # bias = "none",
    task_type = TaskType.CAUSAL_LM,
)

model = get_peft_model(model,lora_config)
model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 1,778,177,536 || trainable%: 0.0613


In [48]:
from transformers import TrainingArguments, Trainer 

training_args = TrainingArguments(
    output_dir = "./deepseek_finetuned_on_bhagwad_geeta",
    num_train_epochs = 50,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps=16,
    fp16=True,
    logging_steps=10,
    learning_rate = 3e-5,
    logging_dir = "./logs",
    report_to = "wandb",
    run_name = "DeepSeek_finetuning_on_Bhagwad_Geeta"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_data_tokenized,
    eval_dataset = test_data_tokenized
)

In [49]:
trainer.train()

Step,Training Loss
10,123.978
20,110.3888
30,90.8759
40,63.9383
50,43.2456
60,25.716
70,17.2024
80,10.458
90,7.4223
100,5.7777


TrainOutput(global_step=200, training_loss=27.409903011322022, metrics={'train_runtime': 2406.6236, 'train_samples_per_second': 3.116, 'train_steps_per_second': 0.083, 'total_flos': 3.316251412660224e+16, 'train_loss': 27.409903011322022, 'epoch': 49.85333333333333})

In [50]:
save_path = "/kaggle/working"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/kaggle/working/tokenizer_config.json',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/tokenizer.json')

In [55]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, save_path)
model = model.merge_and_unload()

final_save_path = "/kaggle/working/Finetuned"
model.save_pretrained(final_save_path)
tokenizer.save_pretrained(final_save_path)

('/kaggle/working/Finetuned/tokenizer_config.json',
 '/kaggle/working/Finetuned/special_tokens_map.json',
 '/kaggle/working/Finetuned/tokenizer.json')

In [58]:
!zip -r /kaggle/working/deepseek_finetuned.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/ (stored 0%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/ (stored 0%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/scheduler.pt (deflated 56%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/adapter_config.json (deflated 53%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/optimizer.pt (deflated 7%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/training_args.bin (deflated 52%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/README.md (deflated 66%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/rng_state.pth (deflated 25%)
  adding: kaggle/working/deepseek_finetuned_on_bhagwad_geeta/checkpoint-200/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/deepseek_finetuned_on_bha

In [None]:
from IPython.display import FileLink

FileLink("deepseek_finetuned.zip")