In [None]:
import os

import bitsandbytes
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling, Trainer
from transformers import TrainingArguments

In [None]:
# Select the compute device: CUDA when PyTorch reports it as available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Identifier of the pretrained checkpoint; the tokenizer and model cells below both load from it.
MODEL_NAME = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B"


In [None]:
# Load the tokenizer that belongs to MODEL_NAME.
# NOTE(review): trust_remote_code=True allows custom tokenizer code from the model
# repository to be loaded and executed — confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [None]:
# Make sure a pad token exists: when the tokenizer defines none, reuse its
# end-of-sequence token as the padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the base model with 4-bit bitsandbytes quantization.
# The quantization settings are passed as a BitsAndBytesConfig because handing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",  # automatic placement of the weights on the available device(s)
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # 4-bit, same as before
)

In [None]:
# LoRA adapter hyper-parameters for causal-LM fine-tuning.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Names of the modules that receive LoRA adapters; adjust to the architecture as needed.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,              # rank of the low-rank update matrices
    lora_alpha=32,     # scaling factor for the LoRA update
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none",
)

In [None]:
# Wrap the model with PEFT/LoRA.
# `model` is rebound to the wrapped model, so re-executing this cell would stack a
# second adapter wrapper on top of the first. The isinstance guard makes the cell
# safe to run more than once.
if isinstance(model, PeftModel):
    print("Model is already LoRA-wrapped; reload the base model to apply a different lora_config.")
else:
    model = get_peft_model(model, lora_config)
print("LoRA model parameters:")
model.print_trainable_parameters()

In [None]:
# Location of the training CSV. The SQL_FINETUNE_DATA_FILE environment variable
# overrides the path, so the notebook can run on another machine without editing
# this cell; the original local path stays the default.
DATA_FILE = os.environ.get(
    "SQL_FINETUNE_DATA_FILE",
    'D:/Projects/Chat Assistant for SQLite Database/FineTune/SQL_CustumDataSet/spider_text_sql.csv',
)
# Upper bound on tokens per example; handed to the tokenizer as max_length.
MAX_SEQ_LENGTH = 1024

In [None]:
def format_example(example):
    """Build the single training string for one dataset row.

    The row's "text_query" and "sql_command" values are stripped of surrounding
    whitespace, joined by a newline, and followed by the tokenizer's EOS token.

    Returns:
        A dict holding the combined string under the key "text".
    """
    question = example["text_query"].strip()
    sql = example["sql_command"].strip()
    combined = "\n".join((question, sql)) + tokenizer.eos_token
    return {"text": combined}

# Read the CSV as the "train" split, then run format_example over every row so
# each example gains the unified "text" field.
dataset = load_dataset("csv", data_files={"train": DATA_FILE})["train"].map(format_example)

In [None]:
# Tokenize the dataset.
def tokenize_function(examples):
    """Tokenize a batch of formatted examples.

    Args:
        examples: a batch as passed by `map(..., batched=True)`; its "text"
            entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to MAX_SEQ_LENGTH tokens.

    No padding is applied here. Padding every example to MAX_SEQ_LENGTH made each
    training step process the full maximum length regardless of the real text
    length; the data collator pads each batch to its longest member instead.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Apply tokenize_function to the whole dataset, one batch of rows per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Restrict the formatted output to the two model-input columns, as PyTorch tensors.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [None]:
# Batch collator for language modeling; mlm=False turns off masked-language-model
# masking, i.e. the causal-LM setting.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Training configuration.
# NOTE: the following "Google Colab" cell assigns training_args again, so when
# every cell is executed in order that cell's values replace these.
training_args = TrainingArguments(
    output_dir="./deepseek_r1_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # simulate a larger batch size
    num_train_epochs=3,
    # Was 2e-2, which is far too large for stable fine-tuning; aligned with the
    # value used by the Colab configuration cell.
    learning_rate=2e-5,
    logging_steps=5,
    save_steps=50,
    fp16=True,  # mixed precision if supported
    report_to="none",
)

In [None]:
# Google Colab configuration.
# This cell rebinds training_args, so it takes precedence over the configuration
# cell above whenever both are executed. TrainingArguments comes from the
# notebook's import cell; no second import is needed here.
training_args = TrainingArguments(
    output_dir="./deepseek_r1_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Simulates a larger batch size
    num_train_epochs=3,
    learning_rate=2e-5,  # Reduced learning rate for stability
    logging_steps=10,  # Log every 10 steps
    save_steps=100,  # Save checkpoint every 100 steps
    fp16=True,  # Enable mixed precision training
    report_to="none",
)


In [None]:
# Assemble the Trainer from the model, the training configuration, the tokenized
# dataset and the language-modeling collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
# Run training; the two prints mark the start and the end of the train() call.
print("Starting training...")
trainer.train()
print("Training complete.")

In [None]:
# Save Model and Tokenizer
# The target directory is separate from the Trainer's output_dir ("./deepseek_r1_finetuned").
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this presumably
# stores the LoRA adapter rather than fully merged weights — confirm before relying on it.
output_dir = 'fine_tuned_deepseek'
os.makedirs(output_dir, exist_ok=True)  # create the directory; no error if it already exists
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)