## Step 0 - Installing Librariess

In [27]:
!pip install -q --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install -q sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install -q --no-deps unsloth

In [28]:
import os
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" 


## Step 1 - Choose a Base Model

## Step 2 - Play with the Base Model

In [29]:
# CHANGES:
# - torch_dtype set to torch.bfloat16 (fixed).
# - Paths parameterized via env vars for portability (Kaggle/local).
# - Consistent device variable.

import os, json, zipfile, random, warnings
from pathlib import Path

import torch
import pandas as pd
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

warnings.filterwarnings("ignore")
SEED = int(os.getenv("SEED", 42))
random.seed(SEED)
torch.manual_seed(SEED)

# ---- Paths ----
INPUT_DIR = os.getenv("INPUT_DIR", "/kaggle/input/final-dataset-v4")  # change if not on Kaggle
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

TRAIN_CSV = os.getenv("TRAIN_CSV", os.path.join(INPUT_DIR, "trial_translated_with_english_V1.csv"))
TEST_XLSX = os.getenv("TEST_XLSX", os.path.join(INPUT_DIR, "test_250.xlsx"))  # can be .csv too

# ---- Model ----
MODEL_ID = os.getenv("MODEL_ID", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")

# ---- Device / dtype ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE = torch.bfloat16  # prefer bfloat16 if your GPU supports it

print(f"Using device: {device}, dtype: {DTYPE}, model: {MODEL_ID}")


Using device: cuda, dtype: torch.bfloat16, model: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct


In [30]:
# CHANGES:
# - Pulls the official chat template from tokenizer_config.json.
# - Registers this template with Unsloth (so apply_chat_template works consistently).

from unsloth.chat_templates import get_chat_template
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True,trust_remote_code=True)
assert tokenizer.chat_template is not None, "Tokenizer must provide a chat_template!"

# Register with Unsloth (tuple = (template, eos_token))
tokenizer = get_chat_template(tokenizer, chat_template=(tokenizer.chat_template, tokenizer.eos_token))

# Sanity-check
print("Template head:", tokenizer.chat_template[:200].replace("\n", "\\n"))

# Ensure padding token is set (HF usually handles this; fallback to eos)
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token


Model does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.
Template head: {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['cont


In [31]:
# CHANGES:
# - Optional 4-bit loading via USE_4BIT env var.
# - Aligns model.config.torch_dtype when possible.

USE_4BIT = bool(int(os.getenv("USE_4BIT", "0")))  # set 1 to enable 4-bit quant

bnb_config = None
if USE_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=DTYPE,
        bnb_4bit_use_double_quant=True,
    )

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto" if torch.cuda.is_available() else None,
    quantization_config=bnb_config,
    trust_remote_code=True, 
)

try:
    model.config.torch_dtype = DTYPE
except Exception:
    pass

model.eval()
print("Model loaded.")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.


## Step 3 - Set the LoRA Adapters

In [32]:
# CHANGES:
# - Robust column detection for IN/OUT and test prompt columns.
# - openpyxl needed for .xlsx; use CSV if you prefer.

assert os.path.exists(TRAIN_CSV), f"Train CSV not found at {TRAIN_CSV}"
assert os.path.exists(TEST_XLSX), f"Test file not found at {TEST_XLSX}"

train_df = pd.read_csv(TRAIN_CSV)

# Auto-detect columns; override these two if your schema differs.
C_IN, C_OUT = None, None
for cand_in in ["instruction", "prompt", "input", "question", "Instruction"]:
    if cand_in in train_df.columns:
        C_IN = cand_in
        break
for cand_out in ["response", "output", "answer", "target", "Response"]:
    if cand_out in train_df.columns:
        C_OUT = cand_out
        break

if C_IN is None or C_OUT is None:
    raise ValueError(
        f"Could not auto-detect (instruction/response) columns. "
        f"Found: {list(train_df.columns)}. Please set C_IN/C_OUT manually."
    )

# Read test
if TEST_XLSX.endswith(".xlsx"):
    test_df = pd.read_excel(TEST_XLSX, engine="openpyxl")
else:
    test_df = pd.read_csv(TEST_XLSX)

# Find a likely prompt column in test
TEST_PROMPT_COL = None
for cand in ["instruction", "prompt", "input", "question", "Instruction"]:
    if cand in test_df.columns:
        TEST_PROMPT_COL = cand
        break
if TEST_PROMPT_COL is None:
    for col in test_df.columns:
        if test_df[col].dtype == object:
            TEST_PROMPT_COL = col
            break
    if TEST_PROMPT_COL is None:
        raise ValueError("Could not determine test prompt column. Please set TEST_PROMPT_COL.")

print(f"Train columns: IN={C_IN}, OUT={C_OUT}; Test prompt column: {TEST_PROMPT_COL}")
print(f"Train size={len(train_df)}, Test size={len(test_df)}")


Train columns: IN=instruction, OUT=response; Test prompt column: instruction
Train size=74, Test size=250


## Step 4 - Data Prep

### Chat Template

In [33]:
# CHANGES:
# - Uses 'user' / 'assistant' roles (not 'model').
# - Builds a single 'text' field per sample using tokenizer.apply_chat_template.

def to_convo(row):
    return [
        {"role": "user", "content": str(row[C_IN])},
        {"role": "assistant", "content": str(row[C_OUT])},
    ]

chat_texts = []
for _, row in train_df.iterrows():
    conv = to_convo(row)
    txt = tokenizer.apply_chat_template(
        conv, tokenize=False, add_generation_prompt=False
    )
    chat_texts.append(txt)

train_text_df = pd.DataFrame({"text": chat_texts})
dataset = Dataset.from_pandas(train_text_df)
print(dataset)


Dataset({
    features: ['text'],
    num_rows: 74
})


In [34]:
import torch

# --- Precision auto-detect ---
has_cuda = torch.cuda.is_available()
mps = getattr(torch.backends, "mps", None)
has_mps = bool(mps and torch.backends.mps.is_available())  # Apple Silicon

# BF16 needs Ampere+ (compute capability >= 8.0) with CUDA 11+
def _ampere_or_newer():
    if not has_cuda:
        return False
    major, minor = torch.cuda.get_device_capability(0)
    return major >= 8

use_bf16 = has_cuda and _ampere_or_newer()
use_fp16 = has_cuda and not use_bf16  # prefer bf16 when available; otherwise fp16; otherwise neither

# MPS or CPU -> no mixed precision via TrainingArguments
if has_mps or (not has_cuda):
    use_bf16 = False
    use_fp16 = False

print(f"CUDA: {has_cuda} | MPS: {has_mps} | fp16: {use_fp16} | bf16: {use_bf16}")


CUDA: True | MPS: False | fp16: True | bf16: False


### Use Your Own Instruction Dataset
### You will find a sample dataset here - https://noshinulfat.github.io/blp25_code_generation_task/#/task-announcement

### Map

In [35]:
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
import os, torch

# --- LoRA ---
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=None,  # let PEFT auto-detect; or pass a list for your model family
)

# --- Hyperparams ---
EPOCHS = int(os.getenv("EPOCHS", "1"))
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "2"))
GRAD_ACCUM = int(os.getenv("GRAD_ACCUM", "8"))
LR = float(os.getenv("LR", "2e-4"))
MAX_SEQ_LEN = int(os.getenv("MAX_SEQ_LEN", "4096"))

# Ensure tokenizer has a pad token (common source of OOMs / shape errors)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    if hasattr(model.config, "pad_token_id"):
        model.config.pad_token_id = tokenizer.pad_token_id

# Optional: helps large context models
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()

# --- TRL config (ONLY TrainingArguments-compatible fields here) ---

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    fp16=False,
    bf16=True,
    report_to=None,  # or "wandb"/"tensorboard"
)


# --- Trainer (pass seq length / dataset field / packing HERE) ---
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=lora,
    args=sft_config,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    packing=False,
)

print("Trainer ready.")


ValueError: Your setup doesn't support bf16/gpu. You need Ampere+ GPU with cuda>=11.0

In [None]:
trainer.train()

### Check the First Instance

In [None]:
# CHANGES:
# - Consistent device handling.
# - Strips echoed prompt text when possible.

GEN_KW = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    num_return_sequences=1,
)

def generate_one(prompt: str) -> str:
    conv = [{"role": "user", "content": str(prompt)}]
    templated = tokenizer.apply_chat_template(
        conv, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(templated, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, **GEN_KW)
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    # Try to remove the prompt portion if generation echoes it
    return decoded[len(templated):].strip() if decoded.startswith(templated) else decoded.strip()

# Quick smoke test
print(generate_one("Say hello in one short line."))


### Formatting

In [None]:
# CHANGES:
# - Produces both submission.json and submission.zip.

preds = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    prompt = str(row[TEST_PROMPT_COL])
    pred = generate_one(prompt)
    preds.append(pred)

submission = [{"id": int(i), "output": o} for i, o in enumerate(preds)]

sub_json = Path(OUTPUT_DIR) / "submission.json"
with open(sub_json, "w", encoding="utf-8") as f:
    json.dump(submission, f, ensure_ascii=False, indent=2)

zip_path = Path(OUTPUT_DIR) / "submission.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    zf.write(sub_json, arcname="submission.json")

print(f"Saved: {sub_json}")
print(f"Saved: {zip_path}")


### Check the First Instance Again

In [None]:
# from trl import SFTTrainer, SFTConfig
# import math

# # --- constants ----------------------------------------------------------
# DATASET_SIZE    = 74
# PER_DEV_BATCH   = 16
# GRAD_ACC_STEPS  = 4
# EPOCHS          = 10

# # derived values (no longer passed as kwargs)
# EFFECTIVE_BATCH   = PER_DEV_BATCH * GRAD_ACC_STEPS
# STEPS_PER_EPOCH   = math.ceil(DATASET_SIZE / EFFECTIVE_BATCH)
# MAX_STEPS         = EPOCHS * STEPS_PER_EPOCH

# # If your GPU supports bfloat16 (A100/H100/4090 etc.), prefer bf16=True, fp16=False.
# # With 4-bit QLoRA both work; fp16=True is fine on most consumer GPUs.
# cfg = SFTConfig(
#     dataset_text_field            = "text",

#     # ── memory / speed knobs ────────────────────────────────────────────
#     packing                       = False,             # set True if your samples are short & you want higher throughput
#     per_device_train_batch_size   = PER_DEV_BATCH,
#     gradient_accumulation_steps   = GRAD_ACC_STEPS,
#     gradient_checkpointing        = True,
#     gradient_checkpointing_kwargs = {"use_reentrant": False},  # avoids re-entrant issues on some stacks
#     bf16                          = False,
#     fp16                          = True,

#     # Use paged 8-bit AdamW with 4-bit QLoRA
#     optim                         = "paged_adamw_8bit",

#     # ── schedule / optimisation ────────────────────────────────────────
#     num_train_epochs              = EPOCHS,
#     max_steps                     = MAX_STEPS,        # keeps the same total step target
#     warmup_steps                  = 10,
#     lr_scheduler_type             = "cosine",
#     learning_rate                 = 1e-4,             # common for r=16 LoRA; adjust 5e-5–2e-4 as needed
#     weight_decay                  = 0.01,

#     # ── misc / io ──────────────────────────────────────────────────────
#     logging_steps                 = 1,
#     dataset_num_proc              = 8,
#     seed                          = 3407,
#     report_to                     = "none",
#     save_safetensors              = True,
#     remove_unused_columns         = False,            # avoids dropping columns when using custom formatting
#     dataloader_pin_memory         = True,
# )

# trainer = SFTTrainer(
#     model         = model,       # loaded earlier with Unsloth (4-bit) and LoRA injected
#     tokenizer     = tokenizer,
#     train_dataset = dataset,
#     args          = cfg,
# )


### Masking

In [None]:
# from unsloth.chat_templates import train_on_responses_only
# trainer = train_on_responses_only(
#     trainer,
#     instruction_part = "<start_of_turn>user\n",
#     response_part = "<start_of_turn>model\n",
# )

In [None]:
# from unsloth.chat_templates import train_on_responses_only

# trainer = train_on_responses_only(
#     trainer,
#     instruction_part = "<|user|>\n",       # user message marker
#     response_part    = "<|assistant|>\n",  # assistant response marker
# )


### Verify the Masking

### Now let's print the masked out example - you should see only the answer is present:

## Step 6 - Let's Finetune 🔥🔥

In [None]:
# import json

# def add_prefix_to_response(input_file: str, output_file: str, prefix: str):
#     """
#     Reads a JSON file with fields 'id' and 'response', 
#     adds a prefix string before the 'response' field value,
#     and writes the modified JSON to a new file.

#     Args:
#         input_file (str): Path to input JSON file.
#         output_file (str): Path to output JSON file.
#         prefix (str): String to add before response.
#     """
#     # Read the JSON file
#     with open(input_file, "r", encoding="utf-8") as infile:
#         data = json.load(infile)

#     # Handle both dict and list of dicts
#     if isinstance(data, dict):
#         if "response" in data:
#             data["response"] = prefix + data["response"]
#     elif isinstance(data, list):
#         for item in data:
#             if isinstance(item, dict) and "response" in item:
#                 item["response"] = prefix + item["response"]

#     # Write modified JSON to output file
#     with open(output_file, "w", encoding="utf-8") as outfile:
#         json.dump(data, outfile, indent=4, ensure_ascii=False)


# # Example usage:
# add_prefix_to_response("submission.json", "submission_modified.json", "python\n")


## Step 9 - Preparing Submission File

In [None]:
# import json, os, re, zipfile

# SUB_PATH = "submission.json"

# def file_format_check(path: str) -> bool:
#     # name + extension
#     if os.path.basename(path) != "submission.json":
#         print("Error: File name must be exactly 'submission.json'")
#         return False
#     if not path.lower().endswith(".json"):
#         print("Error: File must have .json extension")
#         return False

#     # must be valid JSON (not JSONL) and root must be a list
#     try:
#         with open(path, "r", encoding="utf-8") as f:
#             data = json.load(f)
#     except json.JSONDecodeError as e:
#         print(f"Error: Invalid JSON format - {e}")
#         print("Note: The file must be in proper JSON format (not JSONL)")
#         return False

#     if not isinstance(data, list):
#         print("Error: The root element should be a list of objects")
#         return False

#     # each item: dict with ONLY keys {'id','response'}; id=int; response=str
#     for idx, item in enumerate(data):
#         if not isinstance(item, dict):
#             print(f"Error: Item at index {idx} is not a dictionary")
#             return False
#         keys = set(item.keys())
#         if keys != {"id", "response"}:
#             print(f"Error: Item at index {idx} must contain only keys 'id' and 'response', found: {keys}")
#             return False
#         if not isinstance(item["id"], int):
#             print(f"Error: 'id' field at index {idx} must be an integer")
#             return False
#         if not isinstance(item["response"], str):
#             print(f"Error: 'response' field at index {idx} must be a string")
#             return False

#     print("Format check passed successfully!")
#     return True

# # ---------- Load, compute per-item validity, blank invalids, save, zip ----------
# # Load JSON list
# with open(SUB_PATH, "r", encoding="utf-8") as f:
#     data = json.load(f)

# n = len(data)
# fence_pat = re.compile(r"^```python[\s\S]*```$", re.MULTILINE)

# valid_format = []
# valid_fence  = []
# valid_both   = []

# # Per-item validation mirrors file checker semantics
# def item_format_ok(item):
#     return (
#         isinstance(item, dict)
#         and set(item.keys()) == {"id", "response"}
#         and isinstance(item["id"], int)
#         and isinstance(item["response"], str)
#     )

# for item in data:
#     vfmt = item_format_ok(item)
#     vf   = bool(fence_pat.match(item["response"])) if vfmt else False
#     valid_format.append(vfmt)
#     valid_fence.append(vf)
#     valid_both.append(vfmt and vf)

# # Report stats
# nf = sum(valid_fence)
# nm = sum(valid_format)
# nb = sum(valid_both)
# den = max(n, 1)
# print(f"Fencing valid: {nf}/{n} ({nf*100.0/den:.1f}%)")
# print(f"Format valid:  {nm}/{n} ({nm*100.0/den:.1f}%)")
# print(f"Both valid:    {nb}/{n} ({nb*100.0/den:.1f}%)")

# # Strict policy: blank responses that fail ANY check
# for i, ok in enumerate(valid_both):
#     if not ok and isinstance(data[i], dict) and "response" in data[i]:
#         data[i]["response"] = ""

# # Overwrite submission.json (id+response only)
# with open(SUB_PATH, "w", encoding="utf-8") as f:
#     json.dump(
#         [{"id": item["id"], "response": item["response"]} for item in data],
#         f, ensure_ascii=False, indent=2
#     )
# print("✅ Updated submission.json after checks (invalid responses blanked).")

# # Final file-level check (should pass)
# _ = file_format_check(SUB_PATH)

# # Zip as submission.zip (Jupyter-friendly, no shell commands)
# with zipfile.ZipFile("submission.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
#     zf.write(SUB_PATH)
# print("📦 Created submission.zip containing submission.json.")


# Submit the submission.zip file in CodaBench

### Save the NEW model...if it's good :)

In [None]:
# model.save_pretrained("New_Model")  # Local saving
# tokenizer.save_pretrained("New_Model")