# Part 1: Dependencies and login

In [1]:
import os

# Process-wide settings that must be in place before torch / tokenizers /
# TensorFlow are imported by the later cells.
os.environ.update(
    {
        # Expose only the first GPU to this process.
        "CUDA_VISIBLE_DEVICES": "0",
        # Silence the tokenizers fork-parallelism warning.
        "TOKENIZERS_PARALLELISM": "false",
        # CUDA caching-allocator tuning to limit fragmentation.
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128,expandable_segments:True",
        # Suppress TensorFlow logs
        "TF_CPP_MIN_LOG_LEVEL": "3",
    }
)

In [2]:

!pip install -q transformers peft trl datasets bitsandbytes evaluate rouge_score nltk accelerate wandb hf_xet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.0/348.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.5/25.5 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta 

In [3]:
import os
import pandas as pd
import numpy as np
import torch
import nltk
import evaluate 

from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    # TrainingArguments,
    # HfArgumentParser
)
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
# Make sure the NLTK "punkt" tokenizer data is available.
# nltk.data.find() signals a missing resource with LookupError; catching
# anything else (e.g. a non-existent nltk.downloader.DownloadError) would turn
# a missing resource into an AttributeError instead of triggering the download.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

E0000 00:00:1747160731.885492      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747160731.938808      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# torch.cuda.empty_cache()
# Pick the compute device and report what is visible. The CUDA-only queries
# (device name / current device) are guarded because they raise on a machine
# without a usable GPU, which would defeat the CPU fallback chosen here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Visible CUDA devices:", torch.cuda.device_count())
if device.type == "cuda":
    print("Using device:", torch.cuda.get_device_name(0))
    print(f"Current GPU: {torch.cuda.current_device()}")
else:
    print("Using device: CPU (no CUDA device visible)")

Visible CUDA devices: 1
Using device: Tesla P100-PCIE-16GB
Current GPU: 0


In [5]:
# Print the full path of every file mounted under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

/kaggle/input/d/rockerleo/mimic-final/mimic_tuning_small_fixed.csv
/kaggle/input/small-raapid/mimic_instruction_tuning_small.csv


In [6]:
# Log in to the Hugging Face Hub.
# The token is read from the Kaggle secret named by `secret_label`. The lookup
# itself is inside the try block so that a missing secret (or running outside
# Kaggle, where kaggle_secrets is unavailable) leads to the manual-login
# fallback instead of an unhandled exception.
secret_label = "HF_TOKEN"
hf_token = None
try:
    from kaggle_secrets import UserSecretsClient
    hf_token = UserSecretsClient().get_secret(secret_label)
except Exception as secret_error:  # not on Kaggle, or the secret is not attached
    print(f"Could not read the {secret_label} secret: {secret_error!r}")

if hf_token:
    print("Hugging Face token found. Loggin in!")
    login(token = hf_token)
else:
    print("HF_TOKEN secret not found. Assuming public model access or manual login.")
    # Imported here because only `login` is imported at the top of the notebook.
    from huggingface_hub import notebook_login
    notebook_login()

Hugging Face token found. Loggin in!


In [7]:
import wandb
from kaggle_secrets import UserSecretsClient

# Authenticate Weights & Biases with the API key stored in the Kaggle secret
# "wandb_token"; the login result is left as the cell's displayed value.
wandb_api_key = UserSecretsClient().get_secret("wandb_token")
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgoel0632[0m ([33mgoel0632-raapid-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Part 2: Model loading and dataset

In [8]:
################################################################################
# 3. CONFIGURATION PARAMETERS
################################################################################
# Single place for every value the later cells read: base model, dataset path,
# LoRA / quantization settings, trainer hyper-parameters and metric objects.

# --- Model Configuration ---
# Hub id of the base checkpoint that is fine-tuned.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"


# --- Dataset Configuration ---
# CSV read with load_dataset('csv', ...); point this at your own dataset file.
dataset_path = "/kaggle/input/d/rockerleo/mimic-final/mimic_tuning_small_fixed.csv" # <--- CHANGE THIS

# --- LoRA Configuration ---
# These values are forwarded unchanged to LoraConfig in the model-loading cell.
lora_r = 16  
lora_alpha = 32 
lora_dropout = 0.05 
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
# When True, the model-loading cell builds a BitsAndBytesConfig (4-bit NF4)
# and the trainer cell disables fp16/bf16 mixed precision.
use_4bit_quantization = True 
# Passed as bnb_4bit_compute_dtype (and as torch_dtype when not quantizing).
compute_dtype = torch.float16 

# --- Training Arguments ---
output_dir = "/kaggle/working/mistral_finetuned_lora" 
# Effective train batch per device = 2 * 4 accumulation steps = 8 examples.
per_device_train_batch_size = 2 
gradient_accumulation_steps = 4
per_device_eval_batch_size = 1  
eval_accumulation_steps = 2
# per_device_eval_batch_size = 1  
# eval_accumulation_steps = 1
learning_rate = 5e-5 
num_train_epochs = 1 
# -1 is passed straight to SFTConfig(max_steps=...); the run is then bounded
# by num_train_epochs.
max_steps = -1 
logging_steps = 25 
# eval_steps and save_steps are kept equal; the trainer cell enables
# load_best_model_at_end, which relies on checkpoints lining up with evals.
eval_steps = 100
save_steps = 100 
# NOTE(review): max_seq_length is not passed to SFTConfig/SFTTrainer in this
# notebook (the max_seq_length argument there is commented out), so the
# trainer's own default truncation length applies - confirm this is intended.
max_seq_length = 2048 
warmup_ratio = 0.03 
# default weight decay
# NOTE(review): lr_scheduler_type is likewise commented out in the SFTConfig
# call, so this value currently has no effect.
lr_scheduler_type = "cosine" 
# lr_scheduler_type = "linear" 
# optim_type = "paged_adamw_8bit" if use_4bit_quantization else "adamw_torch" 
optim_type = "adamw_torch" 

# --- Evaluation Metrics ---
# Metric objects used by compute_metrics_for_trainer; evaluate.load fetches the
# metric scripts on first use.
rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [9]:
################################################################################
# 4. DATASET PREPARATION
################################################################################

# Function to format prompts
def format_prompt(example):
    """
    Build the Mistral-Instruct training string for one dataset row.

    Layout of the produced text:
        <s>[INST] instruction
        input [/INST] output</s>

    Args:
        example: mapping with optional 'instruction', 'input' and 'output'
            entries. Missing keys, None and NaN values are treated as empty.

    Returns:
        {"text": <formatted prompt>} for a usable row, or {"text": None} when
        the instruction or the output is empty. A dict is always returned so
        that `Dataset.map` creates the "text" column for every row and the
        rows to drop can be removed with a filter on `text is not None`.
    """
    def _as_text(value):
        # str(None) would yield the literal "None", which both defeats the
        # emptiness checks below and leaks into the prompt.
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN from pandas
            return ""
        return str(value)

    instruction = _as_text(example.get('instruction'))
    input_text = _as_text(example.get('input'))
    output_text = _as_text(example.get('output'))

    # Rows without an instruction or without a target cannot be trained on.
    if not instruction.strip() or not output_text.strip():
        return {"text": None}

    # Format with instruction and input
    text = f"<s>[INST] {instruction}\n{input_text} [/INST] {output_text}</s>"
    return {"text": text}

# Load dataset
# A missing file is fatal for everything below, so the error is reported and
# re-raised instead of letting the cell continue into a NameError.
try:
    raw_dataset = load_dataset('csv', data_files={'train': dataset_path})['train']
except FileNotFoundError:
    print(f"ERROR: Dataset file not found at {dataset_path}. Please check the path.")
    raise

print(f"Successfully loaded dataset from {dataset_path}")
print(f"Dataset features: {raw_dataset.features}")
print(f"First example: {raw_dataset[0]}")

# Use half of the available cores for formatting, but always at least one
# (os.cpu_count() can return None).
num_map_workers = max(1, (os.cpu_count() or 1) // 2)
formatted_dataset = raw_dataset.map(format_prompt, num_proc=num_map_workers)
original_columns = formatted_dataset.column_names
# Drop rows that format_prompt marked as unusable.
formatted_dataset = formatted_dataset.filter(lambda example: example["text"] is not None)

if len(formatted_dataset) == 0:
    raise ValueError("Dataset is empty after formatting. Check your data and formatting function.")

print(f"Number of examples after formatting: {len(formatted_dataset)}")
print(f"First formatted example: {formatted_dataset[0]['text']}")


if len(formatted_dataset) < 10: # Arbitrary small number, adjust as needed
    # Too few rows to hold any out: evaluation then runs on the training data.
    print("Warning: Dataset is very small. Using all data for training and evaluation.")
    train_dataset = formatted_dataset
    eval_dataset = formatted_dataset
else:
    # Fixed seed so the 95/5 split is reproducible across runs.
    split_dataset = formatted_dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset = split_dataset['train']
    eval_dataset = split_dataset['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Successfully loaded dataset from /kaggle/input/d/rockerleo/mimic-final/mimic_tuning_small_fixed.csv
Dataset features: {'instruction': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None)}
First example: {'instruction': 'Extract ICD-10-CM codes.', 'input': 'Admission Date:  [**2169-1-9**]              Discharge Date:   [**2169-1-15**]\n\nDate of Birth:  [**2087-9-22**]             Sex:   F\n\nService: CARDIOTHORACIC\n\nAllergies:\nPatient recorded as having No Known Allergies to Drugs\n\nAttending:[**First Name3 (LF) 1283**]\nChief Complaint:\nAsymptomatic valve and coronary artery disease\n\nMajor Surgical or Invasive Procedure:\n[**2169-1-10**] - AVR (21mm [**Company 1543**] Mosaic Porcine Valve); CABGx3 (Left\ninternal mammary->Left anterior descending artery, Vein->Obtuse\nmarginal artery, vein->right coronary artery)\n\n\nHistory of Present Illness:\n81 y/o female with known aortic stenosis which has been followed\nby ser

Map (num_proc=2):   0%|          | 0/4720 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4720 [00:00<?, ? examples/s]

Number of examples after formatting: 4720
First formatted example: <s>[INST] Extract ICD-10-CM codes.
Admission Date:  [**2169-1-9**]              Discharge Date:   [**2169-1-15**]

Date of Birth:  [**2087-9-22**]             Sex:   F

Service: CARDIOTHORACIC

Allergies:
Patient recorded as having No Known Allergies to Drugs

Attending:[**First Name3 (LF) 1283**]
Chief Complaint:
Asymptomatic valve and coronary artery disease

Major Surgical or Invasive Procedure:
[**2169-1-10**] - AVR (21mm [**Company 1543**] Mosaic Porcine Valve); CABGx3 (Left
internal mammary->Left anterior descending artery, Vein->Obtuse
marginal artery, vein->right coronary artery)


History of Present Illness:
81 y/o female with known aortic stenosis which has been followed
by serial echocardiograms. Her most recent echocardiogram showed
severe aortic stenosis with dilation of her left atrium and left
ventricle. She underwent an elective cardiac catheterization
which revealed severe three vessel disease.

Past Me

In [10]:
# Show the formatted prompt of the first training row.
train_dataset[0]["text"]

'<s>[INST] Extract clinical findings as condition evidence.\nAdmission Date:  [**2178-9-3**]              Discharge Date:   [**2178-9-5**]\n\n\nService: SURGERY\n\nAllergies:\nPatient recorded as having No Known Allergies to Drugs\n\nAttending:[**First Name3 (LF) 4691**]\nChief Complaint:\ns/p fall\nHead Trauma\n\nMajor Surgical or Invasive Procedure:\nNone\n\nHistory of Present Illness:\n83 yo woman with CAD who presents after fall from standing\nposition today, causing a hip fracture as well as significant\nSAH and left temporal lobe contusion. She was initially awake\nand alert then declined in responsiveness. Intubated in ED.\n\nPast Medical History:\nappendectomy\n\nSocial History:\nnon-contributory\n\nFamily History:\nnon-contributory\n\nPhysical Exam:\n98.5  70  173/95  16  96% intubated\nGen: verbalizing incoherently --> non-verbal; disoriented; GCS\n10\nHead: hematoma at occiput, PERRL\nNeck: c-collar in place, no appreciable C-spine stepoffs\nCV: RRR\nLungs: CTAB\nAbd: soft/N

In [11]:
# Display both splits (their columns and row counts) as the cell output.
train_dataset, eval_dataset

(Dataset({
     features: ['instruction', 'input', 'output', 'text'],
     num_rows: 4484
 }),
 Dataset({
     features: ['instruction', 'input', 'output', 'text'],
     num_rows: 236
 }))

In [12]:
################################################################################
# 5. TOKENIZER AND MODEL LOADING
################################################################################

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Fall back to the EOS token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer pad_token set to eos_token.")

# --- Quantization ------------------------------------------------------------
# bnb_config stays None unless 4-bit loading was requested in the config cell.
bnb_config = None
if use_4bit_quantization:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",        # "nf4" or "fp4"
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,   # also quantize the quantization constants
    )
    print("4-bit quantization configured.")
else:
    print("Not using 4-bit quantization.")

# --- Base model --------------------------------------------------------------
# An explicit dtype is only given for the non-quantized path; with a
# BitsAndBytes config the dtype argument is left as None.
model_dtype = None if bnb_config is not None else compute_dtype
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # device_map="auto", # Automatically distribute model across available GPUs
    trust_remote_code=True,
    torch_dtype=model_dtype,
    use_cache=False,
)
print(f"Model {model_name} loaded.")

# Prepare the quantized model for k-bit training (PEFT helper).
if use_4bit_quantization:
    model = prepare_model_for_kbit_training(model)
    print("Model prepared for k-bit training.")

# --- LoRA --------------------------------------------------------------------
# Adapter settings come straight from the configuration cell.
lora_settings = dict(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none", # Typically "none" for LoRA
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)
print("LoRA configured.")

# Note: SFTTrainer will apply PEFT to the model internally if peft_config is provided.
# If you wanted to apply it manually:
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Tokenizer pad_token set to eos_token.
4-bit quantization configured.


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model mistralai/Mistral-7B-Instruct-v0.3 loaded.
Model prepared for k-bit training.
LoRA configured.


In [13]:
################################################################################
# 6. EVALUATION METRICS FUNCTION (V2)
################################################################################

def compute_metrics_for_trainer(eval_preds):
    """
    Compute ROUGE and BLEU between decoded predictions and decoded labels.

    Args:
        eval_preds: a (predictions, labels) pair as handed over by the Trainer.
            `predictions` may be token ids, raw logits (3-D, reduced here with
            an argmax over the last axis) or a tuple whose first element is
            one of those. `labels` are token ids in which ignored positions
            are marked with -100.

    Returns:
        dict with the float entries "rouge1", "rouge2", "rougeL", "rougeLsum"
        and "bleu".

    NOTE(review): when predictions come from teacher-forced logits they are
    offset by one position relative to the labels; confirm whether an explicit
    shift is wanted before relying on these numbers.
    """
    predictions_ids, label_ids = eval_preds

    # Handle tuple outputs (e.g., (logits,) or (sequences,))
    if isinstance(predictions_ids, tuple):
        predictions_ids = predictions_ids[0]

    # Convert torch tensors to numpy safely
    if isinstance(predictions_ids, torch.Tensor):
        predictions_ids = predictions_ids.detach().cpu().numpy()
    if isinstance(label_ids, torch.Tensor):
        label_ids = label_ids.detach().cpu().numpy()
    predictions_ids = np.asarray(predictions_ids)
    label_ids = np.asarray(label_ids)

    # Raw logits -> most likely token id per position.
    if predictions_ids.ndim == 3:
        predictions_ids = predictions_ids.argmax(axis=-1)

    # Ensure proper integer type
    predictions_ids = predictions_ids.astype("int64")
    label_ids = label_ids.astype("int64")

    # -100 is not a valid token id and cannot be decoded. It appears in the
    # labels (ignored positions) and can also appear in predictions that were
    # padded while being gathered, so it is replaced in both.
    pad_id = tokenizer.pad_token_id
    predictions_ids = np.where(predictions_ids == -100, pad_id, predictions_ids)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)

    decoded_preds = tokenizer.batch_decode(predictions_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Strip and clean; keep a single space as placeholder for empty strings.
    decoded_preds = [pred.strip() or " " for pred in decoded_preds]
    decoded_labels = [label.strip() or " " for label in decoded_labels]

    # Compute ROUGE
    rouge_output = rouge_metric.compute(predictions=decoded_preds,
                                        references=decoded_labels,
                                        use_stemmer=True)

    def _fmeasure(score):
        # The `evaluate` ROUGE metric returns plain floats; older metric
        # implementations returned aggregate objects exposing .mid.fmeasure.
        return score.mid.fmeasure if hasattr(score, "mid") else float(score)

    # Compute BLEU. The `evaluate` BLEU metric takes raw strings (one list of
    # reference strings per prediction) and tokenizes them itself.
    try:
        bleu_output = bleu_metric.compute(
            predictions=[pred.lower() for pred in decoded_preds],
            references=[[label.lower()] for label in decoded_labels],
        )
        bleu_score = bleu_output["bleu"]
    except ZeroDivisionError:
        # Raised when the references contain no tokens at all.
        bleu_score = 0.0

    # Combine and return metrics
    metrics_results = {
        "rouge1": _fmeasure(rouge_output["rouge1"]),
        "rouge2": _fmeasure(rouge_output["rouge2"]),
        "rougeL": _fmeasure(rouge_output["rougeL"]),
        "rougeLsum": _fmeasure(rouge_output["rougeLsum"]),
        "bleu": bleu_score
    }

    return metrics_results


In [14]:
# safe_metrics = trainer.evaluate(
#     eval_dataset=safe_eval_dataset,
#     max_length=256,
#     num_beams=1,
#     metric_key_prefix="safe_eval"
# )


In [15]:
# Memory-related preparation right before the trainer is built.
# Release cached, currently unused CUDA blocks held by PyTorch's allocator.
torch.cuda.empty_cache()
# The KV cache is switched off for training; it is set here together with
# gradient checkpointing, which trades extra compute for lower activation memory.
model.config.use_cache = False
model.gradient_checkpointing_enable()

In [16]:
################################################################################
# 7. TRAINING ARGUMENTS AND TRAINER INITIALIZATION
################################################################################
# Evaluation, best-checkpoint tracking and the trainer's eval set are all
# switched on the same condition: is there any held-out data?
has_eval_data = eval_dataset is not None and len(eval_dataset) > 0

# Metric used to pick the checkpoint restored at the end of training.
# Or 'eval_bleu', 'eval_rougeL' (after prefix) once compute_metrics is enabled.
best_model_metric = "eval_loss" if has_eval_data else None
# The direction must follow the metric: lower is better for a loss, higher is
# better for ROUGE/BLEU. Getting this wrong makes load_best_model_at_end
# restore the *worst* checkpoint. None lets the Trainer decide when no metric
# is tracked.
best_metric_greater_is_better = (
    (not best_model_metric.endswith("loss")) if best_model_metric else None
)

# Mixed precision is only requested for the non-quantized path.
use_mixed_precision = not use_4bit_quantization

training_args = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    eval_accumulation_steps = eval_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=logging_steps,
    eval_strategy="steps" if has_eval_data else "no",
    eval_steps=eval_steps if has_eval_data else None,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=2, # Save only the last 2 checkpoints
    load_best_model_at_end=has_eval_data,
    metric_for_best_model=best_model_metric,
    greater_is_better=best_metric_greater_is_better,
    report_to="wandb", # or "tensorboard"
    run_name = "mistral-tuning",
    fp16=use_mixed_precision and compute_dtype == torch.float16, # FP16 training if not using 4-bit and dtype is float16
    bf16=use_mixed_precision and compute_dtype == torch.bfloat16, # BF16 training
    optim=optim_type,
    warmup_ratio=warmup_ratio,
    seed=42,
    # NOTE(review): per the TrainingArguments documentation this field is not
    # consumed by Trainer itself; resuming requires
    # trainer.train(resume_from_checkpoint=...). Kept as in the original run.
    resume_from_checkpoint = True,
    # lr_scheduler_type=lr_scheduler_type,

    # NOTE(review): no maximum sequence length is passed, so the trainer's
    # default truncation length is used rather than the configured
    # max_seq_length - confirm this is intended.

    # prediction_loss_only=True,

    # generation_max_length=512,  # Prevent excessively long outputs
    # generation_num_beams=1,     # Reduce memory during eval
    # packing=False,
    # Required if using SFTTrainer and want to compute metrics like ROUGE/BLEU during training
    # predict_with_generate=True,
    # remove_unused_columns=True, # SFTTrainer default, but good to be explicit

    # Multi-GPU options (untested here):
    # dataloader_pin_memory=True,  # Avoids device issues
    # ddp_find_unused_parameters=False,  # Critical for multi-GPU
)
print("TrainingArguments configured.")

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    # tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset if has_eval_data else None,
    peft_config=peft_config,
    # dataset_text_field="text",  # Name of the column with formatted prompts
    # max_seq_length=max_seq_length,
    # formatting_func=format_prompt, # Alternative to dataset_text_field if you pass raw dataset
    # compute_metrics=compute_metrics_for_trainer if has_eval_data else None,
    # preprocess_logits_for_metrics=lambda logits, labels: torch.argmax(logits, dim=-1),
    # ^ Needed together with compute_metrics: it reduces each batch of logits
    #   (batch, seq, vocab) to token ids (batch, seq) before they are
    #   accumulated for metric computation. Without it the full logits of the
    #   whole eval set are kept, which exhausts memory.
)
print("SFTTrainer initialized.")

TrainingArguments configured.


Converting train dataset to ChatML:   0%|          | 0/4484 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/4484 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/4484 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/4484 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/236 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/236 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/236 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/236 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


SFTTrainer initialized.


# Part 3: Training and evaluation

In [17]:
################################################################################
# 8. TRAINING
################################################################################
if len(train_dataset) > 0:
    print("Starting training...")
    train_result = trainer.train()
    print("Training finished.")

    # Save training metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    # Final evaluation: only possible when the trainer was given an eval set.
    # The result is logged and saved next to the train metrics instead of
    # being computed and thrown away.
    if trainer.eval_dataset is not None:
        trainer_eval = trainer.evaluate()
        trainer.log_metrics("eval", trainer_eval)
        trainer.save_metrics("eval", trainer_eval)
    else:
        trainer_eval = None

    # Save the final model (LoRA adapters)
    final_model_path = os.path.join(output_dir, "final_model_adapters")
    trainer.save_model(final_model_path) # Saves LoRA adapters
    tokenizer.save_pretrained(final_model_path) # Save tokenizer with adapters
    print(f"Fine-tuned LoRA adapters and tokenizer saved to {final_model_path}")
else:
    print("Skipping training as train_dataset is empty.")

Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Tracking run with wandb version 0.19.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250513_182904-o3ilpkls[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmistral-tuning[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/goel0632-raapid-ai/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/goel0632-raapid-ai/huggingface/runs/o3ilpkls[0m


Step,Training Loss,Validation Loss
100,1.2496,1.232457
200,1.0559,1.07084
300,0.9558,0.939779
400,0.841,0.823374
500,0.7514,0.74606


Training finished.
***** train metrics *****
  total_flos               = 183384203GF
  train_loss               =      1.0093
  train_runtime            =  9:51:36.24
  train_samples_per_second =       0.126
  train_steps_per_second   =       0.016


Fine-tuned LoRA adapters and tokenizer saved to /kaggle/working/mistral_finetuned_lora/final_model_adapters


In [18]:
################################################################################
# 9. OPTIONAL: MERGE ADAPTERS AND SAVE FULL MODEL (for easier deployment)
################################################################################
# This step is optional. It merges the LoRA weights into the base model,
# creating a new model that doesn't require the PEFT library for inference.
#
# NOTE(review): the base model is reloaded with the same bnb_config as for
# training, so with 4-bit quantization enabled the adapters are merged into
# quantized weights and the saved checkpoint stays quantized. Merge into an
# unquantized fp16 base instead if a full-precision artifact is wanted.

# Set only after the merged model and tokenizer have been written completely,
# so later cells can tell a finished merge from a skipped or failed one.
merged_model_path = None

# Only proceed if training happened and model was saved
if len(train_dataset) > 0 and 'final_model_path' in locals():
    print("\nMerging LoRA adapters with the base model...")
    try:
        from peft import PeftModel

        # Reload the base model (important: use same quantization and dtype as training for consistency)
        # Ensure device_map is set for potentially large model loading
        base_model_for_merge = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config, # Use the same bnb_config as during training
            device_map="auto", # Or "cpu" if GPU memory is an issue for merging, then move to GPU later
            trust_remote_code=True,
            torch_dtype=compute_dtype if bnb_config is None else None
        )

        # Load the PEFT model (adapters)
        merged_model = PeftModel.from_pretrained(base_model_for_merge, final_model_path)

        # Merge LoRA weights
        merged_model = merged_model.merge_and_unload()
        print("LoRA adapters merged into the base model.")

        # Save the merged model
        merged_output_dir = os.path.join(output_dir, "final_merged_model")
        merged_model.save_pretrained(merged_output_dir)
        tokenizer.save_pretrained(merged_output_dir)
        merged_model_path = merged_output_dir
        print(f"Full merged model and tokenizer saved to {merged_model_path}")

    except Exception as e:
        # Merging is best-effort, but the full traceback is shown so that the
        # cause (e.g. out of memory) is not hidden behind a one-line message.
        import traceback
        print(f"Error during model merging: {e}")
        traceback.print_exc()
        print("Skipping model merging.")



Merging LoRA adapters with the base model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



LoRA adapters merged into the base model.
Full merged model and tokenizer saved to /kaggle/working/mistral_finetuned_lora/final_merged_model


In [19]:
# ################################################################################
# # 10. EXAMPLE INFERENCE (using the fine-tuned LoRA adapters)
# ################################################################################
# # Only proceed if training happened and model was saved
# if len(train_dataset) > 0 and 'final_model_path' in locals():
#     print("\nRunning example inference with fine-tuned LoRA adapters...")
#     try:
#         # Load the base model again (if not already loaded or if using merged_model above)
#         # For inference with adapters, you need the base model + adapters.
#         # If you ran the merge step, you could load `final_merged_model` directly.
        
#         # Option 1: Load base model and apply adapters (if not using merged model)
#         # Ensure device_map is appropriate for inference
#         inference_base_model = AutoModelForCausalLM.from_pretrained(
#             model_name,
#             quantization_config=bnb_config, # Consistent quantization
#             # device_map="auto",
#             trust_remote_code=True,
#             torch_dtype=compute_dtype if bnb_config is None else None
#         )
#         inference_tokenizer = AutoTokenizer.from_pretrained(model_name) # Or final_model_path
#         if inference_tokenizer.pad_token is None:
#             inference_tokenizer.pad_token = inference_tokenizer.eos_token

#         from peft import PeftModel
#         inference_model = PeftModel.from_pretrained(inference_base_model, final_model_path)
#         inference_model.eval() # Set to evaluation mode
#         print("Loaded base model with LoRA adapters for inference.")

#         # Option 2: Load the merged model (if merge step was successful)
#         # from transformers import AutoModelForCausalLM, AutoTokenizer
#         # merged_model_path_to_load = os.path.join(output_dir, "final_merged_model")
#         # if os.path.exists(merged_model_path_to_load):
#         #     print(f"Loading merged model from {merged_model_path_to_load} for inference...")
#         #     inference_model = AutoModelForCausalLM.from_pretrained(merged_model_path_to_load, device_map="auto", torch_dtype=compute_dtype)
#         #     inference_tokenizer = AutoTokenizer.from_pretrained(merged_model_path_to_load)
#         #     inference_model.eval() # Set to evaluation mode
#         # else:
#         #     print("Merged model not found, using base model + adapters (if available).")


#         # Example prompt
#         df = pd.read_csv(dataset_path)
        
#         test_instruction = df.iloc[0]["instruction"]
#         test_input = df.iloc[0]["input"]

#         if test_input and str(test_input).strip() != "":
#             prompt_text = f"<s>[INST] {test_instruction}\n{test_input} [/INST]"
#         else:
#             prompt_text = f"<s>[INST] {test_instruction} [/INST]"

#         print(f"\nTest Prompt:\n{prompt_text}")

#         # Tokenize and generate
#         # Ensure inputs are on the same device as the model
#         device = inference_model.device
#         inputs = inference_tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=max_seq_length).to(device)
        
#         with torch.no_grad(): # Disable gradient calculations for inference
#             outputs = inference_model.generate(
#                 **inputs,
#                 max_new_tokens=150,  # Adjust as needed
#                 num_return_sequences=1,
#                 do_sample=True,       # For more creative outputs
#                 temperature=0.7,      # Controls randomness (lower is more deterministic)
#                 top_k=50,             # Considers the top K tokens
#                 top_p=0.95,           # Nucleus sampling
#                 pad_token_id=inference_tokenizer.eos_token_id # Important for generation
#             )
        
#         generated_text = inference_tokenizer.decode(outputs[0], skip_special_tokens=True)
        
#         # The output includes the prompt, so we can try to extract just the response
#         response_text = generated_text.split("[/INST]")[-1].strip()

#         print("\nGenerated Response (full):")
#         print(generated_text)
#         print("\nGenerated Response (extracted):")
#         print(response_text)

#     except Exception as e:
#         print(f"Error during example inference: {e}")

# # # Clean up dummy dataset if created
# # if dummy_data_created and os.path.exists("dummy_dataset.csv"):
# #     os.remove("dummy_dataset.csv")
# #     print("Dummy dataset file removed.")

# # print("\nScript finished.")

In [20]:
from huggingface_hub import HfApi, HfFolder, create_repo, upload_folder


# Local folder holding the merged model. It is looked up defensively so that a
# skipped or failed merge produces a clear error instead of a NameError or an
# upload of a missing folder.
merged_model_dir = globals().get("merged_model_path")
if not merged_model_dir or not os.path.isdir(merged_model_dir):
    raise RuntimeError(
        "No merged model found to upload; run the merge step successfully first."
    )

# Define Hugging Face repo name (e.g., username/model-name)
hf_repo_name = "Rockerleo/mistralRAAPID"

# Create the repo on the hub; exist_ok=True makes this a no-op when the repo
# already exists, so the cell can be re-run.
create_repo(hf_repo_name, private=True, exist_ok=True)  # Set private=False to make it public

# Upload the entire model folder to the repo
upload_folder(
    repo_id=hf_repo_name,
    folder_path=merged_model_dir,
    path_in_repo=".",  # Upload all files to root
    commit_message="Upload merged Mistral LoRA model"
)

print(f"Model successfully uploaded to: https://huggingface.co/{hf_repo_name}")


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

Model successfully uploaded to: https://huggingface.co/Rockerleo/mistralRAAPID


In [21]:
# Print a completion marker as the last cell of the notebook.
print("\nScript finished.")


Script finished.
