In [1]:
import torch
import gc

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


In [2]:
# 1. Install PyTorch with A100 CUDA support
!pip install torch --index-url https://download.pytorch.org/whl/cu121 --quiet

# 2. Install Unsloth & Core Libraries
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet
!pip install --no-deps xformers trl peft accelerate bitsandbytes psutil ipywidgets --quiet
!pip install pandas pyarrow fastparquet --quiet
!pip install ipywidgets widgetsnbextension
# 3. CRITICAL FIX: Patch 'psutil' for Python 3.12
import builtins
import psutil
builtins.psutil = psutil

# 4. Enable A100 Math Acceleration (TF32)
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print("‚úÖ A100 Environment Ready: TF32 Enabled & System Patched.")

‚úÖ A100 Environment Ready: TF32 Enabled & System Patched.


In [3]:
import time
from datetime import datetime

class ProLogger:
    def __init__(self, project_name):
        self.start_time = time.time()
        print(f"\nüöÄ Starting pipeline: {project_name}")
        print(f"   [Timestamp] {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("-" * 60)

    def log_config(self, config_dict):
        print(f"[config] Configuration loaded:")
        for key, value in config_dict.items():
            print(f"   ‚Ä¢ {key}: {value}")
    
    def log_hardware(self):
        gpu_stats = torch.cuda.get_device_properties(0)
        vram = round(gpu_stats.total_memory / 1024**3, 2)
        print(f"[hardware] Detected Device: {gpu_stats.name}")
        print(f"[hardware] VRAM Available:  {vram} GB")
        if vram > 35:
            print(f"[hardware] Status: üü¢ A100 High-Bandwidth Mode Active")
        else:
            print(f"[hardware] Status: üü° Standard Mode")

    def log_step(self, tag, message):
        print(f"[{tag}] {message}")

    def log_success(self, message):
        elapsed = round((time.time() - self.start_time) / 60, 2)
        print(f"‚úÖ {message} (Total Runtime: {elapsed} min)")

# Initialize
logger = ProLogger("Rohit_Pathopredict_A100_Redline")


üöÄ Starting pipeline: Rohit_Pathopredict_A100_Redline
   [Timestamp] 2025-12-24 08:14:16
------------------------------------------------------------


In [4]:
from unsloth import FastLanguageModel
from datasets import load_dataset

# 1. Configuration
config = {
    "model_name": "Qwen/Qwen3-4B-Instruct-2507",
    "max_seq_length": 2048,
    "load_in_4bit": True,
    "lora_rank": 16,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
}

logger.log_config(config)
logger.log_hardware()

# 2. Load Base Model
logger.log_step("model", f"Loading base model: {config['model_name']}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config['model_name'],
    max_seq_length = config['max_seq_length'],
    dtype = None, 
    load_in_4bit = config['load_in_4bit'],
    trust_remote_code = True,
)

# 3. Attach LoRA (With Safety Dropout)
logger.log_step("model", "Injecting LoRA adapters (Dropout=0.05)...")
model = FastLanguageModel.get_peft_model(
    model,
    r = config['lora_rank'],
    target_modules = config['target_modules'],
    lora_alpha = 16,
    lora_dropout = 0.05, # Safety: Prevents over-memorization
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# 4. Load & Slice Data
logger.log_step("data", "Loading full dataset...")
full_dataset = load_dataset("parquet", data_files={"train": "clinvar_llm_train.parquet"})["train"]

# --- ‚úÇÔ∏è THE 70% CUT ---
logger.log_step("data", "Cutting dataset to 70% size...")
full_dataset = full_dataset.shuffle(seed=3407) # Shuffle first to avoid bias
keep_count = int(len(full_dataset) * 0.50)
subset_dataset = full_dataset.select(range(keep_count))

# --- üõ°Ô∏è VALIDATION SPLIT (1%) ---
# We take 1% of the REMAINING 70% to use for Early Stopping tests
split_dataset = subset_dataset.train_test_split(test_size=0.01)

logger.log_step("data", f"Original Size: {len(full_dataset):,}")
logger.log_step("data", f"Training Size (70%): {len(split_dataset['train']):,}")
logger.log_step("data", f"Validation Size (1%): {len(split_dataset['test']):,} (For Early Stopping)")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


ü¶• Unsloth Zoo will now patch everything to make training faster!
[config] Configuration loaded:
   ‚Ä¢ model_name: Qwen/Qwen3-4B-Instruct-2507
   ‚Ä¢ max_seq_length: 2048
   ‚Ä¢ load_in_4bit: True
   ‚Ä¢ lora_rank: 16
   ‚Ä¢ target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
[hardware] Detected Device: NVIDIA A100-SXM4-40GB
[hardware] VRAM Available:  39.49 GB
[hardware] Status: üü¢ A100 High-Bandwidth Mode Active
[model] Loading base model: Qwen/Qwen3-4B-Instruct-2507...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.12.9: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore d

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


[model] Injecting LoRA adapters (Dropout=0.05)...


Unsloth 2025.12.9 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


[data] Loading full dataset...
[data] Cutting dataset to 70% size...
[data] Original Size: 2,217,246
[data] Training Size (70%): 1,097,536
[data] Validation Size (1%): 11,087 (For Early Stopping)


In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
import torch

# Force TF32 for Speed
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

logger.log_step("training", "Configuring Parameters (Batch 176 | Save every 1k)...")

training_args = TrainingArguments(
    output_dir = "rohit_pathopredict_checkpoints",

    # üöÄ BATCH SIZE 160 (Safe from OOM)
    per_device_train_batch_size = 176, 
    gradient_accumulation_steps = 1,
    per_device_eval_batch_size = 64,
    
    # üõ°Ô∏è CHECKPOINT STRATEGY (User Request: Every 1000 steps)
    eval_strategy = "steps",      
    eval_steps = 1000,            # Only check accuracy every 1000 steps
    save_strategy = "steps",      
    save_steps = 1000,            # Only save to disk every 1000 steps
    save_total_limit = 2,         # Keep only the 2 best checkpoints to save space
    
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    
    # OPTIMIZER & PRECISION
    optim = "adamw_8bit",
    bf16 = True,
    
    # MAX DATA LOADING
    dataloader_num_workers = 16,
    dataloader_pin_memory = True,
    
    # SCHEDULER
    learning_rate = 2e-4,
    warmup_steps = 100,
    max_steps = -1,
    num_train_epochs = 1,
    
    # LOGGING
    logging_steps = 10,  # Still shows the progress bar update often
    report_to = "none",
    seed = 3407,
    
    # SPEED OPTIMIZATIONS
    gradient_checkpointing = False,
    group_by_length = True,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = split_dataset["train"],
    eval_dataset = split_dataset["test"], 
    
    dataset_text_field = "text",
    max_seq_length = config['max_seq_length'],
    dataset_num_proc = 12,
    
    # Unsloth packing handling
    packing = False, 
    
    args = training_args,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

logger.log_step("training", "üî• STARTING TRAINING (Checkpoints @ 1000 steps)...")
trainer_stats = trainer.train()

logger.log_success("Training Pipeline Complete.")

[training] Configuring Parameters (Batch 160 | Save every 1k)...


Unsloth: Tokenizing ["text"] (num_proc=34):   0%|          | 0/1097536 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=34):   0%|          | 0/11087 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


[training] üî• STARTING TRAINING (Checkpoints @ 1000 steps)...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,097,536 | Num Epochs = 1 | Total steps = 6,236
O^O/ \_/ \    Batch size per device = 176 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (176 x 1 x 1) = 176
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
1000,0.4096,0.390952
2000,0.3837,0.347993
3000,0.3771,0.332097
4000,0.3723,0.324486
5000,0.3663,0.318403
6000,0.3672,0.314631


‚úÖ Training Pipeline Complete. (Total Runtime: 300.93 min)


In [6]:
output_path = "final_rohit_pathopredict_qwen3"

logger.log_step("io", f"Saving best model to: {output_path}")
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

logger.log_success(f"Model saved successfully. Ready for validation.")

[io] Saving best model to: final_rohit_pathopredict_qwen3
‚úÖ Model saved successfully. Ready for validation. (Total Runtime: 300.95 min)


In [None]:
# --------------------------------------------------------------------------------
# CELL 7: EXPORT TO GGUF (For Ollama / LMStudio)
# --------------------------------------------------------------------------------
logger.log_step("export", "Starting GGUF Export Process...")

# 1. Define Export Methods
# "q4_k_m" = Standard 4-bit (Fast, Low RAM, Good Quality) - Best for Ollama
# "q8_0"   = 8-bit (High Precision, slow) - Best for archiving
quantization_methods = ["q4_k_m", "q8_0"]

for method in quantization_methods:
    save_filename = f"rohit_pathopredict_qwen3_{method}"
    
    logger.log_step("export", f"Converting to {method.upper()} format...")
    try:
        model.save_pretrained_gguf(
            save_filename,
            tokenizer,
            quantization_method = method,
        )
        logger.log_success(f"Exported: {save_filename}.gguf")
        
        # 2. Create Ollama Modelfile (Auto-Generated)
        # This lets you run 'ollama create rohit_model -f Modelfile' instantly
        if method == "q4_k_m":
            with open(f"{save_filename}/Modelfile", "w") as f:
                f.write(f"FROM ./{save_filename}.gguf\n")
                f.write("TEMPLATE \"{{ .System }}\nUser: {{ .Prompt }}\nAssistant: \"\n")
                f.write("SYSTEM \"You are an expert genetic variant classifier. Classify variants as Pathogenic, Benign, or Uncertain.\"\n")
                f.write("PARAMETER temperature 0.1\n")
                f.write("PARAMETER num_ctx 4096\n")
            print(f"   üìÑ Created Ollama Modelfile at: {save_filename}/Modelfile")
            
    except Exception as e:
        print(f"‚ùå Failed to export {method}: {str(e)}")

print("\n" + "="*60)
print("üöÄ ALL SYSTEMS GO! PIPELINE FINISHED.")
print("To use in Ollama: cd rohit_pathopredict_qwen3_q4_k_m && ollama create rohit_patho -f Modelfile")
print("="*60)