In [1]:
# 1. Install Unsloth with H100 Support
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet

# 2. Install Core Libraries
!pip install --no-deps xformers trl peft accelerate bitsandbytes psutil pandas pyarrow fastparquet scikit-learn --quiet

# 3. CRITICAL: Python 3.12 Patch
import builtins
import psutil
builtins.psutil = psutil

# 4. Enable H100 Specific Math (TF32 & BF16)
import torch
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print(f"‚úÖ A100 Environment Ready: {torch.cuda.get_device_name(0)}")

‚úÖ A100 Environment Ready: NVIDIA A100-SXM4-40GB


In [2]:
import pandas as pd
import os
import torch
from tqdm import tqdm
from unsloth import FastLanguageModel
from sklearn.metrics import classification_report, confusion_matrix

# ==========================================
# ‚öôÔ∏è A100 CONFIGURATION
# ==========================================
MODEL_PATH = "final_rohit_pathopredict_qwen3"  
BATCH_SIZE = 128    

TEST_FILES = [
    "test_unseen_variant_level.parquet"
]
# ==========================================

# ‚ö° OPTIMIZATION: Enable TF32 Math (A100 Speed Secret)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# 1. Load Model
print(f"üîÑ Loading model from: {MODEL_PATH}...")
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"‚ùå CRITICAL: Folder '{MODEL_PATH}' not found.")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_PATH,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True, 
)

# Enable A100 Native Fast Inference
FastLanguageModel.for_inference(model) 
tokenizer.padding_side = "left" # ‚ö° Faster generation

print("‚úÖ A100 Model Loaded & Optimized!")

# 2. Speed-Optimized Predictor
def get_batch_predictions(prompts, batch_size):
    predictions = []
    
    for i in tqdm(range(0, len(prompts), batch_size), desc="   Processing", unit="batch"):
        batch_prompts = prompts[i : i + batch_size]
        
        inputs = tokenizer(
            batch_prompts, 
            return_tensors="pt", 
            padding=True, 
            truncation=True, 
            max_length=2048
        ).to("cuda")
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs, 
                max_new_tokens=10, 
                use_cache=True, 
                temperature=0.01,
                pad_token_id=tokenizer.pad_token_id
            )
        
        decoded = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        predictions.extend([p.lower().strip() for p in decoded])
        
    return predictions

# 3. Run The Suite
print("\n" + "="*60)
print("üöÄ STARTING A100 ABLATION SUITE (TF32 Enabled)")
print("="*60)

for file_path in TEST_FILES:
    if not os.path.exists(file_path):
        print(f"‚ö†Ô∏è  Skipping: {file_path}")
        continue
        
    dataset_name = os.path.basename(file_path).replace(".parquet", "")
    print(f"\nüìä DATASET: {dataset_name}")
    print("-" * 40)
    
    try:
        df = pd.read_parquet(file_path)
        print(f"   Rows: {len(df):,}")
        
        prompts = [
            row['text'].split("### Response:")[0] + "### Response:\n" 
            for _, row in df.iterrows()
        ]
        
        # üî• RUN INFERENCE
        raw_preds = get_batch_predictions(prompts, batch_size=BATCH_SIZE)
        
        # Scoring
        y_true = []
        y_pred = []
        
        for truth, pred in zip(df['clean_label'], raw_preds):
            t_clean = truth.lower()
            p_clean = pred
            
            true_label = "other"
            pred_label = "other"
            
            if "pathogenic" in t_clean: true_label = "pathogenic"
            elif "benign" in t_clean:   true_label = "benign"
            elif "uncertain" in t_clean: true_label = "uncertain"
                
            if "pathogenic" in p_clean: pred_label = "pathogenic"
            elif "benign" in p_clean:   pred_label = "benign"
            elif "uncertain" in p_clean: pred_label = "uncertain"
            
            y_true.append(true_label)
            y_pred.append(pred_label)

        # Report
        labels = ["pathogenic", "benign", "uncertain"]
        print("\nüîç Classification Report:")
        print(classification_report(y_true, y_pred, labels=labels, zero_division=0))
        
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        print("üìâ Confusion Matrix:")
        print(f"               Pathogenic   Benign   Uncertain")
        print(f"  Pathogenic   {cm[0][0]:<12} {cm[0][1]:<8} {cm[0][2]}")
        print(f"  Benign       {cm[1][0]:<12} {cm[1][1]:<8} {cm[1][2]}")
        print(f"  Uncertain    {cm[2][0]:<12} {cm[2][1]:<8} {cm[2][2]}")
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")

print("\n" + "="*60)
print("üèÅ DONE.")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


ü¶• Unsloth Zoo will now patch everything to make training faster!
üîÑ Loading model from: final_rohit_pathopredict_qwen3...
==((====))==  Unsloth 2025.12.9: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.12.9 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


‚úÖ A100 Model Loaded & Optimized!

üöÄ STARTING A100 ABLATION SUITE (TF32 Enabled)

üìä DATASET: test_unseen_variant_level
----------------------------------------
   Rows: 55,376


  out = torch_matmul(X, W.t(), out = out)
  out = torch_matmul(X, W, out = out)
  out = torch_matmul(X, W.t(), out = out)
  out = torch_matmul(X, W, out = out)
   Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 433/433 [18:00<00:00,  2.49s/batch]



üîç Classification Report:
              precision    recall  f1-score   support

  pathogenic       0.95      0.94      0.94      7071
      benign       0.99      0.99      0.99     48305
   uncertain       0.00      0.00      0.00         0

   micro avg       0.99      0.99      0.99     55376
   macro avg       0.65      0.64      0.65     55376
weighted avg       0.99      0.99      0.99     55376

üìâ Confusion Matrix:
               Pathogenic   Benign   Uncertain
  Pathogenic   6644         427      0
  Benign       365          47940    0
  Uncertain    0            0        0

üèÅ DONE.


In [3]:
# --------------------------------------------------------------------------------
# CELL 2: EXPORT TO GGUF
# --------------------------------------------------------------------------------
print("üì¶ Starting GGUF Export Process...")

quantization_methods = ["q4_k_m", "q8_0"]

for method in quantization_methods:
    save_filename = f"rohit_pathopredict_qwen3_{method}"
    
    print(f"   ‚Ü≥ Converting to {method.upper()} format...")
    try:
        model.save_pretrained_gguf(
            save_filename,
            tokenizer,
            quantization_method = method,
        )
        print(f"   ‚úÖ Exported: {save_filename}.gguf")
        
        if method == "q4_k_m":
            with open(f"{save_filename}/Modelfile", "w") as f:
                f.write(f"FROM ./{save_filename}.gguf\n")
                f.write("TEMPLATE \"{{ .System }}\nUser: {{ .Prompt }}\nAssistant: \"\n")
                f.write("SYSTEM \"You are an expert genetic variant classifier. Classify variants as Pathogenic, Benign, or Uncertain.\"\n")
                f.write("PARAMETER temperature 0.1\n")
                f.write("PARAMETER num_ctx 4096\n")
            print(f"      üìÑ Created Ollama Modelfile at: {save_filename}/Modelfile")
            
    except Exception as e:
        print(f"   ‚ùå Failed to export {method}: {str(e)}")

print("\n" + "="*60)
print("üöÄ PIPELINE FINISHED.")
print("To use in Ollama: cd rohit_pathopredict_qwen3_q4_k_m && ollama create rohit_patho -f Modelfile")
print("="*60)

üì¶ Starting GGUF Export Process...
   ‚Ü≥ Converting to Q4_K_M format...
Unsloth: Merging model weights to 16-bit format...


config.json: 0.00B [00:00, ?B/s]

Found HuggingFace hub cache directory: /teamspace/studios/this_studio/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1/2 [00:05<00:05,  5.32s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:08<00:00,  4.48s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:21<00:00, 10.75s/it]


Unsloth: Merge process complete. Saved to `/teamspace/studios/this_studio/rohit_pathopredict_qwen3_q4_k_m`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: Missing packages: libcurl4-openssl-dev
Unsloth: Will attempt to install missing system packages.
Unsloth: Installing packages: libcurl4-openssl-dev
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting 

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1/2 [00:05<00:05,  5.54s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:09<00:00,  4.56s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:20<00:00, 10.18s/it]


Unsloth: Merge process complete. Saved to `/teamspace/studios/this_studio/rohit_pathopredict_qwen3_q8_0`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: llama.cpp found in the system. Skipping installation.
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into bf16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['qwen3-4b-instruct-2507.BF16.gguf']
Unsloth: [2] Converting GGUF bf16 into q8_0. This might take 10 minutes...
Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['qwen3-4b-instruct-2507.Q8_0.gguf']
Unsloth: example usage for text only L

In [3]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix

# ==========================================
# ‚öôÔ∏è CONFIGURATION
# ==========================================
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"  # Double check this ID! (Qwen3 isn't out publicly yet, assumed 2.5)
# If you have a specific private repo, keep your ID.
TEST_FILE = "test_unseen_variant_level.parquet"
BATCH_SIZE = 16   

print(f"‚¨áÔ∏è Loading RAW baseline model: {MODEL_ID}")

# Load Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

model.eval()
tokenizer.padding_side = "left"

print("‚úÖ Raw model loaded.")

# -----------------------------------------
# Prediction function
# -----------------------------------------
def predict(prompts):
    preds = []
    for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc="Raw model thinking"):
        batch = prompts[i:i+BATCH_SIZE]
        inputs = tokenizer(
            batch, 
            return_tensors="pt", 
            padding=True, 
            truncation=True, 
            max_length=2048
        ).to(model.device)
        
        with torch.no_grad():
            out = model.generate(
                **inputs, 
                max_new_tokens=10, 
                temperature=0.01,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )
        
        decoded = tokenizer.batch_decode(
            out[:, inputs.input_ids.shape[1]:], 
            skip_special_tokens=True
        )
        preds.extend([d.lower().strip() for d in decoded])
        
    return preds

# -----------------------------------------
# Run
# -----------------------------------------
df = pd.read_parquet(TEST_FILE)
print(f"Test size: {len(df)}")

# Check what columns we actually have
print(f"Columns found: {df.columns.tolist()}")

prompts = []
for text in df["text"]:
    # We strip the answer part if it exists in the text column
    input_text = text.split("### Response:")[0] + "\n### Response:\n"
    prompts.append(input_text)

raw_preds = predict(prompts)

# -----------------------------------------
# Scoring (FIXED HERE) üîß
# -----------------------------------------
y_true, y_pred = [], []

# Use 'clean_label' instead of 'label'
target_col = 'clean_label' if 'clean_label' in df.columns else 'label'

for truth, pred in zip(df[target_col], raw_preds):
    t = str(truth).lower()
    p = str(pred).lower()
    
    # Truth mapping
    if "pathogenic" in t: true_val = "pathogenic"
    elif "benign" in t:   true_val = "benign"
    else: continue # Skip uncertains if any crept in

    # Prediction mapping
    if "pathogenic" in p: pred_val = "pathogenic"
    elif "benign" in p:   pred_val = "benign"
    else: pred_val = "benign" # Default to benign if model hallucinates or outputs gibberish

    y_true.append(true_val)
    y_pred.append(pred_val)

print("\nüìä RAW MODEL BASELINE RESULTS")
print(classification_report(y_true, y_pred, zero_division=0))

print("üìâ Confusion Matrix")
print(confusion_matrix(y_true, y_pred, labels=["pathogenic", "benign"]))

‚¨áÔ∏è Loading RAW baseline model: Qwen/Qwen2.5-7B-Instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

‚úÖ Raw model loaded.
Test size: 55376
Columns found: ['text', 'clean_label', 'variant_id']


Raw model thinking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3461/3461 [24:07<00:00,  2.39it/s]



üìä RAW MODEL BASELINE RESULTS
              precision    recall  f1-score   support

      benign       0.87      1.00      0.93     48305
  pathogenic       0.53      0.00      0.01      7071

    accuracy                           0.87     55376
   macro avg       0.70      0.50      0.47     55376
weighted avg       0.83      0.87      0.81     55376

üìâ Confusion Matrix
[[   30  7041]
 [   27 48278]]


In [4]:
import torch
import gc

def free_vram(model=None, tokenizer=None):
    """
    Forcefully clears GPU memory. 
    If you pass 'model' and 'tokenizer', it deletes them first.
    """
    print("üßπ Starting VRAM Cleanup...")
    
    # 1. Delete Python objects if provided
    if model:
        del model
    if tokenizer:
        del tokenizer
        
    # 2. Garbage Collect (Python side)
    gc.collect()
    
    # 3. Clear CUDA Cache (GPU side)
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    
    # 4. Verification
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        print(f"‚úÖ VRAM Cleared.")
        print(f"   - Memory Allocated: {allocated:.2f} GB")
        print(f"   - Memory Reserved:  {reserved:.2f} GB")
    else:
        print("‚ö†Ô∏è CUDA not available.")

# Usage Example:
# free_vram(model, tokenizer) 
# OR just:
# free_vram()

In [1]:
import pandas as pd

file_path = "clinvar_evo2_labeled_HYBRID_200.parquet"

try:
    df = pd.read_parquet(file_path)
    print(f"üìÇ File: {file_path}")
    print(f"üìä Total Rows: {len(df)}")
    
    if len(df) > 0:
        print("\nüßê Column Names:")
        print(df.columns.tolist())
        
        print("\nüè∑Ô∏è Label Counts (pseudo_label):")
        print(df['pseudo_label'].value_counts())
        
        print("\nüëÄ First 5 rows:")
        print(df[['GeneSymbol', 'pseudo_label', 'evo_score']].head())
    else:
        print("‚ö†Ô∏è The file is empty! Evo2 didn't confidently label anything.")
        
except Exception as e:
    print(f"‚ùå Error reading file: {e}")

üìÇ File: clinvar_evo2_labeled_HYBRID_200.parquet
üìä Total Rows: 200

üßê Column Names:
['original_index', 'evo_score', 'pseudo_label', 'fetch_status', '#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID', 'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)', 'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList', 'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession', 'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele', 'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines', 'TestedInGTR', 'OtherIDs', 'SubmitterCategories', 'VariationID', 'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF', 'SomaticClinicalImpact', 'SomaticClinicalImpactLastEvaluated', 'ReviewStatusClinicalImpact', 'Oncogenicity', 'OncogenicityLastEvaluated', 'ReviewStatusOncogenicity', 'SCVsForAggregateGermlineClassification', 'SCVsForAggregateSomaticClinicalImpact', 'SCVsForAggregateOncogenicityClassification']

üè∑Ô∏è Label Counts (ps