In [1]:
# Colab-ready installs. Skip if you already have these packages.
# This may take a few minutes.
!pip install -q "transformers==4.56.2" tokenizers trl==0.22.2 datasets unsloth_zoo unsloth bitsandbytes accelerate scikit-learn pyarrow==19.0.0
# If you see GPU/CUDA related errors, try restarting the runtime after install.

In [2]:
import os
import math
from pathlib import Path
from tqdm import tqdm

import torch
from datasets import load_dataset

from sklearn.metrics import accuracy_score, f1_score, classification_report

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


  from .autonotebook import tqdm as notebook_tqdm


torch: 2.9.0+cu128
cuda available: True
Using device: cuda


In [3]:
# Login to HF if private dataset: run `huggingface-cli login` in a cell or terminal beforehand.
ds = load_dataset("DaniilOr/SemEval-2026-Task13", "A")   # per your note

# ds could be a DatasetDict or a Dataset. Normalize to train/validation splits.
if isinstance(ds, dict) or hasattr(ds, "keys"):
    keys = list(ds.keys())
    print("Dataset keys:", keys)
    if "train" in ds:
        train_ds = ds["train"]
    else:
        # if single split present, use it and create a train/validation split
        if len(keys) == 1:
            train_ds = ds[keys[0]].train_test_split(test_size=0.1, seed=3407)["train"]
            val_ds = ds[keys[0]].train_test_split(test_size=0.1, seed=3407)["test"]
        else:
            train_ds = ds[keys[0]]

    if "validation" in ds:
        val_ds = ds["validation"]
    else:
        # fallback: if user provided single dataset and we didn't already split
        if "val_ds" not in globals():
            val_ds = train_ds.train_test_split(test_size=0.1, seed=3407)["test"]

    if "test" in ds:
        test_ds = ds["test"]
else:
    # single dataset object
    full = ds
    split = full.train_test_split(test_size=0.1, seed=3407)
    train_ds = split["train"]
    val_ds = split["test"]

print("train size:", len(train_ds), "val size:", len(val_ds))
print("Columns:", train_ds.column_names)
# Quick peek
print(train_ds[0])

Dataset keys: ['train', 'validation', 'test']
train size: 500000 val size: 100000
Columns: ['code', 'generator', 'label', 'language']
{'code': "(a, b, c, d) = [int(x) for x in input().split()]\nk = input()\n(p, q, r, s) = (0, 0, 0, 0)\nfor i in k:\n\tif i == '1':\n\t\tp += 1\n\telif i == '2':\n\t\tq += 1\n\telif i == '3':\n\t\tr += 1\n\telif i == '4':\n\t\ts += 1\nprint(a * p + b * q + c * r + d * s)\n", 'generator': 'human', 'label': 0, 'language': 'Python'}


In [4]:
from unsloth import FastLanguageModel

# Choose model ‚Äî 4-bit prequantized recommended on Colab for memory reasons:
MODEL_NAME = "unsloth/gpt-oss-20b"  # change if you prefer another
MAX_SEQ_LENGTH = 1024
LOAD_IN_4BIT = True  # set False if you want full precision (requires much more VRAM)

print("Loading model/tokenizer (this downloads weights)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    dtype = None,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit = LOAD_IN_4BIT,
    full_finetuning = False,  # we will use PEFT/LoRA below
    offload_embedding = True,
)
print("Model + tokenizer loaded.")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
Loading model/tokenizer (this downloads weights)...
==((====))==  Unsloth 2025.11.4: Fast Gpt_Oss patching. Transformers: 4.56.2.
   \\   /|    NVIDIA RTX 5000 Ada Generation. Num GPUs = 1. Max memory: 31.6 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Fetching 4 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:43<00:00, 10.94s/it]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:08<00:00,  2.18s/it]


Unsloth: Offloading embeddings to RAM to save 1.08 GB.
Model + tokenizer loaded.


In [5]:
def build_texts(batch):
    codes = batch.get("code", [])
    labels = batch.get("label", [])
    langs = batch.get("language", [])
    out_texts = []
    for i, code in enumerate(codes):
        label = labels[i] if i < len(labels) else 0
        lang = langs[i] if i < len(langs) else "Unknown"
        prompt = f"""
You are a code origin classifier. Your task is to determine whether the following code was written by a human or generated by a machine.

Output rule:
- Respond with EXACTLY ONE WORD: "human" or "machine".
- Do not include any explanations, punctuation, or extra text.

Guidelines for reasoning:
1. Naming: Human code shows mixed styles or domain-specific names. Machine code uses consistent, verbose, or generic names.
2. Comments: Human comments are sparse or informal. Machine comments are detailed, redundant, or uniformly formatted.
3. Structure: Human code may have shortcuts or irregularities. Machine code is overly structured or templated.
4. Formatting: Human code has minor inconsistencies. Machine code follows perfect style rules.
5. Logic: Human logic is pragmatic and iterative. Machine logic is overly complete or formal.

Language: {lang}

Code:
```{lang.lower()}
{code}
Output your decision (human or machine):
"""

        assistant_reply = "machine" if int(label) == 1 else "human"
        convo = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": assistant_reply},
        ]
        text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        out_texts.append(text)
    return {"text": out_texts}

# Apply mapping (batched). This will add a `text` column that SFTTrainer expects.
train_ds = train_ds.map(build_texts, batched=True, batch_size=128)
val_ds = val_ds.map(build_texts, batched=True, batch_size=128)

print("Formatting done. Example formatted text (train):")
print(train_ds[0]["text"][:1000].replace("\n", "\\n")[:1000])

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500000/500000 [00:30<00:00, 16424.65 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100000/100000 [00:06<00:00, 15800.66 examples/s]

Formatting done. Example formatted text (train):
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-11-30\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.\nCalls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>\nYou are a code origin classifier. Your task is to determine whether the following code was written by a human or generated by a machine.\n\nOutput rule:\n- Respond with EXACTLY ONE WORD: "human" or "machine".\n- Do not include any explanations, punctuation, or extra text.\n\nGuidelines for reasoning:\n1. Naming: Human code shows mixed styles or domain-specific names. Machine code uses consistent, verbose, or generic names.\n2. Comments: Human comments are sparse or informal. Machine comments are detailed, redundant, or uniformly formatted.\n3. Structure: Human code may have shortcuts or ir




In [6]:
print("\n" + "="*80)
print("CHECKING CHAT TEMPLATE FORMAT")
print("="*80)
example_text = train_ds[0]["text"]
print("First 1500 characters of formatted example:")
print(example_text[:6000])
print("\n" + "="*80)
print("IMPORTANT: Look at the output above and identify the EXACT markers")
print("for where the assistant response starts. Update gpt_oss_kwargs below.")
print("="*80 + "\n")


CHECKING CHAT TEMPLATE FORMAT
First 1500 characters of formatted example:
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-11-30

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>
You are a code origin classifier. Your task is to determine whether the following code was written by a human or generated by a machine.

Output rule:
- Respond with EXACTLY ONE WORD: "human" or "machine".
- Do not include any explanations, punctuation, or extra text.

Guidelines for reasoning:
1. Naming: Human code shows mixed styles or domain-specific names. Machine code uses consistent, verbose, or generic names.
2. Comments: Human comments are sparse or informal. Machine comments are detailed, redundant, or uniformly formatted.
3. Structure: Human code may have shortc

In [7]:
# Attach PEFT/LoRA adapters to the loaded model (only a small fraction of params will be trained)
model = FastLanguageModel.get_peft_model(
    model,
    r = 12,  # try 8-32 depending on capacity
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0.0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

print("PEFT/LoRA adapters attached.")


Unsloth: Making `model.base_model.model.model` require gradients
PEFT/LoRA adapters attached.


In [8]:
from trl import SFTConfig, SFTTrainer
from unsloth.chat_templates import train_on_responses_only

sft_args = SFTConfig(
    per_device_train_batch_size = 1,   # lower for Colab; increase for bigger GPUs
    gradient_accumulation_steps = 4,
    warmup_steps = 4,
    max_steps = 100,
    # num_train_epochs = 1,              # small demo run; set higher or use num_train_epochs
    learning_rate = 2e-4,
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds,
    args = sft_args,
)

# Use the correct assistant marker that appears in your formatted text
gpt_oss_kwargs = dict(
    instruction_part = "<|start|>user<|message|>",
    response_part    = "<|start|>assistant<|message|>",   # <-- corrected
)

trainer = train_on_responses_only(trainer, **gpt_oss_kwargs)


print("\n" + "="*80)
print("MASKING VERIFICATION")
print("="*80)

def verify_masking(trainer, indices=[0, 10, min(100, len(trainer.train_dataset)-1)]):
    """Verify that only assistant responses are being trained on."""
    for idx in indices:
        if idx >= len(trainer.train_dataset):
            continue

        print(f"\n--- Example {idx} ---")

        # Show full input
        input_ids = trainer.train_dataset[idx]["input_ids"]
        full_text = tokenizer.decode(input_ids)
        print("Full conversation (last 300 chars):")
        print(full_text[-300:])

        # Show what's being trained on (should be ONLY "human" or "machine")
        labels = trainer.train_dataset[idx]["labels"]
        trained_tokens = [
            tokenizer.pad_token_id if x == -100 else x
            for x in labels
        ]
        trained_text = tokenizer.decode(trained_tokens).replace(
            tokenizer.pad_token, " "
        ).strip()

        print("\nWhat model is trained on (should be ONLY 'human' or 'machine'):")
        print(f"'{trained_text}'")

        non_masked_count = sum(1 for x in labels if x != -100)
        total_count = len(labels)
        print(f"Non-masked tokens: {non_masked_count}/{total_count}")

        # CRITICAL CHECK: Should be 1-5 tokens max (just "human" or "machine")
        if non_masked_count > 10:
            print("‚ö†Ô∏è  WARNING: Too many non-masked tokens! Masking may not be working.")
            print("   Expected: 1-5 tokens (just the answer)")
            print(f"   Got: {non_masked_count} tokens")
        else:
            print("‚úì Masking looks correct!")

verify_masking(trainer)

Unsloth: Tokenizing ["text"] (num_proc=64): 100%|‚ñà| 500000/500000 [02:03<00:00, 4057.53 examples/s
Map (num_proc=64): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500000/500000 [00:28<00:00, 17725.66 examples/s]



MASKING VERIFICATION

--- Example 0 ---
Full conversation (last 300 chars):
in input().split()]
k = input()
(p, q, r, s) = (0, 0, 0, 0)
for i in k:
	if i == '1':
		p += 1
	elif i == '2':
		q += 1
	elif i == '3':
		r += 1
	elif i == '4':
		s += 1
print(a * p + b * q + c * r + d * s)

Output your decision (human or machine):
<|end|><|start|>assistant<|message|>human<|return|>

What model is trained on (should be ONLY 'human' or 'machine'):
'human<|return|>'
Non-masked tokens: 2/386
‚úì Masking looks correct!

--- Example 10 ---
Full conversation (last 300 chars):
thon

Code:
```python
a = sorted(list(map(int, input().split())), reverse=True)
f1 = a[0] + a[3]
f2 = a[1] + a[2]
if f1 == f2:
	print('YES')
elif a[0] == a[1] + a[2] + a[3]:
	print('YES')
else:
	print('NO')

Output your decision (human or machine):
<|end|><|start|>assistant<|message|>human<|return|>

What model is trained on (should be ONLY 'human' or 'machine'):
'human<|return|>'
Non-masked tokens: 2/351
‚úì Masking looks corr

In [9]:
def count_examples_with_labels(ds, sample_n=2000):
    # checks first sample_n examples (or entire ds if smaller)
    N = min(len(ds), sample_n)
    has = 0
    for i in range(N):
        lbls = ds[i]['labels']
        if any([x != -100 for x in lbls]):
            has += 1
    return has, N

has, total = count_examples_with_labels(trainer.train_dataset, sample_n=2000)
print(f"{has}/{total} examples have at least one non- -100 label.")

1863/2000 examples have at least one non- -100 label.


In [10]:
print("Starting training...")
trainer_stats = trainer.train()
print("Training finished.")
print(trainer_stats)


Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500,000 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 5,971,968 of 20,920,729,152 (0.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,5.6512
20,0.3345
30,0.3555
40,0.1468
50,0.127
60,0.0016
70,0.0312
80,0.1563
90,0.1262
100,0.3703


Training finished.
TrainOutput(global_step=100, training_loss=0.7300447638612241, metrics={'train_runtime': 704.623, 'train_samples_per_second': 0.568, 'train_steps_per_second': 0.142, 'total_flos': 2.496414197116915e+16, 'train_loss': 0.7300447638612241, 'epoch': 0.0008})


In [11]:
SAVE_DIR = "finetuned_gptoss_lora_2"
print("Saving adapters to", SAVE_DIR)
model.save_pretrained(SAVE_DIR)
print("Saved.")
# Optionally push to hub: model.push_to_hub("hf_username/repo", token="hf_...")


Saving adapters to finetuned_gptoss_lora_2
Saved.


In [12]:
!zip -r /content/finetuned_gptoss_lora_2.zip /content/finetuned_gptoss_lora_2


zip error: Nothing to do! (try: zip -r /content/finetuned_gptoss_lora_2.zip . -i /content/finetuned_gptoss_lora_2)


In [13]:
from google.colab import files

files.download('finetuned_gptoss_lora_2.zip')

ModuleNotFoundError: No module named 'google.colab'

In [32]:
# Paste and run this entire cell in your notebook (semeval-oss kernel).
# It will inspect the LoRA folder, fix paths, load base model (Unsloth) if needed,
# and attempt to apply the LoRA weights with PEFT.

import os, glob, sys, traceback
from pathlib import Path

# Choose GPU (set before heavy imports if you want to pin)
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")  # pick a free GPU id

# Adjust these if your repo layout differs
SCRIPT_DIR = Path.cwd()  # current working dir where notebook runs
# Candidate relative path you mentioned
candidate = SCRIPT_DIR / "finetuned_gptoss_lora_2"
# Also check likely absolute path inferred from prior output
alt = Path.home() / "MS_projects" / "DL_P" / "SemEval-2026-Task13" / "scripts" / "finetuned_gptoss_lora_2"

print("Notebook cwd:", SCRIPT_DIR)
print("Candidate local path:", candidate)
print("Alt path:", alt)
print()

# Try to find the correct folder automatically (fix accidental duplicate segments)
possible = []
# add explicit candidates
possible.append(candidate)
possible.append(alt)
# also search for name anywhere beneath project dir (fast)
for p in [Path.home() / "MS_projects", SCRIPT_DIR]:
    if p.exists():
        for found in p.rglob("finetuned_gptoss_lora_2"):
            possible.append(found)

# canonicalize and remove duplicates
possible = list(dict.fromkeys([p.resolve() for p in possible if p.exists()]))
if not possible:
    print("ERROR: Could not find any 'finetuned_gptoss_lora_2' folder under home/MS_projects or cwd.")
    print("Run `!ls -la` where your notebook is located and paste the listing here.")
    raise SystemExit

print("Found candidate LoRA folders (choose first):")
for i,p in enumerate(possible):
    print(f"  [{i}] {p}")
LORA_DIR = possible[0]
print("\nUsing LORA_DIR =", LORA_DIR)
print("\nFiles in LORA_DIR:")
!ls -la "{LORA_DIR}"

# ---------------------------
# Now try to load model + apply LoRA
# ---------------------------
print("\n--- Loading base model (Unsloth) ---")
try:
    # Import Unsloth loader (fast path). If it's already loaded in the session,
    # this will reuse it; otherwise it will import now.
    from unsloth import FastLanguageModel
    print("Imported unsloth.FastLanguageModel")
except Exception as e:
    print("Could not import unsloth:", e)
    traceback.print_exc()
    raise SystemExit("Please install unsloth in the semeval-oss env or use HF AutoModel path.")

# Attempt Unsloth loader (reduced seq length to save memory)
try:
    model_wrapper, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/gpt-oss-20b",
        max_seq_length = 512,
        load_in_4bit = True,
        full_finetuning = False,
        offload_embedding = True,
    )
    print("Unsloth FastLanguageModel loaded successfully (model_wrapper).")
except Exception as e:
    print("Unsloth loader failed (see traceback); trying HF AutoModel fallback.")
    traceback.print_exc()
    # fallback: try HF AutoModel below
    model_wrapper = None

# Helper to attempt applying LoRA via PEFT on various targets
def try_apply_peft_on(target_model, lora_path):
    """
    Try PeftModel.from_pretrained on target_model
    """
    try:
        from peft import PeftModel
        print("Attempting PeftModel.from_pretrained on target_model:", type(target_model).__name__)
        peft_model = PeftModel.from_pretrained(target_model, str(lora_path), device_map="auto", trust_remote_code=True)
        print("PeftModel.from_pretrained succeeded on", type(target_model).__name__)
        return peft_model
    except Exception as e:
        print("PeftModel.from_pretrained failed on", type(target_model).__name__)
        traceback.print_exc()
        return None

# If Unsloth wrapper loaded, try to apply LoRA using a few heuristics:
peft_model = None
if model_wrapper is not None:
    # Unsloth's wrapper likely exposes a HF model in an attribute; check common names
    candidate_attrs = ["model", "hf_model", "transformer", "base_model", "model_obj"]
    for attr in candidate_attrs:
        if hasattr(model_wrapper, attr):
            print("Found attribute on wrapper:", attr)
            target = getattr(model_wrapper, attr)
            peft_model = try_apply_peft_on(target, LORA_DIR)
            if peft_model is not None:
                break

    # Also try the wrapper object itself
    if peft_model is None:
        print("Trying PeftModel.from_pretrained on the wrapper object itself...")
        peft_model = try_apply_peft_on(model_wrapper, LORA_DIR)

# If still not applied, try pure HF flow (AutoModel + Peft)
if peft_model is None:
    print("\n--- Attempting HF AutoModel + PeftModel.from_pretrained (local LoRA) ---")
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
        tokenizer = AutoTokenizer.from_pretrained("unsloth/gpt-oss-20b", trust_remote_code=True)
        # IMPORTANT: We do not pass quantization_config here (it caused issues earlier).
        # If this raises memory/quantizer errors, we'll print the error and stop.
        base_model = AutoModelForCausalLM.from_pretrained(
            "unsloth/gpt-oss-20b",
            device_map="auto",
            trust_remote_code=True,
            local_files_only=False,
        )
        print("HF base model loaded (may use lots of VRAM). Now applying LoRA via PEFT...")
        from peft import PeftModel
        peft_model = PeftModel.from_pretrained(base_model, str(LORA_DIR), device_map="auto", trust_remote_code=True)
        print("PEFT applied to HF base model successfully.")
    except Exception as e:
        print("HF AutoModel or PeftModel.from_pretrained failed. See traceback below.")
        traceback.print_exc()

# Final status
if peft_model is not None:
    print("\nSUCCESS: LoRA weights applied. You can now use `peft_model` to generate or continue training.")
    # small test: generate a short sample if generate exists
    try:
        text = "Hello, this is a quick generation test."
        input_ids = tokenizer(text, return_tensors="pt").input_ids.to(peft_model.device)
        out = peft_model.generate(input_ids, max_new_tokens=32)
        print("Sample generation:", tokenizer.decode(out[0], skip_special_tokens=True))
    except Exception as e:
        print("Generation test failed (that's okay). Error:")
        traceback.print_exc()
else:
    print("\nFAILED to apply LoRA automatically.")
    print("Please paste the output above (the directory listing and the last traceback).")
    print("\nIf PeftModel.from_pretrained raised an error mentioning 'Repo id must be' then use this local-folder-only call instead:")
    print("  from peft import PeftModel")
    print("  peft_model = PeftModel.from_pretrained(base_model, '<ABSOLUTE_PATH_TO_FINETUNED_FOLDER>', device_map='auto', trust_remote_code=True)")
    print("Make sure you use the absolute path printed at the top of this cell (LORA_DIR).")



Notebook cwd: /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts
Candidate local path: /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2
Alt path: /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2

Found candidate LoRA folders (choose first):
  [0] /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2
  [1] /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/outputs/finetuned_gptoss_lora_2

Using LORA_DIR = /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2

Files in LORA_DIR:
total 23421
drwxr-xr-x 2 ramnarayan.ramniwas students        5 Nov 30 15:55 .
drwxr-xr-x 6 ramnarayan.ramniwas students       12 Nov 30 16:20 ..
-rw-r--r-- 1 ramnarayan.ramniwas students     1208 Nov 30 15:55 adapter_config.json
-rw-r--r-- 1 ramnarayan.ramniwas students 23913528 Nov 30 15:55 adapter_mod

Traceback (most recent call last):
  File "/tmp/slurm-ramnarayan.ramniwas-102588/ipykernel_2600554/1307047158.py", line 65, in <module>
    model_wrapper, tokenizer = FastLanguageModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/loader.py", line 485, in from_pretrained
    return FastModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/loader.py", line 1143, in from_pretrained
    model, tokenizer = FastBaseModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/vision.py", line 657, in from_pretrained
    model = auto_model.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
  File "/home/ramnarayan.

HF AutoModel or PeftModel.from_pretrained failed. See traceback below.

FAILED to apply LoRA automatically.
Please paste the output above (the directory listing and the last traceback).

If PeftModel.from_pretrained raised an error mentioning 'Repo id must be' then use this local-folder-only call instead:
  from peft import PeftModel
  peft_model = PeftModel.from_pretrained(base_model, '<ABSOLUTE_PATH_TO_FINETUNED_FOLDER>', device_map='auto', trust_remote_code=True)
Make sure you use the absolute path printed at the top of this cell (LORA_DIR).


Traceback (most recent call last):
  File "/tmp/slurm-ramnarayan.ramniwas-102588/ipykernel_2600554/1307047158.py", line 121, in <module>
    base_model = AutoModelForCausalLM.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/modeling_utils.py", line 288, in _wrapper
    return func(*args, **kwargs)
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/modeling_utils.py", line 5179, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/modeling_utils.py", line 5496, in _load_pretrained_model
    model._initialize_missing_keys(checkpoint_keys, ignore_mismatched_sizes, is

In [31]:
!ls -la DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2
!head -n 200 DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2/config.json


ls: cannot access 'DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2': No such file or directory
head: cannot open 'DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2/config.json' for reading: No such file or directory


In [33]:
import os, pathlib, sys
# pick a free GPU (you used 2 earlier)
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# set these to your exact absolute paths (the script discovered these for you)
NOTEBOOK_CWD = "/home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts"
LORA_DIR = "/home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2"

# sanity checks
print("cwd:", NOTEBOOK_CWD)
print("exists LORA_DIR:", os.path.exists(LORA_DIR))
print("files:", list(pathlib.Path(LORA_DIR).iterdir())[:10])


cwd: /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts
exists LORA_DIR: True
files: [PosixPath('/home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2/adapter_config.json'), PosixPath('/home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2/adapter_model.safetensors'), PosixPath('/home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2/README.md')]


In [43]:
# Paste this whole cell into your notebook (run in the semeval-oss kernel).
import os, sys, traceback, textwrap

# === CONFIG ===
os.environ["CUDA_VISIBLE_DEVICES"] = "2"   # change if you want a different GPU
MAX_SEQ_LENGTH = 128                       # try 128 (lower to 64 or 32 if OOM)
BASE = "unsloth/gpt-oss-20b"
LORA_DIR = os.path.abspath("finetuned_gptoss_lora_2")  # must be absolute path
print("Notebook cwd:", os.getcwd())
print("Using LORA_DIR:", LORA_DIR)
print("Pinning visible GPU:", os.environ.get("CUDA_VISIBLE_DEVICES"))

# === helper: monkeypatch methods PEFT expects when missing ===
def ensure_prepare_and_generate(hf_model):
    """
    PEFT expects base_model.prepare_inputs_for_generation (and sometimes
    .get_encoder and certain generate helpers). Add minimal, safe implementations
    when missing so LoRA can be attached and generation works.
    """
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # minimal mapping: PEFT only needs input dict shape
        outputs = {"input_ids": input_ids}
        if "attention_mask" in kwargs:
            outputs["attention_mask"] = kwargs["attention_mask"]
        if "past_key_values" in kwargs:
            outputs["past_key_values"] = kwargs["past_key_values"]
        return outputs

    def _greedy_generate_wrapper(self, *args, **kwargs):
        # If hf_model already has generate, call it. Otherwise raise helpful error.
        if hasattr(hf_model, "generate"):
            return hf_model.generate(*args, **kwargs)
        raise AttributeError("Base HF model has no generate()")

    # attach to model and to model.model if wrapper exists
    try:
        if not hasattr(hf_model, "prepare_inputs_for_generation"):
            hf_model.prepare_inputs_for_generation = prepare_inputs_for_generation.__get__(hf_model, hf_model.__class__)
        # also attach to nested .model if present (Unsloth wrapper often has .model)
        if hasattr(hf_model, "model") and not hasattr(hf_model.model, "prepare_inputs_for_generation"):
            hf_model.model.prepare_inputs_for_generation = prepare_inputs_for_generation.__get__(hf_model.model, hf_model.model.__class__)
    except Exception:
        pass

# === main try-flow ===
try:
    # import after CUDA pin to ensure device pinning works
    import torch
    print("torch:", torch.__version__, "cuda available:", torch.cuda.is_available(), "device_count:", torch.cuda.device_count())
    from unsloth import FastLanguageModel
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from peft import PeftModel

    # 1) Try Unsloth fast loader (preferred)
    print("\n=== Attempt 1: Unsloth.FastLanguageModel.from_pretrained() ===")
    try:
        wrapper, tokenizer = FastLanguageModel.from_pretrained(
            model_name = BASE,
            max_seq_length = MAX_SEQ_LENGTH,
            load_in_4bit = True,        # unsloth-friendly; it may ignore deprecated args internally
            offload_embedding = True,
            full_finetuning = False,
            trust_remote_code = True,
        )
        print("Unsloth wrapper loaded (type):", type(wrapper))
        # Under Unsloth, the real HF model is usually wrapper.model or wrapper._model
        hf_model = None
        if hasattr(wrapper, "model"):
            hf_model = wrapper.model
        elif hasattr(wrapper, "_model"):
            hf_model = wrapper._model
        else:
            # maybe wrapper is itself the HF model
            hf_model = wrapper

        print("HF model found at:", type(hf_model))
        ensure_prepare_and_generate(hf_model)

        # 2) Try apply PEFT LoRA from local folder
        print("\nApplying LoRA from:", LORA_DIR)
        try:
            peft_model = PeftModel.from_pretrained(hf_model, LORA_DIR, device_map="auto", trust_remote_code=True)
            print("‚úÖ PEFT LoRA applied successfully to Unsloth/HF model.")
            model = peft_model
            tok = tokenizer
        except Exception as e_peft:
            print("‚ùó PEFT.from_pretrained failed on the Unsloth-wrapped HF model.")
            print("Exception:")
            traceback.print_exception(type(e_peft), e_peft, e_peft.__traceback__)
            # Try using PeftModel.from_pretrained by passing HF model as base and absolute LORA path
            try:
                print("\nTrying fallback PeftModel.from_pretrained with explicit hf_model argument...")
                peft_model = PeftModel.from_pretrained(hf_model, str(LORA_DIR), device_map="auto", trust_remote_code=True)
                print("‚úÖ Fallback PEFT applied.")
                model = peft_model
                tok = tokenizer
            except Exception as e2:
                print("Fallback PEFT also failed.")
                traceback.print_exception(type(e2), e2, e2.__traceback__)
                raise RuntimeError("PEFT application failed; see above tracebacks.")

    except Exception as unsloth_err:
        print("Unsloth loader failed ‚Äî will try HF AutoModel fallback.")
        traceback.print_exception(type(unsloth_err), unsloth_err, unsloth_err.__traceback__)

        # HF fallback: load tokenizer + base model with safe offload device_map options
        print("\n=== Attempt 2: HF AutoModelForCausalLM.from_pretrained (fallback) ===")
        try:
            tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
            # Build load kwargs conservatively. We use low_cpu_mem_usage and device_map='auto'.
            hf_load_kwargs = {
                "trust_remote_code": True,
                "device_map": "auto",
                "low_cpu_mem_usage": True,
            }
            # Try with llm_int8_enable_fp32_cpu_offload if the model accepts it (some HF builds don't).
            try:
                hf_load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
            except Exception:
                pass

            print("Calling AutoModelForCausalLM.from_pretrained(...) with kwargs:", hf_load_kwargs)
            hf_model = AutoModelForCausalLM.from_pretrained(BASE, **hf_load_kwargs)
            print("HF base model loaded:", type(hf_model))
            ensure_prepare_and_generate(hf_model)

            # apply local LoRA
            print("Applying LoRA from:", LORA_DIR)
            peft_model = PeftModel.from_pretrained(hf_model, str(LORA_DIR), device_map="auto", trust_remote_code=True)
            print("‚úÖ PEFT LoRA applied to HF model fallback.")
            model = peft_model
            tok = tokenizer

        except Exception as hf_err:
            print("HF fallback load/apply failed. Full traceback below:")
            traceback.print_exception(type(hf_err), hf_err, hf_err.__traceback__)
            raise RuntimeError("All load attempts failed; see tracebacks above.")

    # If we reached here, model + tokenizer should be set
    print("\n=== SUCCESS summary ===")
    print("Model object type:", type(model))
    print("Tokenizer type:", type(tok))
    print("You can now run a quick test generation. Example:")
    try:
        # small smoke test generation (short prompt)
        prompt = "Hello, this is a quick test."
        input_ids = tok(prompt, return_tensors="pt").input_ids.to(torch.device("cuda:0"))
        # make sure model is on cuda if device_map is 'auto' it should be on GPU
        out = model.generate(input_ids, max_new_tokens=32)
        print("Generated (first 200 chars):", tok.decode(out[0], skip_special_tokens=True)[:200])
    except Exception as gen_err:
        print("Generation test failed (but model/LoRA might still be attached).")
        traceback.print_exception(type(gen_err), gen_err, gen_err.__traceback__)

except Exception as final_err:
    print("\n=== FINAL FAILURE ===")
    traceback.print_exception(type(final_err), final_err, final_err.__traceback__)
    print("\nIf this fails, copy-paste the last full traceback here. If you see messages about:")
    print(" - 'Some modules are dispatched on the CPU or the disk' => try lowering MAX_SEQ_LENGTH to 64 or 32,")
    print(" - 'GptOssTopKRouter has no attribute weight' => this is an HF model init issue from trust_remote_code; try running the notebook inside an interactive srun/tmux session (shell) instead of the notebook UI),")
    print(" - PEFT errors about prepare_inputs_for_generation => the cell tries to monkeypatch but may need more methods; paste the traceback.")


Notebook cwd: /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts
Using LORA_DIR: /home/ramnarayan.ramniwas/MS_projects/DL_P/SemEval-2026-Task13/scripts/finetuned_gptoss_lora_2
Pinning visible GPU: 2
torch: 2.9.0+cu128 cuda available: True device_count: 1

=== Attempt 1: Unsloth.FastLanguageModel.from_pretrained() ===
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.11.4: Fast Gpt_Oss patching. Transformers: 4.56.2.
   \\   /|    NVIDIA RTX 5000 Ada Generation. Num GPUs = 1. Max memory: 31.6 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth loader failed ‚Äî will try HF AutoModel fallback.

=== Attempt 2: HF AutoModelForCausalLM.from_pretrained (fallback) ===


Traceback (most recent call last):
  File "/tmp/slurm-ramnarayan.ramniwas-102588/ipykernel_2600554/1388823000.py", line 57, in <module>
    wrapper, tokenizer = FastLanguageModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/loader.py", line 485, in from_pretrained
    return FastModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/loader.py", line 1143, in from_pretrained
    model, tokenizer = FastBaseModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/vision.py", line 657, in from_pretrained
    model = auto_model.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
  File "/home/ramnarayan.ramniw

Calling AutoModelForCausalLM.from_pretrained(...) with kwargs: {'trust_remote_code': True, 'device_map': 'auto', 'low_cpu_mem_usage': True, 'llm_int8_enable_fp32_cpu_offload': True}
HF fallback load/apply failed. Full traceback below:

=== FINAL FAILURE ===

If this fails, copy-paste the last full traceback here. If you see messages about:
 - 'Some modules are dispatched on the CPU or the disk' => try lowering MAX_SEQ_LENGTH to 64 or 32,
 - 'GptOssTopKRouter has no attribute weight' => this is an HF model init issue from trust_remote_code; try running the notebook inside an interactive srun/tmux session (shell) instead of the notebook UI),
 - PEFT errors about prepare_inputs_for_generation => the cell tries to monkeypatch but may need more methods; paste the traceback.


Traceback (most recent call last):
  File "/tmp/slurm-ramnarayan.ramniwas-102588/ipykernel_2600554/1388823000.py", line 57, in <module>
    wrapper, tokenizer = FastLanguageModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/loader.py", line 485, in from_pretrained
    return FastModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/loader.py", line 1143, in from_pretrained
    model, tokenizer = FastBaseModel.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/unsloth/models/vision.py", line 657, in from_pretrained
    model = auto_model.from_pretrained(
  File "/home/ramnarayan.ramniwas/anaconda3/envs/semeval-oss/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
  File "/home/ramnarayan.ramniw

### Evaluation:

In [37]:
!unzip -q /content/outputs.zip -d /content/outputs

unzip:  cannot find or open /content/outputs.zip, /content/outputs.zip.zip or /content/outputs.zip.ZIP.


In [None]:
!unzip -q /content/finetuned_gptoss_lora_2.zip -d /

In [None]:
!zip -r /content/outputs.zip /content/outputs

In [None]:
!pip install -q "transformers==4.56.2" tokenizers datasets unsloth_zoo unsloth bitsandbytes accelerate scikit-learn pyarrow==19.0.0

import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np
from tqdm import tqdm

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

In [None]:
BASE_MODEL = "unsloth/gpt-oss-20b"  # same as training
ADAPTER_PATH = "finetuned_gptoss_lora_2"  # path to your saved adapters
MAX_SEQ_LENGTH = 1024
LOAD_IN_4BIT = True

print("Loading base model + adapters...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = ADAPTER_PATH,  # this loads base model + adapters
    dtype = None,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit = LOAD_IN_4BIT,
)

# Enable inference mode for faster generation
FastLanguageModel.for_inference(model)
print("Model loaded and ready for inference.")

In [None]:
ds = load_dataset("DaniilOr/SemEval-2026-Task13", "A")

# Get validation split (same logic as training)
if isinstance(ds, dict) or hasattr(ds, "keys"):
    keys = list(ds.keys())
    print("Dataset keys:", keys)
    # if "validation" in ds:
    #     val_ds = ds["validation"]
    if "test" in ds:
        test_ds = ds["test"]
else:
    val_ds = ds.train_test_split(test_size=0.1, seed=3407)["test"]

print(f"Test set size: {len(test_ds)}")

In [None]:
test_ds

In [None]:
MAX_CODE_CHARS = 5000

def _crop_head_tail(text: str, max_chars: int) -> str:
    if len(text) <= max_chars:
        return text
    half = max_chars // 2
    return text[:half] + "\n... [truncated] ...\n" + text[-half:]

def _build_prompt(lang: str, code: str) -> str:
    # Use the SAME prompt structure as training
    prompt = f"""
            You are a code origin classifier. Your task is to determine whether the following code was written by a human or generated by a machine.

            Output rule:
            - Respond with EXACTLY ONE WORD: "human" or "machine".
            - Do not include any explanations, punctuation, or extra text.

            Guidelines for reasoning:
            1. Naming: Human code shows mixed styles or domain-specific names. Machine code uses consistent, verbose, or generic names.
            2. Comments: Human comments are sparse or informal. Machine comments are detailed, redundant, or uniformly formatted.
            3. Structure: Human code may have shortcuts or irregularities. Machine code is overly structured or templated.
            4. Formatting: Human code has minor inconsistencies. Machine code follows perfect style rules.
            5. Logic: Human logic is pragmatic and iterative. Machine logic is overly complete or formal.

            Language: {lang}

            Code:
            ```{lang.lower()}
            {code}
            Output your decision (human or machine):
            """
    return prompt

def predict_single(code, language="Unknown"):
    """
    Predict whether code is human or machine-generated.
    Returns: "human" or "machine"
    """
    # Crop code if too long
    code_snippet = _crop_head_tail(code, MAX_CODE_CHARS)

    # Build prompt
    prompt = _build_prompt(language, code_snippet)

    # Format as messages
    messages = [{"role": "user", "content": prompt}]

    # Apply chat template WITHOUT tokenizing (returns string)
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,  # Returns string, not tensors
        add_generation_prompt=True
    )

    # Tokenize with explicit truncation (CRITICAL for memory)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,  # Truncate if too long
        max_length=MAX_SEQ_LENGTH  # Hard limit
    )

    # Move to GPU
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,  # Only need 1 word
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the generated tokens
    input_length = inputs['input_ids'].shape[1]
    generated_tokens = outputs[0][input_length:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    response_text = generated_text.strip().lower()

    # Parse response
    if "machine" in response_text:
        return "machine"
    elif "human" in response_text:
        return "human"
    else:
        first_word = response_text.split()[0] if response_text.split() else ""
        if first_word in ["human", "machine"]:
            return first_word
        print(f"Warning: Unexpected output: '{response_text[:100]}'")
        return "human"

In [None]:
torch.cuda.empty_cache()
import gc
gc.collect()

predictions = []
true_labels = []
raw_outputs = []

# Now run on full test set (no verbose to avoid clutter)
for idx in tqdm(range(len(test_ds)), desc="Predicting"):
    sample = test_ds[idx]
    code = sample.get("code", "")
    label = sample.get("label", 0)
    lang = sample.get("language", "Unknown")

    # Get prediction
    pred_text = predict_single(code, lang)
    raw_outputs.append(pred_text)

    # Convert to binary label
    pred_label = 1 if pred_text == "machine" else 0

    predictions.append(pred_label)
    true_labels.append(int(label))

    # Periodic memory cleanup
    if idx % 50 == 0 and idx > 0:
        torch.cuda.empty_cache()
        gc.collect()

# Validation
unexpected = sum(x not in ("human", "machine") for x in raw_outputs)
print(f"\nNon-binary outputs: {unexpected} (should be 0)")

if unexpected > 0:
    print("\nUnexpected outputs:")
    for i, out in enumerate(raw_outputs):
        if out not in ("human", "machine"):
            print(f"  Index {i}: '{out}'")

In [None]:
# Show some examples
print("\nFirst 10 predictions:")
for i in range(min(10, len(raw_outputs))):
    correct = "‚úì" if predictions[i] == true_labels[i] else "‚úó"
    print(f"  {correct} {i}: Predicted={raw_outputs[i]}, True={'machine' if true_labels[i] == 1 else 'human'}")

# Calculate accuracy
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(true_labels, predictions)
print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["human", "machine"]))

In [None]:
import pandas as pd

df = pd.DataFrame({
    'ID': range(len(predictions)),
    'label': predictions
})

df.to_csv('test_predictions.csv', index=False)
print(f"CSV file created with {len(df)} rows")

In [None]:
from google.colab import files

files.download('test_predictions.csv')