<a href="https://colab.research.google.com/github/Reene444/Dora_galora_ats-funetuned/blob/main/Dora_galora_ats_funetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Notebook-level overrides for the training script in the next cell. They must
# be in the environment before that cell runs, because its module constants and
# TrainingConfig defaults read them via os.getenv when they are defined.
os.environ.update(
    {
        "TRAIN_STEPS": "120",
        "MAX_EPOCHS": "1",
        "WARMUP_STEPS": "8",
        "BATCH_SIZE": "1",
        "GRAD_ACCUM_STEPS": "2",
        "DORA_RANK": "4",
        "DORA_ALPHA": "16",
        "DORA_DROPOUT": "0.05",
        "GALORE_ENABLE": "0",
        "SYNTHETIC_SAMPLES_PER_GROUP": "10",
        "STAR_SAMPLES": "20",
        "MAX_INPUT_CHARS": "6000",
        "USE_8BIT": "1",
    }
)

In [None]:
#!/usr/bin/env python3
"""
Fine-tune TinyLlama with DoRA + GaLore for resume-based job applications.
"""

import json
import math
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence

# --- Dependency bootstrap ----------------------------------------------------


def _ensure_package(import_name: str, pip_name: Optional[str] = None) -> None:
    try:
        __import__(import_name)
    except ImportError:
        target = pip_name or import_name
        print(f"Installing dependency: {target}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", target])


# Core third-party requirements, keyed by import name with the pip requirement
# string as the value. Each one is imported (and installed on failure) in order.
_REQUIRED_PACKAGES = {
    "torch": "torch",
    "transformers": "transformers>=4.40.0",
    "peft": "peft>=0.10.0",
    "accelerate": "accelerate>=0.30.0",
    "PyPDF2": "pypdf2",
}
for pkg in _REQUIRED_PACKAGES.items():
    _ensure_package(*pkg)

# Import torch to check CUDA availability
import torch

# Try to install/upgrade bitsandbytes for quantization (CUDA only, optional).
# bitsandbytes is an optional accelerator, so every failure path below ends in
# a warning with ``bitsandbytes_available = False`` instead of aborting the
# script (a failed pip upgrade or module reload used to propagate uncaught).
bitsandbytes_available = False
if torch.cuda.is_available():
    _bnb_pip_cmd = [sys.executable, "-m", "pip", "install", "-U", "bitsandbytes"]
    try:
        import bitsandbytes
    except ImportError:
        # Not installed at all: attempt a fresh install.
        try:
            print("Installing bitsandbytes for quantization support...")
            subprocess.check_call(_bnb_pip_cmd)
            import bitsandbytes
            bitsandbytes_available = True
            print("bitsandbytes installed successfully")
        except Exception as e:
            print(f"WARNING: Could not install/upgrade bitsandbytes: {e}")
            print("   Quantization will be disabled. Model will use more memory.")
            bitsandbytes_available = False
    else:
        # Installed: ask transformers whether it considers this build usable.
        try:
            from transformers.utils import is_bitsandbytes_available
            if not is_bitsandbytes_available():
                raise ImportError("bitsandbytes version too old")
            bitsandbytes_available = True
            print("bitsandbytes is available")
        except (ImportError, AttributeError):
            # Helper missing or version rejected: try upgrading in place.
            try:
                print("Upgrading bitsandbytes to latest version...")
                subprocess.check_call(_bnb_pip_cmd)
                import importlib
                if 'bitsandbytes' in sys.modules:
                    importlib.reload(bitsandbytes)
                bitsandbytes_available = True
                print("bitsandbytes upgraded successfully")
            except Exception as e:
                print(f"WARNING: Could not install/upgrade bitsandbytes: {e}")
                print("   Quantization will be disabled. Model will use more memory.")
                bitsandbytes_available = False
else:
    print("INFO: CUDA not available, skipping bitsandbytes installation (CPU/MPS mode)")

def _load_galore_adamw():
    """Return the official ``GaLoreAdamW`` class, or ``None`` if unobtainable.

    Tries a plain import first; on ImportError, pip-installs GaLore from its
    GitHub repository and imports again. Any failure in that second attempt is
    reported as a warning and turned into ``None``.
    """
    try:
        from galore_torch import GaLoreAdamW as optimizer_cls  # type: ignore
        return optimizer_cls
    except ImportError:
        pass

    try:
        print("Installing GaLore from GitHub...")
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "git+https://github.com/jiaweizzhao/GaLore.git"]
        )
        from galore_torch import GaLoreAdamW as optimizer_cls  # type: ignore
        return optimizer_cls
    except Exception as galore_err:
        print(f"WARNING: GaLore official package unavailable: {galore_err}")
        return None


# Official GaLore optimizer when available; None means callers fall back to
# the custom gradient projection.
GaLoreAdamW = _load_galore_adamw()

from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
)
from peft import PeftModel, get_peft_model

# Resolve a `DoRAConfig` class. Three import locations are tried in turn
# (presumably to cover different PEFT releases — confirm against the installed
# version); each success prints which location was used. If none of them
# provides the name, a local subclass of `LoraConfig` is defined instead.
# The `None` placeholder below is overwritten by whichever branch succeeds.
DoRAConfig = None
try:
    from peft import DoRAConfig
    print("Using native DoRAConfig from PEFT")
except ImportError:
    try:
        from peft.tuners.dora import DoRAConfig
        print("Using DoRAConfig from peft.tuners.dora")
    except ImportError:
        try:
            from peft.tuners.dora.config import DoRAConfig
            print("Using DoRAConfig from peft.tuners.dora.config")
        except ImportError:
            # Last resort: define DoRAConfig locally on top of LoraConfig.
            # These imports are local to this fallback path; `dataclass` is
            # re-imported here together with `field`.
            from peft import LoraConfig
            from dataclasses import dataclass, field

            @dataclass
            class DoRAConfig(LoraConfig):
                """LoRA configuration whose ``use_dora`` flag defaults to True.

                Background: DoRA (Weight-Decomposed Low-Rank Adaptation)
                decomposes weights W into magnitude m and direction V:
                W = m * V / ||V||_c

                Where:
                - m: learnable magnitude vector
                - V: direction matrix (adapted via LoRA)
                - ||V||_c: column-wise L2 norm

                This class itself only declares the ``use_dora`` field; it
                contains no decomposition logic.
                NOTE(review): whether the flag has any effect depends on the
                installed PEFT honouring ``use_dora`` — confirm.
                """
                use_dora: bool = field(default=True, metadata={"help": "Enable DoRA weight decomposition"})

            print("Created true DoRAConfig implementation (weight decomposition into magnitude + direction)")

# ---------------- Heavy synthetic data controls ----------------
def _int_from_env(name: str, fallback: int) -> int:
    """Read environment variable ``name`` as an int, using ``fallback`` if unset."""
    return int(os.environ.get(name, fallback))


def _flag_from_env(name: str) -> bool:
    """Return True unless ``name`` is set to "0", "false" or "False" (default on)."""
    return os.environ.get(name, "1") not in {"0", "false", "False"}


# Number of synthetic variants generated per ATS/type group.
SYNTHETIC_SAMPLES_PER_GROUP = _int_from_env("SYNTHETIC_SAMPLES_PER_GROUP", 120)
# Maximum number of resume characters embedded in a sample.
MAX_INPUT_CHARS = _int_from_env("MAX_INPUT_CHARS", 100000)
# Maximum number of resume lines used for per-line samples.
EXHAUSTIVE_MAX_LINES = _int_from_env("EXHAUSTIVE_MAX_LINES", 10000)
# Number of STAR / free-form samples.
STAR_SAMPLES = _int_from_env("STAR_SAMPLES", 200)
# Coverage switches for the word / sentence / paragraph sample generators.
ENABLE_WORD_LEVEL_COVERAGE = _flag_from_env("WORD_LEVEL_COVERAGE")
ENABLE_SENTENCE_LEVEL_COVERAGE = _flag_from_env("SENTENCE_LEVEL_COVERAGE")
ENABLE_PARAGRAPH_LEVEL_COVERAGE = _flag_from_env("PARAGRAPH_LEVEL_COVERAGE")
# --------------------------------------------------------------

def _get_working_directory() -> Path:
    """Determine the working directory.

    If running as a script, use the script's directory.
    If in Colab or interactive mode, use the current working directory.
    """
    # Try __file__ first (works when running as a script)
    # In Colab notebooks, __file__ doesn't exist, so this will raise NameError
    try:
        file_path = __file__
        return Path(file_path).parent.absolute()
    except NameError:
        # In Colab or interactive mode, use current directory
        cwd = Path(os.getcwd())

        # Colab usually uses /content as the working directory
        if str(cwd).startswith('/content'):
            print(f"Detected Google Colab environment, using working directory: {cwd}")
            return cwd

        # Otherwise use the current directory
        print(f"Using current working directory: {cwd}")
        return cwd


@dataclass
class TrainingConfig:
    """Hyper-parameters and paths for a fine-tuning run.

    Most defaults are read from environment variables. The ``os.getenv`` calls
    sit in the class body, so they are evaluated once, when this class is
    defined; changing the environment afterwards does not alter the defaults.
    """
    model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    # Use working directory (compatible with local scripts and Google Colab)
    # `_script_dir` has no annotation, so the dataclass machinery does not
    # treat it as a field; it is a plain class attribute used for the two
    # path defaults below.
    _script_dir = _get_working_directory()
    adapter_path: str = str(_script_dir / "fine-tuned-model")
    data_dir: Path = _script_dir / "torch_data"
    # Reduced defaults for Colab memory constraints
    max_seq_length: int = int(os.getenv("MAX_SEQ_LENGTH", 1024))  # Reduced from 2048
    per_device_batch_size: int = int(os.getenv("BATCH_SIZE", 1))
    grad_accum_steps: int = int(os.getenv("GRAD_ACCUM_STEPS", 8))  # Increased to compensate
    learning_rate: float = float(os.getenv("LEARNING_RATE", 5e-6))  # Very conservative default to avoid NaN
    weight_decay: float = 0.01
    warmup_steps: int = int(os.getenv("WARMUP_STEPS", 50))
    target_steps: int = int(os.getenv("TRAIN_STEPS", 100))
    max_epochs: int = int(os.getenv("MAX_EPOCHS", 3))
    # DoRA adapter settings (rank, alpha, dropout).
    dora_rank: int = int(os.getenv("DORA_RANK", 8))
    dora_alpha: int = int(os.getenv("DORA_ALPHA", 64))
    dora_dropout: float = float(os.getenv("DORA_DROPOUT", 0.05))
    # GaLore settings; the consumer is not in this part of the file, so
    # exactly how these are applied is to be confirmed there.
    galore_rank_ratio: float = float(os.getenv("GALORE_RANK_RATIO", 0.08))
    galore_update_interval: int = int(os.getenv("GALORE_UPDATE_INTERVAL", 1))
    # Boolean flags below are True unless the variable is "0", "false" or "False".
    galore_project_on: bool = os.getenv("GALORE_ENABLE", "1") not in {"0", "false", "False"}
    max_grad_norm: float = float(os.getenv("MAX_GRAD_NORM", 0.5))  # Stricter gradient clipping to prevent NaN
    use_8bit: bool = os.getenv("USE_8BIT", "1") not in {"0", "false", "False"}  # 8-bit quantization
    use_4bit: bool = os.getenv("USE_4BIT", "0") not in {"0", "false", "False"}  # 4-bit quantization (more aggressive)


class ConversationDataset(Dataset):
    """Dataset turning chat-style messages into supervised fine-tuning tensors.

    Each sample is a dict with a ``"messages"`` list of ``{"role", "content"}``
    dicts. Every message except the last forms the prompt; the last message is
    the supervised answer. Prompt positions are labelled ``-100`` so that only
    answer tokens carry a training label.

    Truncation policy: when prompt + answer exceed ``max_length``, the answer
    keeps at least half of the window (or its full length when shorter) and
    the prompt is shortened from the left, so the text right before the answer
    survives. Truncating only the end of the concatenation could remove the
    whole answer for long prompts, leaving an item without a single
    supervised token.
    """

    def __init__(self, samples: Sequence[Dict[str, Any]], tokenizer: "AutoTokenizer", max_length: int):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.samples)

    def _render_prompt(self, prompt_messages: Sequence[Dict[str, Any]]) -> str:
        """Turn the prompt messages into the text that precedes the answer."""
        if hasattr(self.tokenizer, "apply_chat_template"):
            return self.tokenizer.apply_chat_template(
                prompt_messages, tokenize=False, add_generation_prompt=True
            )
        # Plain "ROLE: content" transcript for tokenizers without a template.
        prompt_parts = [
            f"{msg.get('role', 'user').upper()}: {msg.get('content', '')}"
            for msg in prompt_messages
        ]
        prompt_parts.append("ASSISTANT:")
        return "\n".join(prompt_parts) + " "

    def _fit_to_window(self, prompt_ids: list, answer_ids: list) -> tuple:
        """Trim both token lists so their combined length is <= max_length."""
        limit = self.max_length
        if len(prompt_ids) + len(answer_ids) <= limit:
            return prompt_ids, answer_ids
        # The answer gets whatever the prompt leaves free, but never less than
        # half the window (and at least one token) when it needs that much.
        answer_budget = min(len(answer_ids), max(limit - len(prompt_ids), limit // 2, 1))
        answer_budget = min(answer_budget, max(limit, 0))
        prompt_budget = max(limit - answer_budget, 0)
        # Explicit start index: a ``[-0:]`` slice would keep the whole list.
        kept_prompt = prompt_ids[len(prompt_ids) - prompt_budget:]
        return kept_prompt, answer_ids[:answer_budget]

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as lists.

        Raises:
            ValueError: If the sample has fewer than two messages.
        """
        sample = self.samples[idx]
        messages = sample.get("messages", [])
        if len(messages) < 2:
            raise ValueError("Sample must contain at least a user and assistant message.")

        # All but final assistant response considered prompt; final message treated as answer.
        prompt_messages = messages[:-1]
        answer_message = messages[-1]
        if answer_message.get("role") != "assistant":
            # enforce assistant role for target
            answer_message = {"role": "assistant", "content": str(answer_message.get("content", ""))}

        eos_token = self.tokenizer.eos_token or self.tokenizer.pad_token or ""
        prompt_text = self._render_prompt(prompt_messages)
        answer_text = answer_message.get("content", "")

        # The prompt is tokenized without truncation because the trimming
        # below keeps its tail rather than its head.
        prompt_ids = list(self.tokenizer(prompt_text, add_special_tokens=False)["input_ids"])
        answer_ids = list(
            self.tokenizer(
                answer_text + eos_token,
                add_special_tokens=False,
                truncation=True,
                max_length=self.max_length,
            )["input_ids"]
        )

        prompt_ids, answer_ids = self._fit_to_window(prompt_ids, answer_ids)
        input_ids = prompt_ids + answer_ids

        # Mask prompt tokens (only supervise assistant response)
        labels = [-100] * len(prompt_ids) + answer_ids

        return {
            "input_ids": input_ids,
            "attention_mask": [1] * len(input_ids),
            "labels": labels,
        }


def collate_conversations(batch: Sequence[Dict[str, Any]], pad_token_id: int) -> Dict[str, torch.Tensor]:
    """Right-pad a list of tokenized samples into rectangular long tensors.

    Args:
        batch: Items carrying ``"input_ids"`` and ``"labels"`` sequences.
        pad_token_id: Fill value for padded ``input_ids`` positions.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors of
        shape ``(len(batch), longest input_ids)``. Padding uses
        ``pad_token_id`` for ids, ``0`` for the mask and ``-100`` for labels.
    """
    lengths = [len(example["input_ids"]) for example in batch]
    shape = (len(batch), max(lengths))

    padded_ids = torch.full(shape, pad_token_id, dtype=torch.long)
    mask = torch.zeros(shape, dtype=torch.long)
    targets = torch.full(shape, -100, dtype=torch.long)

    for row, (example, n_tokens) in enumerate(zip(batch, lengths)):
        padded_ids[row, :n_tokens] = torch.tensor(example["input_ids"], dtype=torch.long)
        mask[row, :n_tokens] = 1
        targets[row, :n_tokens] = torch.tensor(example["labels"], dtype=torch.long)

    return {
        "input_ids": padded_ids,
        "attention_mask": mask,
        "labels": targets,
    }


@torch.no_grad()
def project_gradients_low_rank(
    model: nn.Module,
    rank_ratio: float = 0.08,
) -> None:
    """Apply GaLore-style low-rank projection to gradients (fallback when GaLoreAdamW unavailable).

    Every trainable parameter with a dense gradient of two or more dimensions
    has that gradient viewed as a ``(shape[0], -1)`` matrix and replaced, in
    place, by a rank-``r`` approximation, where
    ``r = max(1, int(min(rows, cols) * rank_ratio))``. Gradients are left
    untouched when they are 1-D, sparse, empty, or when ``r`` would not reduce
    the rank.

    Args:
        model: Module whose ``.grad`` tensors are rewritten in place.
        rank_ratio: Fraction of the smaller matrix dimension to keep.
    """
    for param in model.parameters():
        grad = param.grad
        if not param.requires_grad or grad is None:
            continue
        # Skip bias / vector params, sparse gradients and empty tensors
        # (an empty tensor cannot be reshaped to ``(0, -1)``).
        if grad.ndim < 2 or grad.is_sparse or grad.numel() == 0:
            continue

        rows = grad.shape[0]
        grad_matrix = grad.reshape(rows, -1)
        min_dim = min(rows, grad_matrix.shape[1])
        if min_dim < 2:
            continue

        rank = max(1, int(min_dim * rank_ratio))
        if rank >= min_dim:
            continue

        # torch.linalg.svd / qr have no float16 / bfloat16 kernels, so run the
        # factorization in float32; copy_ casts the result back to grad's dtype.
        if grad_matrix.dtype in (torch.float16, torch.bfloat16):
            grad_matrix = grad_matrix.float()

        try:
            u, s, vh = torch.linalg.svd(grad_matrix, full_matrices=False)
            approx = (u[:, :rank] * s[:rank]) @ vh[:rank, :]
        except RuntimeError:
            # SVD failed (e.g. did not converge): fall back to a QR-based
            # rank-limited reconstruction.
            q, r = torch.linalg.qr(grad_matrix)
            approx = q[:, :rank] @ r[:rank, :]

        grad.copy_(approx.reshape_as(grad))

def _extract_fields_from_resume(resume_text):
    """Extract basic fields from resume: email, phone, name, education, experience, skills, etc."""
    import re
    # Find email
    email_match = re.search(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", resume_text)
    email = email_match.group() if email_match else "lxc645@alumni.bham.ac.uk"
    # Find phone number
    phone_match = re.search(r"(\+?[\d\s\-\(\)]{10,})", resume_text)
    phone = phone_match.group().strip() if phone_match else "+447827396618"
    # Name is usually the first non-empty line
    first_line = next((ln.strip() for ln in resume_text.split("\n") if ln.strip()), "Linlin Chen")
    name = first_line
    # Split into first/last name
    parts = name.split()
    first_name = parts[0] if parts else "Linlin"
    last_name = parts[-1] if len(parts) > 1 else "Chen"
    # Try to guess education level (not very accurate, but good enough)
    edu = "Master's degree" if re.search(r"Master|MSc|MS|MA", resume_text, re.I) else ("Bachelor's degree" if re.search(r"Bachelor|BSc|BA", resume_text, re.I) else "")
    # Try to find years of experience
    years = 2
    m_years = re.findall(r"(\d+)\+?\s*(years|yrs)", resume_text, re.I)
    if m_years:
        try:
            years = min(10, max(1, int(m_years[0][0])))
        except:
            pass
    # Look for common skills and languages
    skills = []
    for kw in ["Python", "AI/ML", "Machine Learning", "Deep Learning", "NLP", "LLM", "Full Stack", "Selenium", "OpenAI", "Ollama"]:
        if re.search(rf"\b{re.escape(kw)}\b", resume_text, re.I):
            skills.append(kw)
    if not skills:
        skills = ["Python", "AI/ML", "Full Stack Development"]
    languages = []
    for lang in ["English", "Chinese", "Mandarin", "Cantonese"]:
        if re.search(rf"\b{re.escape(lang)}\b", resume_text, re.I):
            languages.append(lang)
    if not languages:
        languages = ["English", "Chinese"]
    summary = resume_text[:600]
    return {
        "name": name,
        "first_name": first_name,
        "last_name": last_name,
        "email": email,
        "phone": phone,
        "education": edu,
        "experience_years": years,
        "skills": skills,
        "languages": languages,
        "summary": summary,
    }

def _generate_exhaustive_line_samples(resume_text: str) -> list[dict]:
    """Synthesize per-line/word/sentence/paragraph coverage samples for a resume.

    Samples are emitted in this order:
      1. one verbatim-echo sample per non-empty line (at most
         ``EXHAUSTIVE_MAX_LINES`` lines);
      2. a JSON key/value sample for each of those lines shaped like
         ``Key: value``;
      3. word-presence samples (roughly 500 at most), if
         ``ENABLE_WORD_LEVEL_COVERAGE``;
      4. verbatim sentence samples (first 500 sentences longer than 10
         characters), if ``ENABLE_SENTENCE_LEVEL_COVERAGE``;
      5. two samples per paragraph (first 200 paragraphs longer than 20
         characters), if ``ENABLE_PARAGRAPH_LEVEL_COVERAGE``.

    Returns:
        List of ``{"messages": [user, assistant]}`` dicts.
    """
    import re

    def msg(user, assistant):
        return {"messages": [{"role": "user", "content": user}, {"role": "assistant", "content": assistant}]}

    # Non-empty lines, capped at EXHAUSTIVE_MAX_LINES.
    lines = [ln.strip() for ln in resume_text.split('\n') if ln.strip()][:EXHAUSTIVE_MAX_LINES]

    # 1. Verbatim line reproduction
    samples: list[dict] = [
        msg("Return this resume line verbatim (output only the original line):", ln)
        for ln in lines
    ]

    # 2. Key-value parsing for structured lines
    kv_pattern = re.compile(r"^([A-Za-z \-_/]+):\s*(.+)$")
    for ln in lines:
        m = kv_pattern.match(ln)
        if m:
            key = m.group(1).strip()
            val = m.group(2).strip()
            samples.append(msg(
                f"Parse this line into a JSON key-value (output JSON only): {ln}",
                json.dumps({key: val}, ensure_ascii=False)
            ))

    # 3. Word-level coverage (if enabled)
    if ENABLE_WORD_LEVEL_COVERAGE:
        words = re.findall(r'\b\w+\b', resume_text)
        # Take every Nth word so the number of samples stays around 500.
        word_sample_rate = max(1, len(words) // 500)
        # Lower-case the resume once, not once per sampled word.
        resume_lower = resume_text.lower()
        for word in words[::word_sample_rate]:
            if len(word) <= 2:  # Skip very short words
                continue
            # Context is taken around the first case-insensitive occurrence.
            word_pos = resume_lower.find(word.lower())
            if word_pos < 0:
                continue
            context_start = max(0, word_pos - 50)
            context_end = min(len(resume_text), word_pos + len(word) + 50)
            context = resume_text[context_start:context_end]
            samples.append(msg(
                f"Does my resume contain the word '{word}'? If yes, provide context (one sentence).",
                f"Yes, '{word}' appears in my resume. Context: {context}"
            ))

    # 4. Sentence-level coverage (if enabled)
    if ENABLE_SENTENCE_LEVEL_COVERAGE:
        sentences = [s.strip() for s in re.split(r'[.!?]+', resume_text) if len(s.strip()) > 10]
        for sent in sentences[:500]:  # Limit to 500 sentences
            samples.append(msg(
                "Return this sentence from my resume verbatim:",
                sent
            ))

    # 5. Paragraph-level coverage (if enabled)
    if ENABLE_PARAGRAPH_LEVEL_COVERAGE:
        paragraphs = [p.strip() for p in resume_text.split('\n\n') if len(p.strip()) > 20]
        for para in paragraphs[:200]:  # Limit to 200 paragraphs
            samples.append(msg(
                "Return this paragraph from my resume verbatim:",
                para
            ))
            # Also emit a JSON-wrapped copy of the paragraph (first 500 chars)
            samples.append(msg(
                f"Extract all key information from this resume paragraph as JSON: {para[:200]}...",
                json.dumps({"paragraph_content": para[:500]}, ensure_ascii=False)
            ))

    return samples

def generate_training_data(resume_text):
    """Build the complete list of chat-format training samples for one resume.

    Samples are appended in this fixed order:
      1. exhaustive line/word/sentence/paragraph samples;
      2. two form-filling prompts per field alias;
      3. one "extract everything as JSON" sample;
      4. per ATS type: one base page-analysis sample followed by
         ``SYNTHETIC_SAMPLES_PER_GROUP`` variants;
      5. two robustness samples and one full autofill JSON sample;
      6. STAR-style prompts (at most ``STAR_SAMPLES``, at most one per prompt);
      7. ``STAR_SAMPLES`` free-form Q&A prompts.

    For groups 6 and 7 both the prompt suffix and the answer are the first
    ``MAX_INPUT_CHARS`` characters of the resume.

    Returns:
        List of ``{"messages": [user, assistant]}`` dicts.
    """
    print("Generating training data from resume (enhanced/heavy-weight)...")
    profile = _extract_fields_from_resume(resume_text)

    def msg(user, assistant):
        return {"messages": [{"role": "user", "content": user}, {"role": "assistant", "content": assistant}]}

    def to_json(payload):
        return json.dumps(payload, ensure_ascii=False)

    # 1) Start from the samples that cover the raw resume text.
    samples: list[dict] = list(_generate_exhaustive_line_samples(resume_text))

    # Canonical field values reused by every group below.
    base_kv = {
        "firstName": profile["first_name"],
        "lastName": profile["last_name"],
        "email": profile["email"],
        "phone": profile["phone"],
        "education": profile["education"] or "Master's degree",
        "experience_years": profile["experience_years"],
        "skills": profile["skills"],
        "languages": profile["languages"],
        "summary": profile["summary"][:MAX_INPUT_CHARS],
        "linkedin": "linkedin.com/in/lchen198",
        "github": "github.com/Reene444",
        "website": "https://reene4444.com/",
    }

    # 2) Form-field aliases, each mapped to the base_kv key that answers it.
    alias_to_key = (
        ("first name", "firstName"),
        ("given_name", "firstName"),
        ("family name", "lastName"),
        ("last name", "lastName"),
        ("email", "email"),
        ("e_mail", "email"),
        ("phone", "phone"),
        ("mobile", "phone"),
        ("linkedin", "linkedin"),
        ("github", "github"),
        ("website", "website"),
    )
    for alias, key in alias_to_key:
        answer = str(base_kv[key])
        samples += [
            msg(f"Form field to fill: {alias}?\nOutput answer only.", answer),
            msg(f"Analyze HTML input: <input name='{alias}'>. What value should be filled?\nOutput answer only.", answer),
        ]

    # 3) Whole-profile extraction as one JSON object.
    extraction_keys = ["firstName", "lastName", "email", "phone", "education", "experience_years", "skills", "languages", "summary"]
    samples.append(msg(
        "Extract from resume and return JSON (output JSON only): {firstName,lastName,email,phone,education,experience_years,skills,languages,summary}",
        to_json({key: base_kv[key] for key in extraction_keys})
    ))

    # 4) Page-analysis samples for each ATS platform.
    ats_types = ["greenhouse", "workday", "lever", "generic_form"]

    action_plan_template = {
        "greenhouse": [
            "Detect site type = greenhouse",
            "Locate external apply link (avoid 'Easy Apply')",
            "Click external apply button (opens new tab)",
            "Switch to new tab",
            "Wait for form to load",
            "Fill fields using resume mapping",
            "Upload resume if required",
            "Submit or continue to next step",
        ],
        "workday": [
            "Detect site type = workday",
            "Click 'Apply' or 'Apply Now'",
            "Handle authentication modal if present (skip if not)",
            "Navigate to personal info section",
            "Fill fields using resume mapping",
            "Upload resume",
            "Review and submit",
        ],
        "lever": [
            "Detect site type = lever",
            "Click 'Apply for this job'",
            "Wait for form iframe or modal",
            "Fill fields using resume mapping",
            "Upload resume",
            "Submit",
        ],
        "generic_form": [
            "Detect site type = generic_form",
            "Find any button containing 'Apply' (exclude 'Easy Apply')",
            "Click and wait for form",
            "Fill fields using resume mapping",
            "Submit",
        ],
    }

    selector_map = {
        "first_name": ["input[name='first_name']", "input#first_name", "input[placeholder*='first']"],
        "last_name": ["input[name='last_name']", "input#last_name", "input[placeholder*='last']"],
        "email": ["input[type='email']", "input[name='email']", "input[placeholder*='email']"],
        "phone": ["input[type='tel']", "input[name='phone']", "input[placeholder*='phone']"],
        "resume": ["input[type='file'][name*='resume']", "input[name='file']"],
    }

    # (field_name, field_type, required, suggested_value) for the base form.
    field_specs = (
        ("first_name", "text", True, base_kv["firstName"]),
        ("last_name", "text", True, base_kv["lastName"]),
        ("email", "email", True, base_kv["email"]),
        ("phone", "phone", True, base_kv["phone"]),
        ("resume", "file", False, "<path-to-resume>"),
    )

    def build_action_json(ats):
        # A fresh dict and field list on every call, so callers may mutate them.
        return {
            "page_type": ats,
            "form_fields": [
                {
                    "field_name": field_name,
                    "field_type": field_type,
                    "selector": selector_map[field_name][0],
                    "required": required,
                    "suggested_value": suggested,
                }
                for field_name, field_type, required, suggested in field_specs
            ],
            "submit_button": {"selector": "button[type='submit']", "text": "Submit"},
            "action_plan": action_plan_template[ats],
            "confidence": 0.9
        }

    page_stub = (
        "Analyze the job application page (HTML omitted). Using my resume context, output JSON with:\n"
        "- page_type (greenhouse|workday|lever|generic_form)\n"
        "- form_fields: [{field_name, field_type, selector, required, suggested_value}]\n"
        "- submit_button: {selector, text}\n"
        "- action_plan: [step1, step2, ...]\n"
        "Output JSON only, no extra text."
    )

    button_labels = ("Apply", "Submit", "Send", "Continue")
    for ats in ats_types:
        samples.append(msg(page_stub, to_json(build_action_json(ats))))
        for variant_idx in range(SYNTHETIC_SAMPLES_PER_GROUP):
            variant = build_action_json(ats)
            variant["submit_button"]["text"] = button_labels[variant_idx % 4]
            form_fields = variant["form_fields"]
            if variant_idx % 3 == 0:
                # Move the last field to the front.
                form_fields.insert(0, form_fields.pop())
            if variant_idx % 5 == 0:
                form_fields.append({
                    "field_name": "linkedin", "field_type": "text", "selector": "input[name='linkedin']",
                    "required": False, "suggested_value": base_kv["linkedin"]
                })
            samples.append(msg(page_stub, to_json(variant)))

    # 5) Robustness prompts and the flat autofill mapping.
    samples.append(msg(
        "If the page has both 'Easy Apply' and an external 'Apply' button, which should be used? Output 'external' or 'easy' only.",
        "external"
    ))
    samples.append(msg(
        "Form is not visible initially. Which element should be clicked to reveal it? Output one CSS selector only.",
        "button:contains('Apply')"
    ))
    samples.append(msg(
        "Output JSON values for autofill (output JSON only): {first_name,last_name,email,phone,linkedin,github,website,education,experience_years,skills}",
        to_json({
            "first_name": base_kv["firstName"],
            "last_name": base_kv["lastName"],
            "email": base_kv["email"],
            "phone": base_kv["phone"],
            "linkedin": base_kv["linkedin"],
            "github": base_kv["github"],
            "website": base_kv["website"],
            "education": base_kv["education"],
            "experience_years": base_kv["experience_years"],
            "skills": base_kv["skills"],
        })
    ))

    print(f"Generated samples (including augmentation): {len(samples)}")

    # 6) STAR narratives / project details / free-form coverage.
    star_prompts = [
        "Summarize a representative project using the STAR method (Situation, Task, Action, Result). Output a single paragraph.",
        "List 3-5 bullet points for my core experience. Start each with a verb and quantify results where possible.",
        "Summarize my tech stack and proficiency (concise bullets).",
        "Extract transferable skills from my experience (e.g., Communication, Leadership, Ownership, Problem Solving).",
        "Output my project list as JSON array: name, objective, my role, key tech, outcome.",
        "Provide a 30-second self-introduction focusing on AI/Agents/form automation.",
        "In first person: What was your most challenging project? What did you do and what was the result?",
        "Summarize a 3-year timeline (year-company/project-responsibilities-outcomes) as a JSON array.",
        "Generate standard ATS answers as JSON (visa, availability, remote, salary expectations).",
        "List 3 role-aligned achievements with measurable metrics (KPI/speed/accuracy).",
    ]

    # The resume, cut to MAX_INPUT_CHARS, serves as prompt context and answer.
    knowledge_blob = resume_text[:MAX_INPUT_CHARS]
    resume_suffix = "\n\nFull resume content:\n" + knowledge_blob

    # max(..., 0) keeps a negative STAR_SAMPLES from selecting prompts.
    for star_prompt in star_prompts[:max(STAR_SAMPLES, 0)]:
        samples.append(msg(star_prompt + resume_suffix, knowledge_blob))

    # 7) Free-form Q&A: questions and templates are cycled independently.
    free_qa_templates = [
        "Answer based on my resume only: {q} (no external knowledge).",
        "Using my experience only, answer concisely: {q} (say 'unknown' if not present).",
        "Answer using resume highlights only: {q} (do not fabricate).",
    ]
    generic_questions = [
        "You recently did an automation/Agent-related work?",
        "What are your top three skills?",
        "Describe a case where you improved efficiency (include metrics).",
        "Which languages and frameworks are you familiar with?",
        "Your education background and graduation time?",
        "If you need to fill first_name/last_name/email/phone, what are these values?",
        "What are your common responsibilities?",
        "List your most proud achievements and their impact.",
        "Do you have any open-source/portfolio links?",
        "Your visa and available time for employment?",
    ]
    for qa_idx in range(STAR_SAMPLES):
        question = generic_questions[qa_idx % len(generic_questions)]
        template = free_qa_templates[qa_idx % len(free_qa_templates)]
        samples.append(msg(template.format(q=question) + resume_suffix, knowledge_blob))

    print(f"Generated samples (incl. augmentation/STAR/free QA): {len(samples)}")
    return samples

def create_training_jsonl(data, output_file="training_data.jsonl"):
    """Write ``data`` to ``output_file`` as JSON Lines and return the path.

    Args:
        data: Sized collection of JSON-serializable items; one line per item.
        output_file: Destination path; an existing file is overwritten.

    Returns:
        ``output_file``, unchanged.
    """
    print(f"Creating training file: {output_file}")

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 rather than left to the platform's locale default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in data)

    print(f"Created {len(data)} training samples")
    return output_file

def _resolve_device() -> tuple[torch.device, str]:
    if torch.backends.mps.is_available():
        return torch.device("mps"), "mps"
    if torch.cuda.is_available():
        return torch.device("cuda"), "cuda"
    return torch.device("cpu"), "cpu"


def _apply_dora_decomposition(model: PeftModel, device: torch.device) -> None:
    """Attach a trainable per-output-feature magnitude to every LoRA layer.

    For each LoRA layer found in ``model`` this function:

    1. registers a parameter named ``dora_magnitude`` with one entry per
       output feature, initialised to 0.01 and created on ``device``;
    2. replaces the layer's ``forward`` with a wrapper that multiplies the
       original output by that magnitude (clamped to ``[1e-6, 10.0]``) along
       the last dimension.

    NOTE: this is an output-scaling approximation. The wrapper does not
    normalise the weight direction, i.e. it does not compute
    ``W = m * V / ||V||_c`` as the DoRA formulation does.

    The wrapper leaves the output untouched when it is not a tensor with at
    least two dimensions, when its last dimension does not match the
    magnitude length, or when the output (before or after scaling) contains
    NaN/Inf values.

    Args:
        model: PEFT-wrapped model whose LoRA layers should be patched in place.
        device: Device on which the magnitude parameters are allocated.

    Returns:
        None. Layers that fail to patch are reported and skipped.
    """
    try:
        from peft.tuners.lora import Linear, LoraLayer
    except ImportError:
        try:
            from peft.tuners.lora.layer import Linear, LoraLayer
        except ImportError:
            print("WARNING: Cannot import LoRA layers, skipping DoRA decomposition")
            return

    dora_layers = []

    def _find_lora_layers(module, prefix=""):
        """Recursively collect ``(qualified_name, layer)`` for all LoRA layers."""
        for name, child in module.named_children():
            full_name = f"{prefix}.{name}" if prefix else name
            if isinstance(child, (Linear, LoraLayer)):
                # Do not descend further: the LoRA wrapper is the patch target.
                dora_layers.append((full_name, child))
            else:
                _find_lora_layers(child, full_name)

    def _make_dora_forward(orig_fwd, mag_param):
        """Build a forward wrapper that scales ``orig_fwd``'s output by ``mag_param``."""
        def dora_forward(*args, **kwargs):
            output = orig_fwd(*args, **kwargs)

            if not (isinstance(output, torch.Tensor) and output.dim() >= 2):
                return output

            # Silently skip when the feature dimension does not match
            # (may happen with reshaped outputs).
            mag_size = mag_param.shape[0]
            if output.shape[-1] != mag_size:
                return output

            # An output that is already NaN/Inf is passed through unscaled.
            if not torch.isfinite(output).all():
                return output

            # Clamp to keep the scale in a numerically safe range, then
            # reshape to (1, ..., 1, features) so it broadcasts over the
            # leading dimensions. The cast keeps the layer's output dtype
            # stable when the magnitude is stored in a different float type.
            mag_clamped = torch.clamp(mag_param, min=1e-6, max=10.0)
            view_shape = [1] * (output.dim() - 1) + [mag_size]
            mag_view = mag_clamped.view(*view_shape).to(output.dtype)

            output_scaled = torch.mul(output, mag_view)

            # If scaling itself produced NaN/Inf, fall back to the raw output.
            if not torch.isfinite(output_scaled).all():
                return output

            return output_scaled
        return dora_forward

    _find_lora_layers(model)

    if not dora_layers:
        print("WARNING: No LoRA layers found, skipping DoRA decomposition")
        return

    print(f"Applying DoRA to {len(dora_layers)} LoRA layers...")

    applied = 0
    for layer_name, lora_layer in dora_layers:
        try:
            if hasattr(lora_layer, 'base_layer'):
                base_layer = lora_layer.base_layer
            elif hasattr(lora_layer, 'weight'):
                base_layer = lora_layer
            else:
                continue
            base_weight = base_layer.weight

            # Prefer the layer's declared out_features: the storage shape of a
            # quantized weight does not have to be (out_features, in_features).
            out_features = getattr(base_layer, 'out_features', None)
            if not isinstance(out_features, int):
                out_features = base_weight.shape[0]

            # Quantized base weights are stored in an integer dtype, which
            # cannot hold 0.01 and cannot require gradients; use float32 for
            # the magnitude in that case.
            if base_weight.dtype.is_floating_point:
                mag_dtype = base_weight.dtype
            else:
                mag_dtype = torch.float32

            # Initialize magnitude to a very small value (0.01); the original
            # author reports that 1.0 or 0.1 led to immediate NaN losses.
            magnitude_init_value = 0.01
            magnitude_init = torch.full(
                (out_features,),
                magnitude_init_value,
                device=device,
                dtype=mag_dtype,
            )
            magnitude = nn.Parameter(magnitude_init, requires_grad=True)

            # Registering on the layer makes the magnitude show up in
            # model.named_parameters() under the name 'dora_magnitude'.
            lora_layer.register_parameter('dora_magnitude', magnitude)
            lora_layer.forward = _make_dora_forward(lora_layer.forward, magnitude)
            applied += 1
        except Exception as e:
            print(f"WARNING: Skipping DoRA for layer {layer_name}: {e}")
            continue

    # Report the layers actually patched, not merely the ones discovered.
    print(f"DoRA decomposition applied to {applied} layers")


def evaluate_model(
    model: PeftModel,
    dataloader: Optional[DataLoader],
    device: torch.device,
    device_type: str,
    autocast_dtype: torch.dtype,
) -> Optional[float]:
    """Compute the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. Gradients are disabled, and autocast is used on every device
    type except ``"cpu"``.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.loss``.
        dataloader: Batches as dicts of tensors, or ``None``.
        device: Device every batch tensor is moved to.
        device_type: Device type string passed to ``torch.autocast``.
        autocast_dtype: Dtype passed to ``torch.autocast``.

    Returns:
        The average batch loss, or ``None`` when there is no dataloader or it
        yields no batches.
    """
    if dataloader is None:
        return None
    if len(dataloader) == 0:
        return None

    use_autocast = device_type != "cpu"
    batch_losses: List[float] = []

    model.eval()
    with torch.no_grad():
        for raw_batch in dataloader:
            on_device = {key: tensor.to(device) for key, tensor in raw_batch.items()}
            with torch.autocast(device_type=device_type, dtype=autocast_dtype, enabled=use_autocast):
                batch_loss = model(**on_device).loss
            batch_losses.append(batch_loss.item())
    model.train()

    mean_loss = sum(batch_losses) / max(len(batch_losses), 1)
    return float(mean_loss)


def train_with_dora_galore(
    train_samples: Sequence[Dict[str, Any]],
    valid_samples: Sequence[Dict[str, Any]],
    cfg: TrainingConfig,
) -> Dict[str, Any]:
    """Fine-tune ``cfg.model_name`` with a DoRA adapter and GaLore-style optimisation.

    Pipeline:

    1. Resolve the device and load the tokenizer.
    2. On CUDA, optionally build a bitsandbytes 4-bit/8-bit quantization
       config (attempting a ``pip install -U bitsandbytes`` if it is missing).
    3. Load the base model, wrap it with the PEFT adapter built from
       ``DoRAConfig`` and attach the magnitude parameters.
    4. Train with gradient accumulation, gradient clipping and a cosine
       schedule with warmup, using ``GaLoreAdamW`` when it is importable and
       constructible, otherwise ``torch.optim.AdamW`` plus the
       ``project_gradients_low_rank`` fallback.
    5. Save the adapter, the tokenizer and ``training_summary.json`` under
       ``cfg.adapter_path``.

    Args:
        train_samples: Conversation samples used for training.
        valid_samples: Conversation samples used for validation; may be empty.
        cfg: Training configuration. ``cfg.use_4bit`` / ``cfg.use_8bit`` are
            mutated when quantization has to be disabled or downgraded.

    Returns:
        A metrics dict with keys ``train_steps``, ``validation_loss``,
        ``device``, ``dtype`` and ``galore_backend``. If the loss is NaN/Inf
        for 5 consecutive micro-batches, training is aborted and
        ``{"error": "NaN loss", "steps": <optimizer steps done>}`` is returned
        instead; nothing is saved in that case.
    """
    import gc

    def _bnb_ready() -> bool:
        """Report whether bitsandbytes is usable according to transformers."""
        try:
            from transformers.utils import is_bitsandbytes_available
            return bool(is_bitsandbytes_available())
        except (ImportError, AttributeError):
            # Older transformers: fall back to a plain import probe.
            try:
                import bitsandbytes  # noqa: F401
                return True
            except ImportError:
                return False

    device, device_type = _resolve_device()
    autocast_dtype = torch.float16 if device_type in {"cuda", "mps"} else torch.float32

    print(f"Using device: {device} (dtype={autocast_dtype})")

    # Clear CUDA cache before loading model
    if device_type == "cuda":
        torch.cuda.empty_cache()
        gc.collect()

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
    tokenizer.padding_side = "right"

    # ------------------------------------------------------------------
    # Quantization (CUDA only)
    # ------------------------------------------------------------------
    quantization_config = None
    load_kwargs: Dict[str, Any] = {}

    if device_type == "cuda":
        bnb_available = _bnb_ready()

        if not bnb_available and (cfg.use_4bit or cfg.use_8bit):
            print("WARNING: bitsandbytes not available or version too old")
            print("Attempting to upgrade bitsandbytes...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "bitsandbytes"])
                # Probe again now that the package has been (re)installed.
                bnb_available = _bnb_ready()
                print("bitsandbytes upgraded successfully")
            except Exception as upgrade_err:
                print(f"ERROR: Failed to upgrade bitsandbytes: {upgrade_err}")
                print("   Falling back to float16 (no quantization)")
                cfg.use_4bit = False
                cfg.use_8bit = False
                bnb_available = False

        if bnb_available:
            if cfg.use_4bit:
                try:
                    from transformers import BitsAndBytesConfig
                    quantization_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4"
                    )
                    load_kwargs["quantization_config"] = quantization_config
                    load_kwargs["device_map"] = "auto"
                    print("Using 4-bit quantization (most memory efficient)")
                except Exception as e:
                    print(f"WARNING: 4-bit quantization failed: {e}, falling back to 8-bit")
                    cfg.use_4bit = False
                    cfg.use_8bit = True
            # Deliberately a separate `if`, not `elif`: when the 4-bit setup
            # above fails it switches cfg.use_8bit on, and the 8-bit config
            # must then actually be built for the announced fallback to happen.
            if cfg.use_8bit and quantization_config is None:
                try:
                    from transformers import BitsAndBytesConfig
                    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
                    load_kwargs["quantization_config"] = quantization_config
                    load_kwargs["device_map"] = "auto"
                    print("Using 8-bit quantization (memory efficient)")
                except Exception as e:
                    print(f"WARNING: 8-bit quantization failed: {e}, using float16")
                    cfg.use_8bit = False
        else:
            if cfg.use_4bit or cfg.use_8bit:
                print("WARNING: Quantization requested but bitsandbytes unavailable, using float16")
                cfg.use_4bit = False
                cfg.use_8bit = False

    model_dtype = torch.float16 if device_type in {"cuda", "mps"} and not cfg.use_8bit and not cfg.use_4bit else torch.float32
    if quantization_config is None:
        load_kwargs["torch_dtype"] = model_dtype

    # ------------------------------------------------------------------
    # Model + adapter
    # ------------------------------------------------------------------
    print(f"Loading model with config: {load_kwargs}")
    base_model = AutoModelForCausalLM.from_pretrained(
        cfg.model_name,
        **load_kwargs
    )

    if quantization_config is None:
        base_model.resize_token_embeddings(len(tokenizer))
    else:
        # For quantized models a failed resize is tolerated and only reported.
        try:
            base_model.resize_token_embeddings(len(tokenizer))
        except Exception as e:
            print(f"WARNING: Could not resize token embeddings (may be handled automatically): {e}")

    # Always use DoRA configuration
    adapter_config = DoRAConfig(
        r=cfg.dora_rank,
        lora_alpha=cfg.dora_alpha,
        lora_dropout=cfg.dora_dropout,
        task_type="CAUSAL_LM",
    )
    print("Using DoRA configuration")

    model: PeftModel = get_peft_model(base_model, adapter_config)

    # Ensure inputs require grad when using gradient checkpointing (fixes no-grad backward)
    if hasattr(model, "enable_input_require_grads"):
        try:
            model.enable_input_require_grads()
            print("Enabled input requires_grad for checkpointing")
        except Exception as e:
            print(f"WARNING: Could not enable input requires_grad: {e}")

    # Add magnitude parameters and wrap the LoRA forward passes.
    if not hasattr(adapter_config, 'use_dora') or adapter_config.use_dora:
        _apply_dora_decomposition(model, device)
        print("Applied DoRA weight decomposition (magnitude + direction)")

        # Verify DoRA magnitude parameters are trainable
        dora_params = [p for name, p in model.named_parameters() if 'dora_magnitude' in name and p.requires_grad]
        if dora_params:
            print(f"Found {len(dora_params)} DoRA magnitude parameters (total: {sum(p.numel() for p in dora_params)} params)")

    model.print_trainable_parameters()

    # Gradient checkpointing trades compute for memory. It can be switched
    # off with DISABLE_GRAD_CHECKPOINT=1 if NaN losses persist.
    disable_checkpoint = os.getenv("DISABLE_GRAD_CHECKPOINT", "0") not in {"0", "false", "False"}
    if not disable_checkpoint and hasattr(model, "gradient_checkpointing_enable"):
        try:
            model.gradient_checkpointing_enable()
            print("Enabled gradient checkpointing (memory optimization)")
        except Exception as e:
            print(f"WARNING: Could not enable gradient checkpointing: {e}")
    else:
        if hasattr(model, "gradient_checkpointing_disable"):
            model.gradient_checkpointing_disable()
            if disable_checkpoint:
                print("INFO: Gradient checkpointing disabled (may help with NaN stability)")

    # Only move to device if not using device_map="auto" (quantization handles this)
    if load_kwargs.get("device_map") != "auto":
        model.to(device)
    else:
        print("Model loaded with device_map='auto' (quantization)")

    # Clear cache before training
    if device_type == "cuda":
        torch.cuda.empty_cache()
        gc.collect()
        if torch.cuda.is_available():
            print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB total")
            print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
            print(f"GPU memory reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    train_dataset = ConversationDataset(train_samples, tokenizer, cfg.max_seq_length)
    valid_dataset = ConversationDataset(valid_samples, tokenizer, cfg.max_seq_length) if valid_samples else None

    def collate_fn(batch):
        return collate_conversations(batch, tokenizer.pad_token_id)

    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.per_device_batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=False,
    )
    valid_loader = (
        DataLoader(
            valid_dataset,
            batch_size=max(1, cfg.per_device_batch_size),
            shuffle=False,
            collate_fn=collate_fn,
        )
        if valid_dataset
        else None
    )

    num_batches = len(train_loader)
    # ceil: a trailing partial accumulation window counts as one optimizer
    # step (the training loop below flushes it at the end of each epoch).
    steps_per_epoch = max(1, math.ceil(num_batches / cfg.grad_accum_steps))
    planned_steps = cfg.target_steps
    epochs = min(cfg.max_epochs, max(1, math.ceil(planned_steps / steps_per_epoch)))
    total_steps = epochs * steps_per_epoch

    print(f"Training plan: epochs={epochs}, steps/epoch≈{steps_per_epoch}, total_steps={total_steps}")

    # ------------------------------------------------------------------
    # Optimizer + scheduler
    # ------------------------------------------------------------------
    trainable_params = [p for p in model.parameters() if p.requires_grad]

    optimizer = None
    galore_handler = None

    if GaLoreAdamW is not None:
        try:
            galore_rank = min(256, max(4, int(cfg.max_seq_length * cfg.galore_rank_ratio)))
            try:
                optimizer = GaLoreAdamW(
                    trainable_params,
                    lr=cfg.learning_rate,
                    betas=(0.9, 0.999),
                    eps=1e-8,
                    weight_decay=cfg.weight_decay,
                    rank=galore_rank,
                )
                print(f"GaLoreAdamW initialized with rank={galore_rank}")
            except TypeError:
                # Some GaLoreAdamW signatures do not accept `rank`; retry
                # without it. Any failure here reaches the outer handler.
                optimizer = GaLoreAdamW(
                    trainable_params,
                    lr=cfg.learning_rate,
                    betas=(0.9, 0.999),
                    eps=1e-8,
                    weight_decay=cfg.weight_decay,
                )
                print("GaLoreAdamW initialized (without rank parameter)")
        except Exception as init_err:
            optimizer = None
            print(f"WARNING: GaLoreAdamW initialization failed: {init_err}. Using fallback projection.")

    if optimizer is None:
        optimizer = torch.optim.AdamW(
            trainable_params,
            lr=cfg.learning_rate,
            weight_decay=cfg.weight_decay,
            betas=(0.9, 0.999),
        )
        # galore_handler stays None only when the official optimizer is in use.
        galore_handler = project_gradients_low_rank

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=min(cfg.warmup_steps, total_steps // 2),
        num_training_steps=total_steps,
    )

    # ------------------------------------------------------------------
    # Training loop
    # ------------------------------------------------------------------
    global_step = 0
    running_loss = 0.0
    optimizer.zero_grad(set_to_none=True)
    nan_count = 0
    max_nan_steps = 5  # Stop if NaN persists for 5 consecutive micro-batches

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        model.train()
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}

            with torch.autocast(device_type=device_type, dtype=autocast_dtype, enabled=device_type != "cpu"):
                outputs = model(**batch)
                # Scale so that gradients summed over a window average the loss.
                loss = outputs.loss / cfg.grad_accum_steps

            if torch.isnan(loss) or torch.isinf(loss):
                nan_count += 1
                print(f"WARNING: NaN/Inf loss detected at step {global_step} (count: {nan_count}/{max_nan_steps})")
                if nan_count >= max_nan_steps:
                    print("ERROR: Too many NaN losses, stopping training. Try:")
                    print("   1. Reduce learning rate (set LEARNING_RATE=1e-5)")
                    print("   2. Reduce batch size (set BATCH_SIZE=1)")
                    print("   3. Reduce sequence length (set MAX_SEQ_LENGTH=512)")
                    return {"error": "NaN loss", "steps": global_step}
                # Drop whatever was accumulated and skip this micro-batch.
                optimizer.zero_grad(set_to_none=True)
                continue
            nan_count = 0  # Reset counter on valid loss

            loss.backward()
            running_loss += loss.item()

            window_full = (step + 1) % cfg.grad_accum_steps == 0
            # Flush a trailing partial window so its gradients neither get
            # lost nor leak into the next epoch's first window.
            last_batch = (step + 1) == num_batches
            if window_full or last_batch:
                # Gradient clipping to prevent explosion
                torch.nn.utils.clip_grad_norm_(trainable_params, cfg.max_grad_norm)

                if cfg.galore_project_on and galore_handler is not None:
                    galore_handler(model, rank_ratio=cfg.galore_rank_ratio)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1
                avg_loss = running_loss
                running_loss = 0.0

                if global_step % 5 == 0:
                    print(f"   step {global_step}/{total_steps} - loss: {avg_loss:.4f}")

                if global_step >= planned_steps:
                    break

        if global_step >= planned_steps:
            break

        val_loss = evaluate_model(model, valid_loader, device, device_type, autocast_dtype)
        if val_loss is not None:
            print(f"   Validation loss: {val_loss:.4f}")

    final_val_loss = evaluate_model(model, valid_loader, device, device_type, autocast_dtype)

    # ------------------------------------------------------------------
    # Persist adapter + summary
    # ------------------------------------------------------------------
    Path(cfg.adapter_path).mkdir(parents=True, exist_ok=True)
    model.to("cpu")
    model.save_pretrained(cfg.adapter_path)
    tokenizer.save_pretrained(cfg.adapter_path)

    metrics = {
        "train_steps": global_step,
        "validation_loss": final_val_loss,
        "device": str(device),
        "dtype": str(model_dtype),
        # Report the backend actually used: the fallback handler is installed
        # whenever GaLoreAdamW is missing OR failed to construct.
        "galore_backend": "official" if galore_handler is None else "fallback_svd",
    }

    with open(Path(cfg.adapter_path) / "training_summary.json", "w") as fp:
        json.dump(metrics, fp, indent=2)

    print(f"\nTraining complete! Adapter saved to: {cfg.adapter_path}")
    if final_val_loss is not None:
        print(f"Final validation loss: {final_val_loss:.4f}")

    return metrics


def fine_tune_model():
    """Run the whole fine-tuning workflow, from resume PDF to trained adapter.

    Builds a ``TrainingConfig``, creates the data and adapter directories,
    extracts the resume text, generates training samples, holds out the tail
    of the sample list for validation, writes ``train.jsonl`` / ``valid.jsonl``
    and finally hands over to ``train_with_dora_galore``.

    Returns:
        ``False`` when no resume text could be extracted or the training split
        is empty; otherwise whatever ``train_with_dora_galore`` returns.
    """
    banner = "=" * 60

    print("\n" + banner)
    print("Starting DoRA + GaLore Fine-tuning (PyTorch + PEFT)")
    print(banner + "\n")

    cfg = TrainingConfig()
    data_dir = cfg.data_dir
    adapter_dir = cfg.adapter_path

    # Show where files are going to be created.
    print(f"Working Directory: {cfg._script_dir}")
    print(f"Training Data Directory: {data_dir}")
    print(f"Model Save Directory: {adapter_dir}")
    print()

    # Make sure both output locations exist before anything is written.
    data_dir.mkdir(parents=True, exist_ok=True)
    Path(adapter_dir).mkdir(parents=True, exist_ok=True)
    for created in (data_dir, adapter_dir):
        print(f"Created directory: {created}")
    print()

    resume_text = extract_resume_info()
    if not resume_text:
        return False

    training_samples = generate_training_data(resume_text)

    # Hold out the last ~10% (at least one sample) for validation; if the
    # hold-out would swallow everything, train on all samples instead.
    sample_count = len(training_samples)
    if sample_count < 5:
        holdout = 1
    else:
        holdout = max(1, int(0.1 * sample_count))

    if sample_count > holdout:
        train_samples = training_samples[:-holdout]
        valid_samples = training_samples[-holdout:]
    else:
        train_samples = training_samples
        valid_samples = []

    train_path = str(data_dir / "train.jsonl")
    valid_path = str(data_dir / "valid.jsonl")

    create_training_jsonl(train_samples, train_path)
    create_training_jsonl(valid_samples, valid_path)

    # Summary of everything that was just written.
    n_train = len(train_samples)
    n_valid = len(valid_samples)
    print("\n" + banner)
    print("Training Data Summary")
    print(banner)
    print(f"Training Data Directory: {data_dir}")
    print(f"   ├─ Train file: {train_path}")
    print(f"   │  └─ Samples: {n_train}")
    print(f"   └─ Validation file: {valid_path}")
    print(f"      └─ Samples: {n_valid}")
    print(f"Model Save Directory: {adapter_dir}")
    print(f"Total Training Samples: {n_train}")
    print(f"Total Validation Samples: {n_valid}")
    print(banner + "\n")

    if n_train == 0:
        print("ERROR: Training data is empty, terminating training")
        return False

    return train_with_dora_galore(train_samples, valid_samples, cfg)

def extract_resume_info():
    """Extract raw text from PDF resume (supports local and Google Colab).

    The PDF is taken from the ``RESUME_PDF_PATH`` environment variable when it
    points to an existing path; otherwise a fixed list of candidate locations
    (working directory, its ``content`` subdirectory, Google Drive, Colab
    root, ...) is searched and the first existing file is used.

    Returns:
        The concatenated text of all pages, each page followed by a newline,
        or ``None`` when no resume file is found or the PDF cannot be read.
    """
    print("Reading resume PDF...")

    # Check if path specified via environment variable
    resume_path = os.getenv("RESUME_PDF_PATH", None)

    # If not specified (or stale), try common locations
    if not resume_path or not os.path.exists(resume_path):
        work_dir = _get_working_directory()
        possible_paths = [
            # Check current working directory
            work_dir / "F_CV_Chen_Linlin_formal_cv_itp.pdf",
            work_dir / "resume.pdf",
            work_dir / "CV.pdf",
            # Check content subdirectory (common in Colab)
            work_dir / "content" / "F_CV_Chen_Linlin_formal_cv_itp.pdf",
            work_dir / "content" / "resume.pdf",
            work_dir / "content" / "CV.pdf",
            # Check Google Drive if mounted
            Path("/content/drive/MyDrive/F_CV_Chen_Linlin_formal_cv_itp.pdf"),
            Path("/content/drive/MyDrive/resume.pdf"),
            Path("/content/drive/MyDrive/CV.pdf"),
            # macOS default location
            Path("/Users/oww/Dropbox/F_CV_Chen_Linlin_formal_cv_itp.pdf"),
            # Colab root directory
            Path("/content/F_CV_Chen_Linlin_formal_cv_itp.pdf"),
            Path("/content/resume.pdf"),
        ]

        resume_path = next((str(path) for path in possible_paths if path.exists()), None)

        if resume_path is None:
            print("ERROR: Resume file not found. Please place your resume PDF in one of the following locations:")
            for path in possible_paths:
                print(f"   - {path}")
            print("   Or set environment variable: RESUME_PDF_PATH=/path/to/resume.pdf")
            return None

        print(f"Found resume file: {resume_path}")

    try:
        import PyPDF2
        print(f"Reading: {resume_path}")
        page_texts = []
        with open(resume_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages, 1):
                # extract_text() may yield None for a page without a text
                # layer; treat that as an empty page instead of failing the
                # whole extraction on the string concatenation.
                page_text = page.extract_text() or ""
                page_texts.append(page_text)
                print(f"   Read page {page_num} ({len(page_text)} characters)")

        # One newline after every page, assembled in a single pass.
        text = "".join(page_text + "\n" for page_text in page_texts)

        total_chars = len(text)
        total_words = len(text.split())
        print(f"Resume extraction complete: {total_chars} characters, {total_words} words, {len(text.splitlines())} lines")
        return text
    except Exception as e:
        # Best effort by design: any read/parse failure is reported and
        # signalled to the caller as "no resume".
        print(f"ERROR: Failed to read PDF: {e}")
        return None

if __name__ == "__main__":
    result = fine_tune_model()
    # fine_tune_model() returns False when setup fails, otherwise the dict
    # produced by training. A run aborted on NaN losses returns a non-empty
    # (truthy) dict carrying an "error" key, which must not count as success.
    if result and "error" not in result:
        print("\nFine-tuning successful! You can now use the fine-tuned model")
        print(f"Training summary: {json.dumps(result, indent=2)}")
    else:
        print("\nWARNING: Fine-tuning failed, suggest using default deepseek-coder:latest")



Upgrading bitsandbytes to latest version...
bitsandbytes upgraded successfully
Created true DoRAConfig implementation (weight decomposition into magnitude + direction)
Detected Google Colab environment, using working directory: /content

Starting DoRA + GaLore Fine-tuning (PyTorch + PEFT)

Working Directory: /content
Training Data Directory: /content/torch_data
Model Save Directory: /content/fine-tuned-model

Created directory: /content/torch_data
Created directory: /content/fine-tuned-model

Reading resume PDF...
Detected Google Colab environment, using working directory: /content
Found resume file: /content/content/F_CV_Chen_Linlin_formal_cv_itp.pdf
Reading: /content/content/F_CV_Chen_Linlin_formal_cv_itp.pdf
   Read page 1 (11619 characters)
Resume extraction complete: 11620 characters, 1300 words, 101 lines
Generating training data from resume (enhanced/heavy-weight)...
Generated samples (including augmentation): 839
Generated samples (incl. augmentation/STAR/free QA): 869
Creating

## **README**
### - Load the adapter into the LLM
```
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


# 1. Load base model
base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# 2. Load adapter
model = PeftModel.from_pretrained(base_model, "./fine-tuned-model")

# 3. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-model")

# 4. Run inference
inputs = tokenizer("question", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0]))
```