In [None]:
!pip install torch huggingface_hub unsloth trl transformers datasets python-dotenv rouge_score evaluate nltk

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Do

In [None]:
#!/usr/bin/env python3
"""
Agentic DeepSeek R1 Auto-Finetuner with Gemini 1.5 Flash Integration
An intelligent fine-tuning system that uses Gemini for dynamic optimization.
"""

import os
import json
import random
from datetime import datetime
import torch
from huggingface_hub import login
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import google.generativeai as genai
from typing import Dict, Any, Optional
from dotenv import load_dotenv
load_dotenv('.env')

# Configuration - Initial values can be modified by the agent.
# NOTE: FineTuningAgent mutates this dict in place when auto-optimization or a
# retry plan adjusts hyperparameters.
CONFIG = {
    # Model Configuration
    "model_name": "unsloth/DeepSeek-R1-Distill-Llama-8B",
    "max_seq_length": 2048,
    "load_in_4bit": True,

    # Dataset Configuration
    "dataset_name": "chemouda/legal_reason",
    "dataset_config": "default",
    "dataset_split": "train[0:500]",
    "trust_remote_code": True,

    # LoRA Configuration
    "lora_rank": 16,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],

    # Training Configuration
    "batch_size": 2,
    "gradient_accumulation_steps": 4,
    "epochs": 10,
    "warmup_steps": 5,
    # NOTE(review): both "epochs" and "max_steps" are forwarded to
    # TrainingArguments; a positive max_steps is expected to take precedence,
    # so "epochs" has no effect while this stays > 0 - confirm this is intended.
    "max_steps": 5,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "lr_scheduler_type": "cosine",
    "optimizer": "adamw_8bit",

    # Output
    "output_dir": "outputs",
    "logging_steps": 10,

    # Agent Configuration
    "enable_agent": True,
    # Never commit credentials: the key is read from the environment
    # (populated from .env by load_dotenv above). Empty when unset.
    "gemini_api_key": os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY", ""),
    "auto_optimize": True,
    "auto_evaluate": True,
}

class Memory:
    """Shared scratchpad that the cooperating agents read from and write to."""

    def __init__(self):
        # One bucket per stage:
        #   plan       - the planner's strategy
        #   training   - hyperparameters and logs
        #   evaluation - scores and feedback
        self.store = dict(plan={}, training={}, evaluation={})

    def update(self, key: str, data: dict):
        """Merge `data` into the existing bucket `key` (KeyError if the bucket is unknown)."""
        bucket = self.store[key]
        bucket.update(data)

class DomainInjectorAgent:
    """Agent to detect domain and expert role from dataset samples."""

    # Returned whenever detection is skipped or fails, and used to fill in any
    # field the model omitted, so callers can always index both keys.
    DEFAULT_DOMAIN_INFO = {
        "domain_name": "general",
        "expert_role": "subject matter expert",
    }

    def __init__(self, gemini):
        # `gemini` is the generative model client; only its
        # `generate_content(prompt)` method is used here. May be None/falsy.
        self.gemini = gemini

    def detect_domain(self, sample_questions: list, dataset_name: Optional[str] = None) -> dict:
        """Detect domain and expert role from sample questions.

        Args:
            sample_questions: Example questions taken from the dataset.
            dataset_name: Name to mention in the prompt. Defaults to
                ``CONFIG['dataset_name']`` when omitted.

        Returns:
            A dict with exactly the keys ``domain_name`` and ``expert_role``.
            Falls back to the generic defaults when no client or no questions
            are given, when the request fails, or when the reply contains no
            usable JSON; individual missing/blank fields are defaulted too.
        """
        if not self.gemini or not sample_questions:
            return dict(self.DEFAULT_DOMAIN_INFO)

        if dataset_name is None:
            dataset_name = CONFIG['dataset_name']

        sample_text = "\n".join([f"Question {i+1}: {q}" for i, q in enumerate(sample_questions)])

        prompt = f"""
        Analyze these questions from dataset {dataset_name} and determine:
        1. The primary domain (single word or short phrase)
        2. The appropriate expert role description

        Questions:
        {sample_text}

        Return ONLY JSON format with:
        {{
            "domain_name": "domain",
            "expert_role": "expert description"
        }}

        Example expert roles:
        - "medical expert with advanced knowledge in clinical reasoning"
        - "legal expert with deep knowledge of case law"
        - "financial expert with expertise in market analysis"
        - "subject matter expert with specialized knowledge in [domain]"

        DO NOT SPECIFY A SUBDOMAIN IN domain_name GIVE A MORE GENERALIZABLE domain_name
        Example:
        - if detected domain_name is "Gastroenterology" make it "Medical"
        - if detected domain_name is "Nephrology" make it "Medical"
        - if detected domain_name is "Constitutional Law" make it "Legal"
        """

        try:
            response = self.gemini.generate_content(prompt)
            text = response.text or ""
            # The model may wrap the JSON in prose or code fences; keep only
            # the outermost {...} span.
            start_idx = text.find('{')
            end_idx = text.rfind('}') + 1
            if start_idx != -1 and end_idx > start_idx:
                parsed = json.loads(text[start_idx:end_idx])
                if isinstance(parsed, dict):
                    result = dict(self.DEFAULT_DOMAIN_INFO)
                    for field in result:
                        value = parsed.get(field)
                        # Only accept non-blank strings; anything else keeps the default.
                        if isinstance(value, str) and value.strip():
                            result[field] = value.strip()
                    return result
        except Exception as e:
            # Best effort: detection problems must not abort the pipeline.
            print(f"Domain detection failed: {e}")

        return dict(self.DEFAULT_DOMAIN_INFO)

class FineTuningAgent:
    """An agent that manages the fine-tuning process using Gemini for optimization."""

    # Config keys an LLM suggestion is never allowed to overwrite: credentials,
    # agent switches, model/dataset identity, output location, and max_steps
    # (the optimization prompt explicitly asks for it to stay the same).
    _PROTECTED_KEYS = frozenset({
        "gemini_api_key", "enable_agent", "auto_optimize", "auto_evaluate",
        "model_name", "dataset_name", "dataset_config", "dataset_split",
        "trust_remote_code", "output_dir", "max_steps",
    })
    # Hyperparameters that must stay whole numbers after any adjustment.
    _INTEGER_PARAMS = frozenset({"lora_rank", "lora_alpha", "batch_size"})
    # Score categories requested from the evaluator.
    _SCORE_KEYS = ("accuracy", "reasoning", "completeness", "clarity")

    def __init__(self, config: Dict[str, Any]):
        """Store `config` (kept by reference and mutated in place) and set up Gemini."""
        self.config = config
        self.gemini = None
        self.memory = Memory()
        self.domain_info = {"domain_name": "", "expert_role": ""}
        self.training_losses = []
        self.initialize_gemini()

    def initialize_gemini(self):
        """Initialize the Gemini client if enabled; leaves `self.gemini` as None otherwise."""
        if self.config.get("enable_agent", False):
            api_key = self.config.get("gemini_api_key")
            if not api_key:
                print("Failed to initialize Gemini agent: no API key configured.")
                self.gemini = None
                return
            try:
                genai.configure(api_key=api_key)
                self.gemini = genai.GenerativeModel('gemini-1.5-flash')
                print("Gemini 1.5 Flash agent initialized successfully.")
            except Exception as e:
                print(f"Failed to initialize Gemini agent: {e}")
                self.gemini = None

    def query_gemini(self, prompt: str, system_message: Optional[str] = None) -> str:
        """Query Gemini with a prompt and optional system message.

        Returns the reply text, or "" when no client is available or the
        request fails (the error is printed, not raised).
        """
        if not self.gemini:
            return ""

        try:
            messages = []
            if system_message:
                # The system message is sent as a first user turn followed by a
                # canned model acknowledgement.
                messages.append({"role": "user", "parts": [system_message]})
                messages.append({"role": "model", "parts": ["Understood."]})

            messages.append({"role": "user", "parts": [prompt]})

            response = self.gemini.generate_content(messages)
            return response.text
        except Exception as e:
            print(f"Gemini query failed: {e}")
            return ""

    @staticmethod
    def _json_span(text: str) -> str:
        """Return the outermost ``{...}`` substring of `text`; ValueError if there is none."""
        start_idx = text.find('{')
        end_idx = text.rfind('}') + 1
        if start_idx == -1 or end_idx <= start_idx:
            raise ValueError("no JSON object found in response")
        return text[start_idx:end_idx]

    @staticmethod
    def _is_number(value: Any) -> bool:
        """True for int/float values, excluding bool (which is an int subclass)."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    @classmethod
    def _coerce_like(cls, current: Any, suggested: Any) -> Any:
        """Convert `suggested` to the kind of value `current` is, or raise TypeError."""
        # The prompt asks for a 'reason' per parameter, so a suggestion may
        # arrive as {"value": ..., "reason": ...}; unwrap that shape if seen.
        if isinstance(suggested, dict) and "value" in suggested:
            suggested = suggested["value"]

        if isinstance(current, bool):
            if isinstance(suggested, bool):
                return suggested
        elif isinstance(current, int):
            if isinstance(suggested, int) and not isinstance(suggested, bool):
                return suggested
            if isinstance(suggested, float) and suggested.is_integer():
                return int(suggested)
        elif isinstance(current, float):
            if cls._is_number(suggested):
                return float(suggested)
        elif isinstance(suggested, type(current)):
            return suggested
        raise TypeError(
            f"expected a value like {current!r}, got {suggested!r}"
        )

    def optimize_config(self, dataset_info: Dict[str, Any]) -> Dict[str, Any]:
        """Use Gemini to optimize the configuration based on dataset info.

        Suggestions are applied to `self.config` in place, but only for keys
        that already exist, are not protected, and whose suggested value is
        type-compatible with the current one. Returns `self.config`.
        """
        if not self.config["auto_optimize"] or not self.gemini:
            return self.config

        # Never echo the API key into a prompt.
        visible_config = {k: v for k, v in self.config.items() if k != "gemini_api_key"}

        prompt = f"""
        You are an expert AI model training optimizer. Your task is to recommend optimal hyperparameters
        for fine-tuning the {self.config['model_name']} model on a {self.domain_info['domain_name']} dataset.

        Current configuration:
        {json.dumps(visible_config, indent=2)}

        Dataset information:
        {json.dumps(dataset_info, indent=2)}

        Please recommend optimized hyperparameters considering:
        1. The model size and architecture
        2. The dataset characteristics
        3. Available hardware (assuming Colab Notebook T4 16GB GPU)
        4. Best practices for efficient fine-tuning

        Provide your recommendations in JSON format with the same structure as the current config.
        Only suggest changes to all parameters ideally for the domain: batch_size, learning_rate,
        lora_rank, lora_alpha, and epochs.

        KEEP HIGHER EPOCHS and keep the max_steps exactly the same.

        Explain each change briefly in a 'reason' field for each parameter.
        """

        response = self.query_gemini(
            prompt,
            system_message=f"You are an expert AI model trainer specializing in efficient {self.domain_info['domain_name']} fine-tuning."
        )

        try:
            optimized = json.loads(self._json_span(response))
        except Exception as e:
            print(f"Failed to parse optimization recommendations: {e}")
            return self.config

        for key, suggested in optimized.items():
            # Unknown keys (e.g. "reason") and protected keys are ignored.
            if key not in self.config or key in self._PROTECTED_KEYS:
                continue
            try:
                value = self._coerce_like(self.config[key], suggested)
            except TypeError as e:
                print(f"Ignoring suggestion for {key}: {e}")
                continue
            if value == self.config[key]:
                continue  # nothing to change; avoid no-op "Updating" noise
            print(f"Updating {key} from {self.config[key]} to {value}")
            self.config[key] = value

        return self.config

    @classmethod
    def _extract_scores(cls, evaluation: Any) -> Optional[Dict[str, float]]:
        """Return the four category scores as floats, or None if any is missing/non-numeric."""
        scores = evaluation.get("scores") if isinstance(evaluation, dict) else None
        if not isinstance(scores, dict):
            return None
        try:
            return {key: float(scores[key]) for key in cls._SCORE_KEYS}
        except (KeyError, TypeError, ValueError):
            return None

    def evaluate_model(self, model, tokenizer, eval_samples: list) -> Dict[str, Any]:
        """Evaluate model on multiple samples and return aggregate scores.

        Returns {} when evaluation is disabled, no client is available, or no
        sample produced a complete set of scores. Otherwise returns the
        per-category averages over the successfully scored samples, their
        count, and the detected domain name.
        """
        if not self.config["auto_evaluate"] or not self.gemini:
            return {}

        scored = []
        for sample in eval_samples:
            question = sample["question"]
            response = generate_response(model, tokenizer, question, self.config, self.domain_info)
            scores = self._extract_scores(self.evaluate_single_response(question, response))
            if scores is not None:
                scored.append(scores)

        if not scored:
            return {}

        # Calculate average scores
        avg_scores = {
            key: sum(scores[key] for scores in scored) / len(scored)
            for key in self._SCORE_KEYS
        }

        return {
            "scores": avg_scores,
            "num_samples": len(scored),
            "domain": self.domain_info["domain_name"]
        }

    def evaluate_single_response(self, question: str, response: str) -> Optional[Dict[str, Any]]:
        """Ask Gemini to grade one response; returns the parsed JSON or None on failure."""
        prompt = f"""
        As a {self.domain_info['expert_role']}, rate this response (1-10) on:
        1. Accuracy for {self.domain_info['domain_name']} domain
        2. Logical Reasoning
        3. Completeness
        4. Clarity

        Question: {question}
        Response: {response}

        Return JSON ONLY with scores (1-10 for each category) and specific feedback:
        {{
            "scores": {{
                "accuracy": 8,
                "reasoning": 7,
                "completeness": 9,
                "clarity": 8
            }},
            "feedback": "Detailed feedback here..."
        }}
        """

        evaluation = self.query_gemini(
            prompt,
            system_message=f"You are a {self.domain_info['expert_role']} evaluating AI responses for {self.domain_info['domain_name']} accuracy."
        )

        try:
            span = self._json_span(evaluation)

            # Echo the raw evaluator verdict so it shows up in the run log.
            print(span)

            return json.loads(span)
        except Exception as e:
            print(f"Failed to parse evaluation: {e}")
            return None

    def generate_retry_plan(self, evaluation_results: Dict[str, Any], previous_losses: list) -> Dict[str, Any]:
        """Generate a new training plan based on evaluation results and previous losses.

        Returns the parsed plan with hyperparameters sanitized:
        - learning_rate, lora_rank and lora_alpha are limited to +/-20% of the
          current config value; the LoRA values are rounded to integers.
        - batch_size is rounded to an integer >= 1.
        - lora_dropout must be a number in [0, 1).
        Entries that are not numeric are dropped from the plan. Returns {} when
        no plan could be obtained or parsed.
        """
        PLANNER_PROMPT = f"""Analyze these training results for {self.domain_info['domain_name']} fine-tuning:
        Evaluation metrics: {json.dumps(evaluation_results, indent=2)}
        Training losses: {previous_losses[-5:] if len(previous_losses) >= 5 else previous_losses}
        Current config: {json.dumps({k: v for k, v in self.config.items() if k in ['learning_rate', 'batch_size', 'lora_rank', 'lora_alpha', 'lora_dropout']}, indent=2)}

        Suggest conservative hyperparameter adjustments considering:
        1. Current performance
        2. Training loss trajectory
        3. Best practices for {self.domain_info['domain_name']} fine-tuning

        Return JSON with adjusted parameters and reasoning for each change.
        Focus on stable convergence rather than aggressive optimization."""

        response = self.query_gemini(
            PLANNER_PROMPT,
            system_message=f"You are an AI training planner specializing in stable fine-tuning for {self.domain_info['domain_name']}."
        )

        try:
            plan = json.loads(self._json_span(response))
        except Exception as e:
            print(f"Failed to parse retry plan: {e}")
            return {}

        # Apply conservative adjustments (limit changes to 20% of original value)
        for param in ['learning_rate', 'lora_rank', 'lora_alpha']:
            if param not in plan or param not in self.config:
                continue
            original = self.config[param]
            suggested = plan[param]
            if not self._is_number(suggested) or not self._is_number(original):
                print(f"Ignoring non-numeric retry suggestion for {param}: {suggested!r}")
                plan.pop(param)
                continue
            # Limit adjustment to ±20% of original value
            adjusted = min(max(suggested, original * 0.8), original * 1.2)
            if param in self._INTEGER_PARAMS:
                # LoRA rank/alpha must be whole numbers; the clamp yields floats.
                adjusted = max(1, int(round(adjusted)))
            plan[param] = adjusted
            plan[f"{param}_reason"] = str(plan.get(f"{param}_reason", "")) + f" (Limited to ±20% adjustment from original value {original})"

        # The caller also applies these two straight from the plan, so make
        # sure they are usable values.
        if "batch_size" in plan:
            if self._is_number(plan["batch_size"]):
                plan["batch_size"] = max(1, int(round(plan["batch_size"])))
            else:
                print(f"Ignoring non-numeric retry suggestion for batch_size: {plan['batch_size']!r}")
                plan.pop("batch_size")
        if "lora_dropout" in plan:
            dropout = plan["lora_dropout"]
            if not self._is_number(dropout) or not 0 <= dropout < 1:
                print(f"Ignoring invalid retry suggestion for lora_dropout: {dropout!r}")
                plan.pop("lora_dropout")

        return plan

def format_dataset(examples, domain_info):
    """Turn a batch of raw columns into JSON-encoded instruction records.

    Reads the ``Case_Description``, ``Argument`` and ``Outcome`` columns of
    `examples` and returns ``{"text": [...]}`` with one JSON string per row.
    Each record carries the expert instruction built from `domain_info`, the
    case as the question, and the argument wrapped in <think> tags followed
    by the outcome as the response.
    """
    rows = zip(examples["Case_Description"], examples["Argument"], examples["Outcome"])

    return {
        "text": [
            json.dumps({
                "instruction": f"You are a {domain_info['expert_role']}. Answer this {domain_info['domain_name']} question.",
                "question": case_text,
                "response": f"<think>{reasoning}</think>\n{verdict}"
            })
            for case_text, reasoning, verdict in rows
        ]
    }

def _with_domain_defaults(domain_info):
    """Return a copy of `domain_info` with blank/missing fields replaced by generic defaults."""
    completed = dict(domain_info) if isinstance(domain_info, dict) else {}
    fallback = {"domain_name": "general", "expert_role": "subject matter expert"}
    for field, default in fallback.items():
        if not completed.get(field):
            completed[field] = default
    return completed


def load_and_prepare_dataset(agent: FineTuningAgent):
    """Load and prepare the dataset for fine-tuning, returning dataset and eval samples.

    Steps: load the configured split, optionally detect the domain from up to
    five random samples, optionally let the agent optimize its config, format
    every row into a JSON instruction record, and split 90/10.

    Returns:
        (train_dataset, eval_samples) where eval_samples is a list of up to
        five decoded records (instruction/question/response) from the held-out
        10%.
    """
    print("Loading dataset...")
    dataset = load_dataset(
        agent.config["dataset_name"],
        agent.config.get("dataset_config"),
        split=agent.config["dataset_split"],
        trust_remote_code=agent.config.get("trust_remote_code", True)
    )
    print(dataset)

    # Get 5 random samples for domain detection
    use_agent = bool(agent.config["enable_agent"] and agent.gemini)
    sample_questions = []
    if use_agent:
        sample_indices = random.sample(range(len(dataset)), min(5, len(dataset)))
        sample_questions = [dataset[i]["Case_Description"] for i in sample_indices]
        domain_agent = DomainInjectorAgent(agent.gemini)
        agent.domain_info = domain_agent.detect_domain(sample_questions)

    # Without this, a disabled agent (or a partial detection result) leaves
    # blank fields and every prompt reads "You are a . Answer this  question."
    agent.domain_info = _with_domain_defaults(agent.domain_info)

    if use_agent:
        print(f"\n🔍 Detected Domain: {agent.domain_info['domain_name']}")
        print(f"👨‍⚕️ Expert Role: {agent.domain_info['expert_role']}\n")

    # Get dataset info for optimization
    dataset_info = {
        "size": len(dataset),
        "features": list(dataset.features.keys()),
        "sample_questions": sample_questions
    }

    # Optimize configuration based on dataset
    if agent.config["auto_optimize"]:
        agent.optimize_config(dataset_info)

    print("Formatting dataset...")

    def format_example(example):
        # One JSON record per row; the argument becomes the <think> section
        # and the outcome the final answer.
        return {
            "text": json.dumps({
                "instruction": f"You are a {agent.domain_info['expert_role']}. Answer this {agent.domain_info['domain_name']} question.",
                "question": example["Case_Description"],
                "response": f"<think>{example['Argument']}</think>\n{example['Outcome']}"
            })
        }

    formatted_dataset = dataset.map(format_example)

    # Split dataset into train and eval (90/10 split)
    split_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

    # Store evaluation samples for later use
    eval_samples = []
    for i in range(min(5, len(split_dataset["test"]))):
        eval_sample = json.loads(split_dataset["test"][i]["text"])
        eval_samples.append(eval_sample)

    # Print some evaluation samples for inspection
    print("\nEvaluation samples to be used:")
    for i, sample in enumerate(eval_samples):
        print(f"\nSample {i+1}:")
        print(f"Instruction: {sample['instruction']}")
        print(f"Question: {sample['question'][:100]}...")
        print(f"Response: {sample['response'][:100]}...")

    return split_dataset["train"], eval_samples

def setup_lora_model(model, config: Dict[str, Any]):
    """Attach LoRA adapters to `model` using the LoRA entries of `config`.

    Reads ``lora_rank``, ``target_modules``, ``lora_alpha`` and
    ``lora_dropout`` from `config`; the remaining options are fixed here.
    Returns whatever ``FastLanguageModel.get_peft_model`` returns.
    """
    print("Setting up LoRA...")

    # Options taken from the (possibly agent-tuned) configuration.
    tunable_options = dict(
        r=config["lora_rank"],
        target_modules=config["target_modules"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
    )
    # Options that are constant for every run.
    fixed_options = dict(
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    lora_model = FastLanguageModel.get_peft_model(model, **tunable_options, **fixed_options)
    return lora_model

def run_training(model, tokenizer, dataset, config: Dict[str, Any], agent: FineTuningAgent):
    """Run the fine-tuning process.

    Builds TrainingArguments and an SFTTrainer from `config`, trains, and
    records the logged training losses on ``agent.training_losses`` so the
    retry planner can inspect them.

    Returns:
        The trainer after ``train()`` has completed.
    """
    print("Preparing for training...")

    # Determine precision: prefer bf16 where the hardware supports it.
    bf16 = is_bfloat16_supported()
    fp16 = not bf16

    # With a step-limited run shorter than the logging interval no per-step
    # loss is ever logged (e.g. max_steps=5, logging_steps=10), which leaves
    # the retry planner without any loss signal. Log at least once.
    max_steps = config["max_steps"]
    logging_steps = config["logging_steps"]
    if max_steps > 0 and logging_steps > max_steps:
        logging_steps = max_steps

    # Training arguments
    training_args = TrainingArguments(
        per_device_train_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],
        num_train_epochs=config["epochs"],
        warmup_steps=config["warmup_steps"],
        max_steps=max_steps,
        learning_rate=config["learning_rate"],
        fp16=fp16,
        bf16=bf16,
        logging_steps=logging_steps,
        optim=config["optimizer"],
        weight_decay=config["weight_decay"],
        lr_scheduler_type=config["lr_scheduler_type"],
        seed=3407,
        output_dir=config["output_dir"],
        report_to="none",
    )

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=config["max_seq_length"],
        args=training_args,
    )

    # Start training
    print("Starting fine-tuning...")
    trainer.train()

    # Store losses for retry planning. Always overwrite so a retry never sees
    # the previous attempt's losses.
    history = getattr(trainer.state, "log_history", None) or []
    losses = [log["loss"] for log in history if "loss" in log]
    if not losses:
        # Fall back to the end-of-run summary entry if no step was logged.
        losses = [log["train_loss"] for log in history if "train_loss" in log]
    agent.training_losses = losses

    return trainer

# Hyperparameters a retry plan is allowed to change between attempts.
_RETRY_TUNABLE_KEYS = ('learning_rate', 'batch_size', 'lora_rank', 'lora_alpha', 'lora_dropout')
# Of those, the ones that must be whole numbers.
_RETRY_INTEGER_KEYS = ('batch_size', 'lora_rank', 'lora_alpha')


def _print_config_changes(original_config, current_config):
    """Print every retry-tunable key whose value differs from the original config."""
    for key in _RETRY_TUNABLE_KEYS:
        if key in original_config and key in current_config:
            if current_config[key] != original_config[key]:
                print(f"  - {key}: {original_config[key]} → {current_config[key]}")


def train_with_retry(agent, model, tokenizer, train_dataset, eval_samples, max_retries=2):
    """Training loop with evaluation and retry logic.

    Trains up to `max_retries` times. After each attempt the model is scored
    by the agent; the best-scoring model is saved to disk. If the accuracy is
    below 8/10 and attempts remain, the agent proposes adjusted
    hyperparameters, the base model is reloaded, and training is repeated.

    Returns:
        (best_model, best_tokenizer) - the in-memory model/tokenizer of the
        best attempt; both are None if `max_retries` < 1.
    """
    import gc

    best_score = -1
    best_model = None
    best_tokenizer = None
    best_path = None
    original_config = agent.config.copy()  # Store original config for comparison

    for attempt in range(max_retries):
        print(f"\n=== Training Attempt {attempt + 1}/{max_retries} ===")

        # Print config changes if this is a retry
        if attempt > 0:
            print("\n🔧 Configuration changes for this attempt:")
            _print_config_changes(original_config, agent.config)

        # Train
        trainer = run_training(model, tokenizer, train_dataset, agent.config, agent)

        # Evaluate on 5 evaluation samples
        evaluation = agent.evaluate_model(trainer.model, tokenizer, eval_samples)
        avg_accuracy = evaluation.get("scores", {}).get("accuracy", 0) if evaluation else 0
        print(f"💡 Evaluation Score (accuracy): {avg_accuracy:.2f}/10 (based on {evaluation.get('num_samples', 0)} samples)")

        # Save model if it's the best so far
        if avg_accuracy > best_score:
            print("🎯 New best model found. Saving...")
            best_score = avg_accuracy
            best_model = trainer.model
            best_tokenizer = tokenizer
            best_path = save_model(best_model, best_tokenizer, agent.config)

        # Stop if good enough
        if avg_accuracy >= 8:
            print("✅ Threshold met. Stopping further retries.")
            break

        # Retry if allowed
        if attempt < max_retries - 1:
            print("🔁 Preparing for retry...")
            retry_plan = agent.generate_retry_plan(evaluation, agent.training_losses)

            # Apply retry plan, skipping anything that is not a usable number.
            for key in _RETRY_TUNABLE_KEYS:
                if key not in retry_plan:
                    continue
                value = retry_plan[key]
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    print(f"Ignoring non-numeric retry suggestion for {key}: {value!r}")
                    continue
                if key in _RETRY_INTEGER_KEYS:
                    # LoRA rank/alpha and batch size must be whole numbers.
                    value = max(1, int(round(value)))
                print(f"🔧 Adjusting {key} from {agent.config[key]} to {value}")
                agent.config[key] = value

            # Free memory - CRITICAL. References must be dropped BEFORE the
            # collector and the CUDA cache are flushed, otherwise nothing is
            # released. NOTE: the best model so far is still referenced by
            # `best_model` (it is the return value) and therefore stays
            # resident; only the trainer state and non-best models are freed.
            del trainer, model, tokenizer
            gc.collect()
            torch.cuda.empty_cache()

            # Reload model with new config. The token comes from the
            # environment; None is passed when it is not set.
            hf_token = os.environ.get("HF_TOKEN") or None
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=agent.config["model_name"],
                max_seq_length=agent.config["max_seq_length"],
                dtype=None,
                load_in_4bit=agent.config["load_in_4bit"],
                token=hf_token,
            )
            model = setup_lora_model(model, agent.config)

    # Print final config changes summary
    print("\n📊 Final Configuration Changes Summary:")
    _print_config_changes(original_config, agent.config)

    print(f"\n🏁 Best model achieved accuracy: {best_score:.2f}/10")
    print(f"📦 Saved at: {best_path}")
    return best_model, best_tokenizer

def save_model(model, tokenizer, config: Dict[str, Any]):
    """Write the model and tokenizer to a fresh timestamped directory.

    The directory is ``<output_dir>/finetuned_model_<YYYYmmdd_HHMMSS>``; the
    path is returned so callers can report where the artifacts live.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = "{}/finetuned_model_{}".format(config['output_dir'], stamp)

    print(f"Saving model to {save_path}...")
    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)
    return save_path

def generate_response(model, tokenizer, question: str, config: Dict[str, Any], domain_info: Dict[str, str]):
    """Generate a response using the fine-tuned model.

    Builds the same JSON record used for training (with an empty response),
    generates a continuation on the GPU, and returns only the newly generated
    text - not the echoed prompt.

    Args:
        model: The model to generate with; switched to inference mode here.
        tokenizer: Tokenizer matching `model`.
        question: The question to answer.
        config: Run configuration; ``max_new_tokens`` is honoured when present
            (default 1200).
        domain_info: Dict providing ``expert_role`` and ``domain_name``.

    Returns:
        The ``response`` field if the generated text is a JSON object that has
        one, otherwise the text after a "### Response:" marker if present,
        otherwise the generated text itself.
    """
    FastLanguageModel.for_inference(model)

    prompt = json.dumps({
        "instruction": f"You are a {domain_info['expert_role']}. Answer this {domain_info['domain_name']} question.",
        "question": question,
        "response": ""
    })

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=config.get("max_new_tokens", 1200),
        use_cache=True,
    )

    # The returned sequence starts with the prompt tokens; slice them off and
    # drop special tokens so the evaluator only sees the model's answer.
    # NOTE(review): assumes <think>/</think> are not registered as special
    # tokens in this tokenizer and therefore survive decoding - confirm.
    prompt_length = inputs.input_ids.shape[1]
    generated = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]

    try:
        parsed = json.loads(generated)
        if isinstance(parsed, dict) and "response" in parsed:
            return parsed["response"]
    except ValueError:
        # Not JSON (the common case for free-form generations): fall through
        # to the plain-text handling below.
        pass

    # Try to extract just the response part if JSON parsing fails
    if "### Response:" in generated:
        return generated.split("### Response:")[1]
    return generated

def final_evaluation(model, tokenizer, eval_samples, agent):
    """Run final evaluation on all stored evaluation samples.

    For each sample a response is generated and graded by the agent. The
    per-category averages are taken over the samples that were successfully
    scored (matching how the agent aggregates during training), printed, and
    returned.

    Returns:
        Dict with the average ``accuracy``, ``reasoning``, ``completeness``
        and ``clarity``; all 0.0 when nothing could be scored (including an
        empty `eval_samples`).
    """
    print("\n=== Final Model Evaluation ===")

    total_scores = {"accuracy": 0.0, "reasoning": 0.0, "completeness": 0.0, "clarity": 0.0}
    scored_samples = 0
    for i, sample in enumerate(eval_samples):
        question = sample["question"]
        print(f"\nQuestion {i+1}: {question[:100]}...")

        # Generate response with the fine-tuned model
        response = generate_response(model, tokenizer, question, agent.config, agent.domain_info)
        print(f"Model Response: {response[:200]}...")

        # Evaluate response
        evaluation = agent.evaluate_single_response(question, response)
        scores = evaluation.get("scores") if isinstance(evaluation, dict) else None
        if not isinstance(scores, dict):
            print("Evaluation unavailable for this sample; skipping.")
            continue

        print(f"Evaluation Scores: {scores}")
        try:
            numeric = {key: float(scores.get(key, 0)) for key in total_scores}
        except (TypeError, ValueError):
            print("Evaluation scores are not numeric; skipping.")
            continue

        for key, value in numeric.items():
            total_scores[key] += value
        scored_samples += 1

    # Calculate averages over the samples that actually received scores, so a
    # failed evaluation does not count as a zero and an empty run cannot
    # divide by zero.
    if scored_samples:
        avg_scores = {k: v / scored_samples for k, v in total_scores.items()}
    else:
        print("\nNo samples could be scored; reporting zeros.")
        avg_scores = {k: 0.0 for k in total_scores}

    print("\n=== Average Scores ===")
    print(f"Accuracy: {avg_scores['accuracy']:.2f}/10")
    print(f"Reasoning: {avg_scores['reasoning']:.2f}/10")
    print(f"Completeness: {avg_scores['completeness']:.2f}/10")
    print(f"Clarity: {avg_scores['clarity']:.2f}/10")
    print(f"Overall: {sum(avg_scores.values())/len(avg_scores):.2f}/10")

    return avg_scores

def main():
    """Run the whole pipeline: agent setup, data prep, training with retries, final evaluation."""
    # Initialize the agent. It is published as a module-level global so it can
    # be inspected from later notebook cells after main() returns.
    global agent
    agent = FineTuningAgent(CONFIG)

    # Login to Hugging Face if token is available. The token is read from the
    # environment (populated from .env by load_dotenv) instead of being
    # hardcoded; without it the login step is skipped.
    hf_token = os.environ.get("HF_TOKEN") or None
    if hf_token:
        login(hf_token)

    # Load and prepare dataset (with potential auto-optimization)
    train_dataset, eval_samples = load_and_prepare_dataset(agent)

    # Load base model
    print("Loading base model...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=agent.config["model_name"],
        max_seq_length=agent.config["max_seq_length"],
        dtype=None,
        load_in_4bit=agent.config["load_in_4bit"],
        token=hf_token
    )

    # Setup LoRA model
    model = setup_lora_model(model, agent.config)

    # Run training with retry logic
    model, tokenizer = train_with_retry(agent, model, tokenizer, train_dataset, eval_samples)

    # Run final evaluation on all evaluation samples
    final_scores = final_evaluation(model, tokenizer, eval_samples, agent)

    print(f"\nFine-tuning complete! Final evaluation score: {sum(final_scores.values())/len(final_scores):.2f}/10")

if __name__ == "__main__":
    main()

Gemini 1.5 Flash agent initialized successfully.
Loading dataset...


legal_reason.csv:   0%|          | 0.00/26.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['ID', 'Case_Description', 'Argument', 'Technique', 'Category', 'Outcome', 'Court_Level', 'Key_Statutes_Cited'],
    num_rows: 100
})

🔍 Detected Domain: Legal
👨‍⚕️ Expert Role: Legal expert with deep knowledge of case law and civil procedure

Updating model_name from unsloth/DeepSeek-R1-Distill-Llama-8B to unsloth/DeepSeek-R1-Distill-Llama-8B
Updating max_seq_length from 2048 to 2048
Updating load_in_4bit from True to True
Updating dataset_name from chemouda/legal_reason to chemouda/legal_reason
Updating dataset_config from default to default
Updating dataset_split from train[0:500] to train[0:500]
Updating trust_remote_code from True to True
Updating lora_rank from 16 to 8
Updating lora_alpha from 16 to 16
Updating lora_dropout from 0.05 to 0.05
Updating target_modules from ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'] to ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
Updating batch_size from 2 to 

Map:   0%|          | 0/100 [00:00<?, ? examples/s]


Evaluation samples to be used:

Sample 1:
Instruction: You are a Legal expert with deep knowledge of case law and civil procedure. Answer this Legal question.
Question: Family law case involving spousal support...
Response: <think>The court determines appropriate spousal support based on the duration of the marriage and fi...

Sample 2:
Instruction: You are a Legal expert with deep knowledge of case law and civil procedure. Answer this Legal question.
Question: Wrongful termination claim based on retaliation...
Response: <think>The employee was fired shortly after reporting workplace harassment, indicating retaliatory m...

Sample 3:
Instruction: You are a Legal expert with deep knowledge of case law and civil procedure. Answer this Legal question.
Question: Civil rights lawsuit against police misconduct...
Response: <think>The plaintiff alleges excessive force was used during an arrest, violating their civil rights...

Sample 4:
Instruction: You are a Legal expert with deep knowledge

model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Setting up LoRA...


Unsloth 2025.3.19 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.



=== Training Attempt 1/2 ===
Preparing for training...


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/90 [00:00<?, ? examples/s]

Starting fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 90 | Num Epochs = 1 | Total steps = 5
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 20,971,520/8,000,000,000 (0.26% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss


{
  "scores": {
    "accuracy": 8,
    "reasoning": 7,
    "completeness": 9,
    "clarity": 8
  },
  "feedback": "The response demonstrates a good understanding of the key factors considered in spousal support cases.  It correctly identifies financial resources, length of marriage, health, employability, adherence to legal standards (including mention of the Uniform Marriage and Divorce Act, although the applicability varies by jurisdiction and should be specified), and the needs-based approach as crucial elements. The discussion of potential errors by the trial court is also insightful and relevant.  The mention of setting precedent is accurate.

However, the reasoning could be improved by providing specific examples of how a failure in each area could lead to a reversal. For instance, instead of simply stating that the trial court 'may not have adequately evaluated the financial situation,' it would be stronger to give an example:  'For example, if the trial court failed to consider