# AgileAI training notebook

## Preprocessing

In [1]:
pip install pypdf


Defaulting to user installation because normal site-packages is not writeable
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os

INPUT_FOLDER = "dataset"          # folder where your PDFs are saved
OUTPUT_FOLDER = "clean_text"      # output folder for processed text

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

print("Folders ready")


Folders ready


In [2]:
import re
from pypdf import PdfReader
import os
from glob import glob

def extract_text_from_pdf(path: str) -> str:
    reader = PdfReader(path)
    return "\n".join((page.extract_text() or "") for page in reader.pages)


def clean_project_text(raw_text: str) -> str:
    # Normalize newlines
    text = raw_text.replace("\r\n", "\n").replace("\r", "\n")

    # Remove pure page numbers
    lines = []
    for line in text.splitlines():
        if re.fullmatch(r"\s*\d+\s*", line):  # line is just a number
            continue
        lines.append(line.rstrip())
    text = "\n".join(lines)

    # 1Ô∏è‚É£ Try to start from the REAL "1. INTRODUCTION"
    # We look for the exact string "1. INTRODUCTION" (case-sensitive) because
    # TOC usually uses "1. Introduction" and body uses full caps.
    idx = text.find("1. INTRODUCTION")
    if idx != -1:
        text = text[idx:]
    else:
        # Fallback: if not found, keep whole text (e.g., FarmAuto style docs)
        text = text.lstrip()

    # 2Ô∏è‚É£ Drop REFERENCES / RELATED WORK / BIBLIOGRAPHY and everything after
    drop_pattern = re.compile(
        r"(?im)^\s*(REFERENCES|REFERENCE|RELATED WORKS?|BIBLIOGRAPHY)\b.*$",
        re.MULTILINE,
    )
    match = drop_pattern.search(text)
    if match:
        text = text[:match.start()]

    # 3Ô∏è‚É£ Collapse multiple blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)

    # 4Ô∏è‚É£ Strip leading/trailing whitespace
    text = text.strip()

    return text


def preprocess_pdf_folder(input_dir: str, output_dir: str):
    os.makedirs(output_dir, exist_ok=True)

    pdf_paths = glob(os.path.join(input_dir, "*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs in {input_dir}")

    for pdf_path in pdf_paths:
        base = os.path.splitext(os.path.basename(pdf_path))[0]
        out_path = os.path.join(output_dir, f"{base}_clean.txt")

        raw = extract_text_from_pdf(pdf_path)
        cleaned = clean_project_text(raw)

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

        print(f"‚úÖ Saved cleaned: {out_path}")


In [3]:
preprocess_pdf_folder(INPUT_FOLDER, OUTPUT_FOLDER)

Found 20 PDFs in dataset
‚úÖ Saved cleaned: clean_text/NotesTaker_clean.txt
‚úÖ Saved cleaned: clean_text/TravelBuddy_clean.txt
‚úÖ Saved cleaned: clean_text/AquaGuard_clean.txt
‚úÖ Saved cleaned: clean_text/FarmAuto_clean.txt
‚úÖ Saved cleaned: clean_text/ShopEase_clean.txt
‚úÖ Saved cleaned: clean_text/RideSense_clean.txt
‚úÖ Saved cleaned: clean_text/MovieStreaming_clean.txt
‚úÖ Saved cleaned: clean_text/TrackFleet_clean.txt
‚úÖ Saved cleaned: clean_text/HealthConnect_clean.txt
‚úÖ Saved cleaned: clean_text/AgroVision_clean.txt
‚úÖ Saved cleaned: clean_text/LearnMate_clean.txt
‚úÖ Saved cleaned: clean_text/FoodOrder_clean.txt
‚úÖ Saved cleaned: clean_text/HealthInsight_clean.txt
‚úÖ Saved cleaned: clean_text/HomeSense_clean.txt
‚úÖ Saved cleaned: clean_text/SmartFit_clean.txt
‚úÖ Saved cleaned: clean_text/MedAlert_clean.txt
‚úÖ Saved cleaned: clean_text/SalesPulse_clean.txt
‚úÖ Saved cleaned: clean_text/QuizMaster_clean.txt
‚úÖ Saved cleaned: clean_text/SmartDetect_clean.txt
‚úÖ Sav

In [4]:
with open("clean_text/FoodOrder_clean.txt", "r", encoding="utf-8") as f:
    print(f.read()[:800])

with open("clean_text/FarmAuto_clean.txt", "r", encoding="utf-8") as f:
    print(f.read()[:800])


1. INTRODUCTION
The demand for online food ordering has rapidly increased due to changing consumer preferences,
time constraints, and the convenience of home delivery. Traditional dine-in and phone-based
ordering methods are limited by communication gaps, long wait times, and lack of transparency in
order tracking. Customers expect a seamless digital experience that allows them to browse menus,
customize orders, and receive timely delivery updates.
DineEasy is a web-based food ordering system designed to connect customers with restaurants
through an intuitive online platform. The application enables users to browse menus, place orders,
and track delivery status while providing restaurant owners with tools to manage orders and menu
items effectively. The system aims to enhance ordering effi
FarmAuto - Smart Irrigation and Resource Optimization System
Page 1
TITLE PAGE
Project Title: FarmAuto - IoT Enabled Smart Irrigation and Resource Optimization
System
Prepared For: Government Agricul

In [5]:
import os
import json
from glob import glob

CLEAN_TEXT_DIR = "clean_text"
TRAIN_JSON_DIR = "training_sample"

def normalize_stem(name: str) -> str:
    """
    Normalize filename stem so txt/json can be matched.
    Example: 'MovieStream_Project_clean' -> 'moviestream'
             'movie_stream' -> 'moviestream'
    """
    base = name.lower()
    base = base.replace(".txt", "").replace(".json", "")
    base = base.replace("_project", "")
    base = base.replace("_clean", "")
    base = base.replace(" ", "")
    base = base.replace("-", "")
    return base

def load_clean_texts(clean_dir=CLEAN_TEXT_DIR):
    txt_map = {}
    for path in glob(os.path.join(clean_dir, "*.txt")):
        stem = os.path.basename(path)
        key = normalize_stem(stem)
        with open(path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
        txt_map[key] = {"path": path, "text": txt}
    return txt_map

def load_json_labels(json_dir=TRAIN_JSON_DIR):
    json_map = {}
    for path in glob(os.path.join(json_dir, "*.json")):
        stem = os.path.basename(path)
        key = normalize_stem(stem)
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        json_map[key] = {"path": path, "data": data}
    return json_map

clean_map = load_clean_texts()
json_map = load_json_labels()

print("Clean texts:", len(clean_map))
print("JSON labels:", len(json_map))


Clean texts: 20
JSON labels: 20


In [6]:
training_pairs = []

for key, clean_entry in clean_map.items():
    if key not in json_map:
        print(f"‚ö†Ô∏è No JSON found for: {key} ({clean_entry['path']})")
        continue

    label_entry = json_map[key]
    project_text = clean_entry["text"]
    label_json = label_entry["data"]

    training_pairs.append({
        "key": key,
        "project_text": project_text,
        "label_json": label_json
    })

print(f"\n‚úÖ Matched {len(training_pairs)} project(s) with both text and JSON.")



‚úÖ Matched 20 project(s) with both text and JSON.


In [7]:
example = training_pairs[1]
example["key"], example["project_text"][:500]


('shopease',
 '1. INTRODUCTION\nShopEase is a mobile application designed to address modern user needs through an intuitive and\nefficient digital platform. The goal of the project is to provide a seamless and accessible experience\nthat enables users to perform tasks conveniently from their smartphones.\n2. PROBLEM STATEMENT\nTraditional methods often result in inefficiencies, lack of centralization, and limited accessibility.\nUsers require mobile-friendly solutions that offer real-time information, consistent per')

## Build the Prompt

In [8]:
# ========== Title-only training prompts ==========

import json

def truncate(text: str, max_chars: int = 800) -> str:
    """
    Shorten project text for training prompts.
    """
    text = (text or "").strip()
    if len(text) <= max_chars:
        return text
    truncated = text[:max_chars]
    last_space = truncated.rfind(" ")
    if last_space != -1:
        truncated = truncated[:last_space]
    return truncated

# ---- EPIC TITLE TRAIN PROMPT ----
EPIC_TITLE_TRAIN_PROMPT = """
Project description:
{project_text}

Task:
- Write ONE short epic title that captures the main goal of the project.

Rules:
- Output ONLY the epic title on a single line.
- No labels, no quotes, no bullets, no numbering.

Epic title:
"""

# ---- FEATURE TITLES TRAIN PROMPT ----
FEATURE_TITLES_TRAIN_PROMPT = """
Project description:
{project_text}

Task:
- Propose exactly {num_features} high-level feature titles that break the project
  into major functional chunks.

Rules:
- Output EXACTLY {num_features} lines.
- Each line must be ONE feature title.
- No bullets, no numbering, no quotes.

Feature titles (one per line):
"""

# ---- STORY TITLES TRAIN PROMPT ----
STORY_TITLES_TRAIN_PROMPT = """
Project description:
{project_text}

Epic title:
{epic_title}

Feature title:
{feature_title}

Task:
- Propose exactly {num_stories} user story titles for this feature.

Rules:
- Output EXACTLY {num_stories} lines.
- Each line is a user story title.
- Prefer concise titles (they may start with "As a <role>, I want ..." but are OPTIONAL).
- No bullets, no numbering, no quotes.

User story titles (one per line):
"""


In [9]:
from datasets import Dataset
from copy import deepcopy

title_examples = []

for pair in training_pairs:
    project_text = pair["project_text"]
    label = deepcopy(pair["label_json"])
    
    proj = truncate(project_text, max_chars=800)
    epic_obj = label.get("epic", {}) or {}
    features = label.get("features", []) or []

    # ---------- EPIC TITLE EXAMPLE ----------
    epic_title = str(epic_obj.get("title", "")).strip()
    if epic_title:
        prompt = EPIC_TITLE_TRAIN_PROMPT.format(project_text=proj)
        completion = epic_title  # ONE line, no labels
        title_examples.append({"text": prompt + completion})

    # ---------- FEATURE TITLES EXAMPLE ----------
    feature_titles = []
    for f in features:
        t = str(f.get("title", "")).strip()
        if t:
            feature_titles.append(t)

    if feature_titles:
        num_feats = len(feature_titles)
        feat_prompt = FEATURE_TITLES_TRAIN_PROMPT.format(
            project_text=proj,
            num_features=num_feats,
        )
        feat_completion = "\n".join(feature_titles)  # one title per line
        title_examples.append({"text": feat_prompt + feat_completion})

    # ---------- STORY TITLES EXAMPLES (ONE PER FEATURE) ----------
    for f in features:
        f_title = str(f.get("title", "")).strip() or "Feature"
        stories = f.get("user_stories", []) or []

        story_titles = []
        for s in stories:
            st = str(s.get("title", "")).strip()
            if st:
                story_titles.append(st)

        if not story_titles:
            continue

        num_stories = len(story_titles)
        story_prompt = STORY_TITLES_TRAIN_PROMPT.format(
            project_text=proj,
            epic_title=epic_title if epic_title else "Project Epic",
            feature_title=f_title,
            num_stories=num_stories,
        )
        story_completion = "\n".join(story_titles)
        title_examples.append({"text": story_prompt + story_completion})

print(f"Total training examples: {len(title_examples)}")

dataset = Dataset.from_list(title_examples)
dataset


Total training examples: 141


Dataset({
    features: ['text'],
    num_rows: 141
})

## Training

In [10]:
!pip install -q transformers datasets accelerate


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [11]:
from copy import deepcopy

MAX_LEN = 1024  # keep it modest for GPU memory

def tokenize_record(record):
    # record["text"] contains prompt + JSON target
    encoded = tokenizer(
        record["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }

# Turn your python list `dataset` into a tokenized list
tokenized_ds = [tokenize_record(r) for r in dataset]

print(f"Tokenized samples: {len(tokenized_ds)}")
print("Example input_ids[:20]:", tokenized_ds[0]["input_ids"][:20])


Tokenized samples: 141
Example input_ids[:20]: [1, 29871, 13, 7653, 6139, 29901, 13, 29896, 29889, 19578, 1672, 14849, 9838, 13, 2111, 824, 14379, 5706, 2919, 18167]


In [16]:
!pip install dill==0.3.8 --force-reinstall
!pip show dill


Defaulting to user installation because normal site-packages is not writeable
[0mCollecting dill==0.3.8
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m116.3/116.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: dill
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.16.1 requires dill<0.3.8,>=0.3.0, but you have dill 0.3.8 which is incompatible.[0m[31m
[0mSuccessfully installed dill-0.3.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip ins

In [12]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # standard for LLAMA/TinyLlama
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.20437245579516677


In [13]:
import torch

def data_collator(features):
    batch = {
        "input_ids": [f["input_ids"] for f in features],
        "attention_mask": [f["attention_mask"] for f in features],
    }
    batch["labels"] = [f["input_ids"] for f in features]
    batch = {k: torch.tensor(v) for k, v in batch.items()}
    return batch


In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="agileai_tinyllama_qlora",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=5,
    save_steps=50,
    save_total_limit=2,
    fp16=True,      
    bf16=False,
    optim="paged_adamw_8bit",
    report_to="none",
    remove_unused_columns=False,
)


In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,  
    data_collator=data_collator,
)

trainer.train()


Step,Training Loss
5,11.9515
10,9.6115
15,5.4934
20,3.0334
25,1.308
30,0.8305
35,0.7394
40,0.7799
45,0.7048
50,0.6929


TrainOutput(global_step=350, training_loss=0.7595001772471837, metrics={'train_runtime': 268.6774, 'train_samples_per_second': 2.624, 'train_steps_per_second': 1.303, 'total_flos': 2375431068057600.0, 'train_loss': 0.7595001772471837, 'epoch': 4.96})

In [16]:
SAVE_DIR = "agileai_tinyllama_qlora_v4"   # or similar
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


('agileai_tinyllama_qlora_v4/tokenizer_config.json',
 'agileai_tinyllama_qlora_v4/special_tokens_map.json',
 'agileai_tinyllama_qlora_v4/tokenizer.model',
 'agileai_tinyllama_qlora_v4/added_tokens.json',
 'agileai_tinyllama_qlora_v4/tokenizer.json')

# INFERENCE

In [17]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import os

# üî¥ DO NOT rely on SAVE_DIR if you restarted the kernel
# üëá Put the actual folder name you used in training:
MODEL_DIR = "agileai_tinyllama_qlora_v4"   # or "agileai_tinyllama_qlora", etc.

assert os.path.isdir(MODEL_DIR), f"Model directory not found: {MODEL_DIR}"

gen_model = AutoPeftModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device_map="auto",
)

gen_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
if gen_tokenizer.pad_token is None:
    gen_tokenizer.pad_token = gen_tokenizer.eos_token

gen_model.eval()
print("‚úÖ Loaded trained AgileAI model from:", MODEL_DIR)


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32000. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


‚úÖ Loaded trained AgileAI model from: agileai_tinyllama_qlora_v4


In [30]:
# =========================
# Cell 1: helpers & cleaning
# =========================

import re
import math
from collections import Counter
import torch

# --- Truncate helper ---

def truncate(text: str, max_chars: int = 800) -> str:
    """
    Safe truncation for project text.
    """
    if not isinstance(text, str):
        return ""
    text = text.strip()
    if len(text) <= max_chars:
        return text
    truncated = text[:max_chars]
    last_space = truncated.rfind(" ")
    if last_space != -1:
        truncated = truncated[:last_space]
    return truncated

# --- Clean title lines ---

def _clean_title_line(line: str) -> str:
    line = line.strip()
    # remove bullets / numbering
    line = re.sub(r'^[\-\*\d\.\)\s]+', '', line)
    # remove "Feature X:" / "Story X:" prefixes
    line = re.sub(r'^Feature\s*\d+\s*:\s*', '', line, flags=re.IGNORECASE)
    line = re.sub(r'^Story\s*\d+\s*:\s*',   '', line, flags=re.IGNORECASE)
    # strip quotes
    line = line.strip('"‚Äú‚Äù ').strip()
    return line

def limit_words(text: str, max_words: int) -> str:
    words = text.strip().split()
    if len(words) <= max_words:
        return text.strip()
    return " ".join(words[:max_words])


In [31]:
# =====================================
# Cell 2: reuse training prompt templates
# =====================================

# These should already exist from your training cells:
# EPIC_TITLE_TRAIN_PROMPT
# FEATURE_TITLES_TRAIN_PROMPT
# STORY_TITLES_TRAIN_PROMPT

EPIC_TITLE_PROMPT_TEXT     = EPIC_TITLE_TRAIN_PROMPT
FEATURE_TITLES_PROMPT_TEXT = FEATURE_TITLES_TRAIN_PROMPT
STORY_TITLES_PROMPT_TEXT   = STORY_TITLES_TRAIN_PROMPT


In [32]:
# ===========================================
# Cell 3: core text generation + confidence
# ===========================================

def generate_text_and_confidence(
    prompt: str,
    max_new_tokens: int = 80,
    do_sample: bool = False,
):
    """
    Returns:
      completion (str),
      confidence (float in [0,1]),
      mean_log_prob (float)
    """
    # 1) Generate
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
        )

    decoded = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

    if decoded.startswith(prompt):
        completion = decoded[len(prompt):].strip()
    else:
        completion = decoded.strip()

    # 2) Recompute log-probs over prompt + completion
    full_text = prompt + completion
    enc = tokenizer(full_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model(**enc, labels=enc["input_ids"])
        logits = outputs.logits

    # Shift for next-token prediction
    shift_logits = logits[:, :-1, :]
    shift_labels = enc["input_ids"][:, 1:]

    log_probs = torch.log_softmax(shift_logits, dim=-1)
    token_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(-1)

    # only consider tokens after the prompt
    prompt_len = len(tokenizer(prompt)["input_ids"])
    gen_token_log_probs = token_log_probs[0, prompt_len-1:]

    if gen_token_log_probs.numel() == 0:
        mean_log_prob = -5.0
    else:
        mean_log_prob = gen_token_log_probs.mean().item()

    confidence = float(torch.sigmoid(torch.tensor(mean_log_prob)))

    return completion, float(confidence), float(mean_log_prob)


In [33]:
# ===========================================
# Cell 4: epic, features, and stories titles
# ===========================================

def generate_epic_title(project_text: str) -> dict:
    """
    Returns:
      {
        "id": "E1",
        "title": "...",
        "confidence": float,
        "mean_log_prob": float
      }
    """
    proj = truncate(project_text, max_chars=800)
    prompt = EPIC_TITLE_PROMPT_TEXT.format(project_text=proj)

    raw, conf, mean_lp = generate_text_and_confidence(
        prompt,
        max_new_tokens=40,
        do_sample=False,
    )

    lines = [l for l in raw.splitlines() if l.strip()]
    if not lines:
        epic_title = "Project Epic"
    else:
        epic_title = _clean_title_line(lines[0])

    epic_title = limit_words(epic_title, max_words=10) or "Project Epic"

    return {
        "id": "E1",
        "title": epic_title,
        "confidence": conf,
        "mean_log_prob": mean_lp,
    }


def generate_feature_titles(
    project_text: str,
    epic_obj: dict,
    num_features: int = 5
):
    """
    Returns:
      feature_dicts: list of {
         "id","title","confidence","mean_log_prob","user_stories":[]
      }
      feat_conf: float
      feat_mean_lp: float
    """
    proj = truncate(project_text, max_chars=800)

    prompt = FEATURE_TITLES_PROMPT_TEXT.format(
        project_text=proj,
        num_features=num_features,
    )

    raw, conf, mean_lp = generate_text_and_confidence(
        prompt,
        max_new_tokens=120,
        do_sample=False,
    )

    lines = [l for l in raw.splitlines() if l.strip()]
    titles = []

    for l in lines:
        t = _clean_title_line(l)
        if t:
            t = limit_words(t, max_words=10)
            titles.append(t)

    # enforce exactly num_features
    if len(titles) < num_features:
        for i in range(len(titles) + 1, num_features + 1):
            titles.append(f"Feature {i}")
    else:
        titles = titles[:num_features]

    feature_dicts = []
    for i, t in enumerate(titles, start=1):
        feature_dicts.append({
            "id": f"F{i}",
            "title": t,
            "confidence": conf,
            "mean_log_prob": mean_lp,
            "user_stories": [],
        })

    return feature_dicts, conf, mean_lp


def generate_story_titles_for_feature(
    project_text: str,
    epic_obj: dict,
    feature_obj: dict,
    num_stories: int = 3
):
    """
    Returns:
      stories: list of {
        "id","title","confidence","mean_log_prob"
      }
      s_conf: float
      s_mean_lp: float
    """
    proj = truncate(project_text, max_chars=800)
    epic_title = epic_obj.get("title", "Project Epic")
    feature_title = feature_obj.get("title", "Feature")

    prompt = STORY_TITLES_PROMPT_TEXT.format(
        project_text=proj,
        epic_title=epic_title,
        feature_title=feature_title,
        num_stories=num_stories,
    )

    raw, conf, mean_lp = generate_text_and_confidence(
        prompt,
        max_new_tokens=120,
        do_sample=False,
    )

    lines = [l for l in raw.splitlines() if l.strip()]
    titles = []

    for l in lines:
        t = _clean_title_line(l)
        if not t:
            continue
        t = limit_words(t, max_words=15)
        titles.append(t)

    if len(titles) < num_stories:
        for i in range(len(titles) + 1, num_stories + 1):
            titles.append(f"Story {i}: basic capability for {feature_title.lower()}")
    else:
        titles = titles[:num_stories]

    stories = []
    # e.g. F3 -> "3" for user story ID
    feature_index = feature_obj["id"][1:] if len(feature_obj["id"]) > 1 else "1"

    for j, st in enumerate(titles, start=1):
        stories.append({
            "id": f"US{feature_index}_{j}",
            "title": st,
            "confidence": conf,
            "mean_log_prob": mean_lp,
        })

    return stories, conf, mean_lp


In [34]:
# ===========================================
# Cell 5: metrics helpers (separate report)
# ===========================================

def compute_pseudo_perplexity_over_titles(model, tokenizer, titles_text: str):
    """
    Pseudo-perplexity over concatenated titles.
    """
    titles_text = (titles_text or "").strip()
    if not titles_text:
        return {
            "mean_log_prob_titles": float("nan"),
            "pseudo_perplexity_titles": float("nan"),
        }

    inputs = tokenizer(titles_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        neg_log_likelihood = outputs.loss.item()

    ppl = math.exp(neg_log_likelihood)
    return {
        "mean_log_prob_titles": -neg_log_likelihood,
        "pseudo_perplexity_titles": ppl,
    }


def compute_structure_ratios(features, expected_features=5, expected_stories=3):
    num_features = len(features)
    feature_ratio = num_features / expected_features if expected_features > 0 else 0.0

    story_ratios = []
    for f in features:
        stories = f.get("user_stories", [])
        story_ratios.append(
            len(stories) / expected_stories if expected_stories > 0 else 0.0
        )

    story_count_ratio = (
        sum(story_ratios) / len(story_ratios) if story_ratios else 0.0
    )

    return {
        "feature_count_ratio": feature_ratio,
        "story_count_ratio": story_count_ratio,
    }


def compute_repetition_rates_for_titles(titles_text: str):
    tokens = titles_text.split()
    if len(tokens) < 2:
        return {
            "bigram_repetition_rate": 0.0,
            "trigram_repetition_rate": 0.0,
        }

    bigrams = [" ".join(tokens[i:i+2]) for i in range(len(tokens) - 1)]
    trigrams = [" ".join(tokens[i:i+3]) for i in range(len(tokens) - 2)] if len(tokens) >= 3 else []

    bigram_counts = Counter(bigrams)
    trigram_counts = Counter(trigrams)

    repeated_bigrams = sum(1 for _, v in bigram_counts.items() if v > 1)
    repeated_trigrams = sum(1 for _, v in trigram_counts.items() if v > 1)

    bigram_rep_rate = repeated_bigrams / max(1, len(bigram_counts))
    trigram_rep_rate = (
        repeated_trigrams / max(1, len(trigram_counts)) if trigram_counts else 0.0
    )

    return {
        "bigram_repetition_rate": bigram_rep_rate,
        "trigram_repetition_rate": trigram_rep_rate,
    }


In [35]:
# ===============================================
# Cell 6: main inference runner + metrics report
# ===============================================

def run_agileai_titles_only_with_report(
    project_text: str,
    num_features: int = 5,
    stories_per_feature: int = 3,
):
    # 1. Epic
    epic = generate_epic_title(project_text)

    # 2. Features
    feature_dicts, feat_conf, feat_mean_lp = generate_feature_titles(
        project_text,
        epic,
        num_features=num_features,
    )

    story_block_confs = []
    story_block_lps = []

    for feat in feature_dicts:
        stories, s_conf, s_lp = generate_story_titles_for_feature(
            project_text,
            epic,
            feat,
            num_stories=stories_per_feature,
        )
        feat["user_stories"] = stories
        story_block_confs.append(s_conf)
        story_block_lps.append(s_lp)

    # ---------- Clean Agile output (titles only) ----------
    clean_epic = {
        "id": epic["id"],
        "title": epic["title"],
    }

    clean_features = []
    for f in feature_dicts:
        clean_stories = [
            {"id": us["id"], "title": us["title"]}
            for us in f.get("user_stories", [])
        ]
        clean_features.append({
            "id": f["id"],
            "title": f["title"],
            "user_stories": clean_stories,
        })

    agile_output = {
        "epic": clean_epic,
        "features": clean_features,
    }

    # ---------- Metrics report ----------
    def _safe_avg(vals):
        return float(sum(vals) / len(vals)) if vals else 0.0

    confidence_metrics = {
        "epic_confidence": epic.get("confidence", 0.0),
        "epic_mean_log_prob": epic.get("mean_log_prob", 0.0),
        "features_block_confidence": feat_conf,
        "features_block_mean_log_prob": feat_mean_lp,
        "stories_avg_block_confidence": _safe_avg(story_block_confs),
        "stories_avg_block_mean_log_prob": _safe_avg(story_block_lps),
    }

    # Flat titles text
    all_titles = [clean_epic["title"]]
    for f in clean_features:
        all_titles.append(f["title"])
        for us in f["user_stories"]:
            all_titles.append(us["title"])
    titles_text = " ".join(all_titles)

    ppl_metrics = compute_pseudo_perplexity_over_titles(
        model, tokenizer, titles_text
    )
    struct_metrics = compute_structure_ratios(
        clean_features,
        expected_features=num_features,
        expected_stories=stories_per_feature,
    )
    rep_metrics = compute_repetition_rates_for_titles(titles_text)

    metrics_report = {
        **confidence_metrics,
        **ppl_metrics,
        **struct_metrics,
        **rep_metrics,
    }

    return agile_output, metrics_report


In [36]:
# ==============================
# Cell 7: test run on a project
# ==============================

import json

# Example: use one of your training projects, or a cleaned new project description
proj_text = training_pairs[0]["project_text"]  # or your cleaned text from PDF

agile_output, metrics_report = run_agileai_titles_only_with_report(
    proj_text,
    num_features=5,
    stories_per_feature=3,
)

print("=== AGILE OUTPUT ===")
print(json.dumps(agile_output, indent=2, ensure_ascii=False))

print("\n=== METRICS REPORT ===")
print(json.dumps(metrics_report, indent=2, ensure_ascii=False))


=== AGILE OUTPUT ===
{
  "epic": {
    "id": "E1",
    "title": "SmartDetect Anomaly Detection and Edge Deployment System"
  },
  "features": [
    {
      "id": "F1",
      "title": "Data Preprocessing and Feature Extraction",
      "user_stories": [
        {
          "id": "US1_1",
          "title": "Preprocess Input Data for Feature Extraction"
        },
        {
          "id": "US1_2",
          "title": "Process Encoded Input Data for Feature Extraction"
        },
        {
          "id": "US1_3",
          "title": "Extract Feature Values from Encoded Input Data"
        }
      ]
    },
    {
      "id": "F2",
      "title": "Model Training and Evaluation",
      "user_stories": [
        {
          "id": "US2_1",
          "title": "Train Model"
        },
        {
          "id": "US2_2",
          "title": "Evaluate Model"
        },
        {
          "id": "US2_3",
          "title": "Deploy Model to Edge Device"
        }
      ]
    },
    {
      "id": "F3",
 