<a href="https://colab.research.google.com/github/Shrimayee30/AgileAI/blob/main/playground1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AgileAI

### Data Preprocessing


In [36]:
# ============================
# Data Preprocessing - Cell 1
# Setup: installs, Drive, paths
# ============================

# Install PDF libraries for Colab
!pip install -q pypdf pdfminer.six

# Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")

# Core imports
from pathlib import Path
import re
import json
import textwrap
from pypdf import PdfReader
from pdfminer.high_level import extract_text as pdfminer_extract_text

# ---- Project paths (adjusted to your structure) ----

# Root folder for your AgileAI Part 1 project in Drive
PROJECT_ROOT = Path("/content/drive/MyDrive/AgileAI/Part 1")

# Folder where your raw PDFs live
DATA_DIR = PROJECT_ROOT / "data"

# Folder where we will save cleaned text + artifacts
OUTPUT_DIR = PROJECT_ROOT / "preprocessed"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Main input PDF for now
PDF_PATH = DATA_DIR / "Website - E commerce fashion.pdf"

assert PDF_PATH.exists(), f"PDF not found: {PDF_PATH}"
print(f"Using input PDF: {PDF_PATH}")
print(f"Output directory: {OUTPUT_DIR}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using input PDF: /content/drive/MyDrive/AgileAI/Part 1/data/Website - E commerce fashion.pdf
Output directory: /content/drive/MyDrive/AgileAI/Part 1/preprocessed


In [37]:
# =====================================
# Cell 2 — Helpers + PDF → raw text
# =====================================

import os
from io import StringIO
from typing import List, Tuple, Dict

from pypdf import PdfReader
from pdfminer.high_level import extract_text_to_fp

# Work inside your project folder so outputs land in AgileAI/Part 1
os.chdir(PROJECT_ROOT)
print("Current working directory:", os.getcwd())

# ---- Simple helpers to read/write text ----

def write_text(p: Path, s: str):
    p.write_text(s, encoding="utf-8")

def read_text(p: Path) -> str:
    return p.read_text(encoding="utf-8")


# ---- PDF extraction: PyPDF primary, pdfminer fallback ----

def extract_text_pypdf(path: Path) -> str:
    reader = PdfReader(str(path))
    pages = []
    for p in reader.pages:
        try:
            pages.append(p.extract_text() or "")
        except Exception:
            pages.append("")
    return "\n\n".join(pages)

def extract_text_pdfminer(path: Path) -> str:
    out = StringIO()
    with open(path, "rb") as f:
        extract_text_to_fp(f, out)
    return out.getvalue()

def extract_pdf_text(path: Path) -> str:
    """
    Try PyPDF first; if that fails or is empty, fall back to pdfminer.
    Returns one big raw text string.
    """
    try:
        raw = extract_text_pypdf(path)
        if not raw.strip():
            raise ValueError("PyPDF returned empty text")
        print("Extracted text using PyPDF.")
    except Exception as e:
        print("PyPDF failed or was empty, falling back to pdfminer:", e)
        raw = extract_text_pdfminer(path)
        print("Extracted text using pdfminer.")
    return raw


# ---- Extract and save raw text for debugging ----

raw_text = extract_pdf_text(PDF_PATH)

RAW_TEXT_PATH = PROJECT_ROOT / "raw_text_from_pdf.txt"
write_text(RAW_TEXT_PATH, raw_text)

print(f"Raw text length: {len(raw_text)} characters")
print("Saved raw text to:", RAW_TEXT_PATH)
print("\n--- RAW TEXT PREVIEW (first 500 chars) ---\n")
print(raw_text[:500])


Current working directory: /content/drive/MyDrive/AgileAI/Part 1
Extracted text using PyPDF.
Raw text length: 16085 characters
Saved raw text to: /content/drive/MyDrive/AgileAI/Part 1/raw_text_from_pdf.txt

--- RAW TEXT PREVIEW (first 500 chars) ---

Governors State University
OPUS Open Portal to University Scholarship
All Capstone Projects Student Capstone Projects
Summer 2014
Design and Implementation of E-Commerce Site
for Online Shopping
Sidhartha Reddy Vatrapu
Governors State University
Follow this and additional works at: https://opus.govst.edu/capstones
Part of the Databases and Information Systems Commons, and the Systems Architecture
Commons
For more information about the academic degree, extended learning, and certificate programs 


In [38]:
# =====================================
# Cell 3 — Low-level cleaning helpers
# =====================================

def remove_nonprintable_and_glyphs(text: str) -> str:
    """
    Fix common unicode/glyph issues and drop non-printable chars.
    """
    replacements = {
        '\xa0': ' ',   # non-breaking space
        '\u2018': "'", # left single quote
        '\u2019': "'", # right single quote
        '\u201c': '"', # left double quote
        '\u201d': '"', # right double quote
        '\u2013': '-', # en dash
        '\u2014': '-', # em dash
        '\uf0d8': '-', # bullet-like glyph
        '\uf0b7': '-', # bullet-like glyph
        '\u2022': '-', # bullet
    }
    for bad, good in replacements.items():
        text = text.replace(bad, good)

    # Keep only printable chars + newlines
    clean = ''.join(ch for ch in text if ch.isprintable() or ch == '\n')
    return clean

def remove_figure_placeholders(text: str) -> str:
    """
    Remove figure tags, captions, and similar noise.
    """
    patterns = [
        r"<figure>.*?</figure>",
        r"<figcaption>.*?</figcaption>",
        r"Figure\s*\d+[:\-]?.*?$",
        r"Copyright[^\\n]*",
        r"PageNumber=\"\d+\"",
        r"<!--.*?-->",
    ]
    for pat in patterns:
        text = re.sub(pat, "", text, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
    return text

def canonicalize_bullets_and_lists(text: str) -> str:
    """
    Convert many bullet characters to '-', normalize numbered list format.
    """
    # Replace various bullet-like characters with '-'
    text = re.sub(r"[•\u2022\u2023\u25E6\u2043\u2219\uf0d8\uf0b7\*]", "-", text)
    text = re.sub(r"[·•]", "-", text)

    # Normalize numbered lists: "1. Something" -> "1. Something"
    text = re.sub(r"^(\d+)\.\s*", r"\1. ", text, flags=re.MULTILINE)
    return text

def remove_repeated_footers(text: str) -> str:
    """
    Detect lines that repeat many times (likely headers/footers) and remove them.
    """
    lines = text.splitlines()
    counts: Dict[str, int] = {}
    for ln in lines:
        ln_s = ln.strip()
        if not ln_s:
            continue
        counts[ln_s] = counts.get(ln_s, 0) + 1

    # Anything that appears 2+ times is suspicious
    repeated = {ln for ln, c in counts.items() if c >= 2}
    if repeated:
        pattern = "(" + "|".join(re.escape(r) for r in repeated) + ")"
        text = re.sub(pattern, "", text)

    return text

def normalize_whitespace(text: str) -> str:
    """
    Normalize spaces + newlines:
      - collapse multiple spaces/tabs
      - collapse 3+ blank lines into 2
      - trim leading/trailing blank lines
    """
    # Collapse long runs of spaces/tabs
    text = re.sub(r"[ \t]+", " ", text)

    # Collapse 3+ newlines into 2
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Strip blank lines at start/end
    lines = [ln.strip() for ln in text.splitlines()]
    while lines and not lines[0]:
        lines.pop(0)
    while lines and not lines[-1]:
        lines.pop()

    return "\n".join(lines)


In [39]:
# =====================================
# Cell 4 — Boilerplate + TOC removal + full cleaning
# =====================================

from typing import List, Tuple

def truncate_boilerplate_start(text: str) -> str:
    """
    Try to skip front matter and jump to ABSTRACT / INTRODUCTION section.
    """
    markers = [
        r"\bABSTRACT\b",
        r"\b1\.0\s+INTRODUCTION\b",
        r"\bDesign and Implementation of E-Commerce Site\b",
    ]
    first_idx = None
    for m in markers:
        match = re.search(m, text, flags=re.IGNORECASE)
        if match:
            idx = match.start()
            if first_idx is None or idx < first_idx:
                first_idx = idx

    if first_idx is not None:
        return text[first_idx:]

    # Fallback: scan first ~120 lines for 1.0 / ABSTRACT / INTRODUCTION
    lines = text.splitlines()
    for i, ln in enumerate(lines[:120]):
        if re.match(r'^\s*(1\.0|ABSTRACT|INTRODUCTION)', ln, flags=re.IGNORECASE):
            return "\n".join(lines[i:])

    return text

def remove_toc_blocks(lines: List[str], min_block_len: int = 3) -> Tuple[List[str], List[Tuple[int, int]]]:
    """
    Remove contiguous blocks of TOC-like lines.
    Returns (filtered_lines, ranges_removed).
    """

    def is_toc_line(ln: str) -> bool:
        s = ln.strip()
        if not s:
            return False
        # pattern like "4.1  Intro........ 12"
        if re.search(r"\.{3,}\s*\d+\s*$", s):
            return True
        if re.match(r"^\s*(TABLE OF CONTENTS|CONTENTS|INDEX)\b", s, flags=re.IGNORECASE):
            return True
        return False

    n = len(lines)
    toc_ranges: List[Tuple[int, int]] = []
    i = 0
    while i < n:
        if is_toc_line(lines[i]):
            j = i
            while j < n and is_toc_line(lines[j]):
                j += 1
            if j - i >= min_block_len:
                toc_ranges.append((i, j))
                i = j
                continue
        i += 1

    # Fallback: single TABLE OF CONTENTS mention near top
    if not toc_ranges:
        for idx, ln in enumerate(lines[:200]):
            if re.search(r"\b(table of contents|contents|index)\b", ln, flags=re.IGNORECASE):
                end_idx = None
                for j in range(idx + 1, min(len(lines), idx + 60)):
                    if re.search(r"\b(1\.0|introduction|abstract)\b", lines[j], flags=re.IGNORECASE):
                        end_idx = j
                        break
                if end_idx is None:
                    end_idx = min(len(lines), idx + 40)
                if end_idx - idx >= 2:
                    toc_ranges.append((idx, end_idx))
                break

    kept = []
    removed_idx = set()
    for start, end in toc_ranges:
        for k in range(start, end):
            removed_idx.add(k)
    for idx, ln in enumerate(lines):
        if idx not in removed_idx:
            kept.append(ln)

    return kept, toc_ranges

def drop_residual_toc_fragments(lines: List[str]) -> List[str]:
    """
    Drop leftover TOC-like lines (dots + page numbers, etc.).
    """
    filtered = []
    for ln in lines:
        s = ln.strip()
        if re.search(r"\.{3,}\s*\d+\s*$", s):
            continue
        if re.match(r"^\s*(table of contents|contents|index)\b", s, flags=re.IGNORECASE):
            continue
        filtered.append(ln)
    return filtered

def clean_text_pipeline_with_toc_removal(raw_text: str) -> str:
    """
    Full cleaning pipeline:
      1) glyph cleanup
      2) truncate front boilerplate
      3) remove figures
      4) remove repeated footers
      5) normalize bullets/lists
      6) whitespace normalization
      7) remove TOC blocks + leftovers
      8) final whitespace cleanup
    """
    t = remove_nonprintable_and_glyphs(raw_text)
    t = truncate_boilerplate_start(t)
    t = remove_figure_placeholders(t)
    t = remove_repeated_footers(t)
    t = canonicalize_bullets_and_lists(t)
    t = normalize_whitespace(t)

    lines = t.splitlines()
    filtered, removed = remove_toc_blocks(lines)
    if removed:
        print("Removed TOC/index block ranges:", removed)
    filtered = drop_residual_toc_fragments(filtered)
    t = "\n".join(filtered)

    # Extra noise cleanup
    t = re.sub(r"[-]{4,}", "", t)
    t = re.sub(r"PageHeader=.*", "", t)
    t = re.sub(r"PageNumber=.*", "", t)
    t = normalize_whitespace(t)

    # Ensure big headings stay as separate lines
    t = re.sub(r"\n([A-Z0-9 \-]{4,})\n", r"\n\1\n", t)
    return t


In [40]:
# =====================================
# Cell 5 — Canonical page mapping + final refinement
# =====================================

CANONICAL_MAP = {
    r"\bhome\s*page\b": "Home Page",
    r"\bclothing\s*page\b": "Clothing Page",
    r"\border\s*us\s*page\b": "Order Us Page",
    r"\border\s*us\b": "Order Us Page",
    r"\bcontact\s*us\s*page\b": "Contact Us Page",
    r"\babout\s*us\s*page\b": "About Us Page",
    r"\btrack\s*for\s*admin\s*page\b": "Track For Admin Page",
    r"\btrack\b": "Track",
    r"\bregister\s*page\b": "Register Page",
    r"\blogin\s*page\b": "Login Page",
    r"\badmin\s*page\b": "Admin Page",
    r"\border\s*view\s*for\s*user\b": "Order View for User",
    r"\bpaypal\s*for\s*payment\b": "PayPal For Payment",
    r"\bsuccess\s*page\b": "Success Page",
    r"\bfailed\s*page\b": "Failed Page",
}

STOP_HEADINGS = {
    "ABSTRACT",
    "ACKNOWLEDGEMENTS",
    "ACKNOWLEDGEMENT",
    "TABLE OF CONTENTS",
    "CONTENTS",
    "INDEX",
    "LIST OF FIGURES",
    "REFERENCES",
}

IRRELEVANT_PATTERNS = [
    r"https?://\S+",
    r"opus@govst.edu",
    r"^page \d+$",
    r"^\d+\s*$",
]

def is_irrelevant_line(line: str) -> bool:
    """
    Return True if a line is clearly not useful as content/heading.
    """
    s = line.strip()
    if not s:
        return True  # empty lines irrelevant for our purposes

    low = s.lower()

    # URL / email / trivial numeric lines
    for pat in IRRELEVANT_PATTERNS:
        if re.search(pat, low, flags=re.IGNORECASE):
            return True

    # Very punctuation-heavy → probably junk or TOC artifact
    punct_ratio = len(re.sub(r"[A-Za-z0-9\s]", "", s)) / max(1, len(s))
    if punct_ratio > 0.25:
        return True

    # Bare page numbers like "12."
    if re.match(r'^\d{1,3}\.?$', s):
        return True

    # Known boilerplate headings
    if s.upper() in STOP_HEADINGS:
        return True

    return False

def normalize_heading_variant(ln: str) -> str:
    """
    Normalize a heading/page-like line into a clean, title-cased label
    using CANONICAL_MAP when possible.
    """
    ln0 = ln.strip()

    # Remove leading numbering like "4.1 HOME PAGE"
    ln0 = re.sub(r'^\s*\d+(\.\d+)*\s*[:\-\)]*\s*', '', ln0)

    # Remove trailing page numbers
    ln0 = re.sub(r'\s+\d{1,3}\s*$', '', ln0)

    # Remove leading bullets
    ln0 = re.sub(r'^[\-\u2022\uf0d8\uf0b7\*]+\s*', '', ln0)

    # Simplify to letters/digits/spaces, and lowercase
    key = re.sub(r'[^A-Za-z0-9 ]+', ' ', ln0).strip().lower()

    # Map to canonical if pattern matches
    for pat, canon in CANONICAL_MAP.items():
        if re.search(pat, key, flags=re.IGNORECASE):
            return canon

    # If it already looks like "... page", keep but title-case
    if key.endswith(" page"):
        return " ".join(w.capitalize() for w in key.split())

    # Drop super short / pure numeric stuff
    if len(key) < 3 or re.search(r'^\d+$', key):
        return ""

    # Fallback: title-case the cleaned key
    return " ".join(w.capitalize() for w in key.split())

def refine_cleaned_text_and_pages(cleaned_text: str) -> Dict[str, object]:
    """
    Final step:
      - Skip preface lines until 'ABSTRACT' or '1.0'
      - Drop irrelevant lines
      - (Temporary) pages will be replaced by stricter detector later
      - Wrap trimmed text in <<<PROJECT>>> block
    """
    lines = cleaned_text.splitlines()

    # Start from ABSTRACT or 1.0 if present
    start_idx = 0
    for i, ln in enumerate(lines[:200]):
        if re.search(r'\babstract\b', ln, flags=re.IGNORECASE) or re.search(r'\b1\.0\b', ln):
            start_idx = i
            break

    useful_lines = lines[start_idx:]
    trimmed_lines = [ln for ln in useful_lines if not is_irrelevant_line(ln)]
    trimmed_text = "\n".join(trimmed_lines).strip()

    # We'll recompute pages with a stricter detector in the next cell.
    pages = []  # placeholder

    project_block = "<<<PROJECT>>>\n" + trimmed_text + "\n<<<ENDPROJECT>>>"

    return {
        "block": project_block,
        "pages": pages,
        "trimmed_text": trimmed_text,
    }


In [41]:
# =====================================
# Cell 6 — Run full pipeline and save outputs
# =====================================

# 1. Extract raw text
raw = extract_pdf_text(PDF_PATH)

# 2. Run cleaning + TOC removal
cleaned = clean_text_pipeline_with_toc_removal(raw)

# 3. Final refinement: block + trimmed text
refined = refine_cleaned_text_and_pages(cleaned)

# Save outputs in project root
TRIMMED_TEXT_PATH = PROJECT_ROOT / "trimmed_project_text.txt"
BLOCK_PATH = PROJECT_ROOT / "full_text_of_pdf_block.txt"
PAGES_PATH = PROJECT_ROOT / "detected_webpages.json"  # will be filled next cell

write_text(TRIMMED_TEXT_PATH, refined["trimmed_text"])
write_text(BLOCK_PATH, refined["block"])
# temporarily write empty list for pages; we overwrite in next cell
write_text(PAGES_PATH, json.dumps(refined["pages"], indent=2))

print("Saved trimmed project text to:", TRIMMED_TEXT_PATH)
print("Saved project block to:", BLOCK_PATH)
print("Draft pages (to be overwritten) saved to:", PAGES_PATH)

print("\n--- PROJECT BLOCK PREVIEW (first 800 chars) ---\n")
print(refined["block"][:800])


Extracted text using PyPDF.
Removed TOC/index block ranges: [(121, 123)]
Saved trimmed project text to: /content/drive/MyDrive/AgileAI/Part 1/trimmed_project_text.txt
Saved project block to: /content/drive/MyDrive/AgileAI/Part 1/full_text_of_pdf_block.txt
Draft pages (to be overwritten) saved to: /content/drive/MyDrive/AgileAI/Part 1/detected_webpages.json

--- PROJECT BLOCK PREVIEW (first 800 chars) ---

<<<PROJECT>>>
In today's fast -changing business environment, it's extremely important to be able to respond
to client needs in the most effective and timely manner. If your customers wish to see your
business online and have instant access to your products or services.
Online Shopping is a lifestyle e -commerce web application, which retails various fashion and
lifestyle products (Currently Men's Wear). This project allows viewing various products
available enables registered users to purchase desired products instantly using PayPal payment
processor (Instant Pay) a nd also can place

In [42]:
# =====================================
# Cell 7 — Stricter generic UI page detection
# =====================================

from typing import List

def detect_ui_pages_from_text(text: str) -> List[str]:
    """
    Detect 'UI pages' in a generic way:
      - Only consider lines that contain the word 'page' (word-level).
      - Ignore irrelevant / noisy lines (using is_irrelevant_line).
      - Require lines to be short-ish (<= 100 chars, <= 10 words).
      - Normalize them via normalize_heading_variant (which uses CANONICAL_MAP).
    This will pick:
      - '4.1 HOME PAGE'
      - '4.2 CLOTHING PAGE ( PRODUCTS )'
      - 'Contact Us Page'
    and ignore things like '1 Data Description', 'Use Case Diagram', etc.
    """
    lines = text.splitlines()
    candidates = []

    for ln in lines:
        if is_irrelevant_line(ln):
            continue

        s = ln.strip()
        if not s:
            continue

        # Keep short headings; avoid full paragraphs
        if len(s) > 100:
            continue

        # Must contain the word 'page' (word-level)
        if not re.search(r"\bpage\b", s, flags=re.IGNORECASE):
            continue

        # Avoid long headings with too many words
        if len(s.split()) > 10:
            continue

        norm = normalize_heading_variant(s)
        if norm:
            candidates.append(norm)

    # Preserve order but dedupe
    seen = set()
    ordered = []
    for c in candidates:
        if c not in seen:
            seen.add(c)
            ordered.append(c)

    return ordered

# --- Recompute UI pages from the *trimmed* text and overwrite detected_webpages.json ---

trimmed_text = TRIMMED_TEXT_PATH.read_text(encoding="utf-8")
ui_pages = detect_ui_pages_from_text(trimmed_text)

write_text(PAGES_PATH, json.dumps(ui_pages, indent=2))

print("\n--- FINAL DETECTED UI PAGES ---")
print(json.dumps(ui_pages, indent=2))



--- FINAL DETECTED UI PAGES ---
[
  "Home Page",
  "Clothing Page",
  "Order Us Page",
  "Contact Us Page",
  "About Us Page",
  "Track For Admin Page",
  "Register Page",
  "Admin Page",
  "Success Page",
  "Failed Page",
  "Login Page",
  "Page"
]


### Prompt Engineering

In [43]:
# =====================================
# Prompt Engineering - Cell 1
# Load project block + pages, define prompt template
# =====================================

import json
from pathlib import Path

# Reuse paths from preprocessing
print("BLOCK_PATH:", BLOCK_PATH)
print("PAGES_PATH:", PAGES_PATH)

# Load the cleaned project block and canonical UI pages
FULL_TEXT_OF_PDF = read_text(BLOCK_PATH)
CANONICAL_PAGES = json.loads(read_text(PAGES_PATH))

print("\n--- DETECTED UI PAGES ---")
print(json.dumps(CANONICAL_PAGES, indent=2))

# We'll insert this into the prompt so the model knows allowed web pages
CANONICAL_PAGES_STR = ", ".join(CANONICAL_PAGES)

# Separator between instructions+input and JSON output (used in training)
SEPARATOR = "\n\n### OUTPUT JSON START ###\n\n"

# ---- Prompt template ----
# We keep it generic but structured like your original:
PROMPT_TEMPLATE = f"""
You are AgileAI, an assistant that helps project managers break down software projects
into an Agile backlog structure of EPIC -> FEATURES -> USER STORIES.

You will be given a SOFTWARE PROJECT DESCRIPTION, delimited by the markers:

    <<<PROJECT>>>
    (project text here)
    <<<ENDPROJECT>>>

Your job is to:
  1. Understand the project scope, goals, and main flows.
  2. Create ONE high-level EPIC that captures the overall project.
  3. Create several FEATURES that group related functionality under that epic.
  4. For each feature, create USER STORIES that are small, sprint-sized, and testable.
  5. Return everything as a SINGLE JSON OBJECT that follows the schema below.

------------------ JSON OUTPUT SCHEMA ------------------

The output must be ONLY valid JSON, with this structure:

{{
  "epic": {{
    "title": "...",
    "summary": "..."
  }},
  "features": [
    {{
      "title": "...",
      "description": "...",
      "webpage": "One of the allowed page names",
      "stories": [
        {{
          "id": "F1-S1",
          "title": "...",
          "description": "...",
          "acceptance_criteria": [
            "...",
            "...",
            ...
          ],
          "definition_of_ready": [
            "...",
            "...",
            ...
          ]
        }}
      ]
    }}
  ]
}}

Rules and constraints:

- "epic":
  - "title": short (max 10 words), high-level.
  - "summary": 2–4 sentences summarizing the entire project.

- "features":
  - Group related functionality; 3–8 features is typical.
  - "title": short (max 8 words), describing the feature.
  - "description": 1–3 sentences about this feature.
  - "webpage":
      * MUST be chosen from this allowed list:
        [{CANONICAL_PAGES_STR}]
      * If multiple pages could apply, choose the one that best fits.
  - "stories": list of user stories for that feature.

- "stories" (user stories):
  - Each story represents one sprint-sized chunk of work.
  - "id": must be in the form "F{{feature_index}}-S{{story_index}}",
      e.g., "F1-S1", "F1-S2", "F2-S1", etc.
  - "title": short (max 10 words), in the style of a user story title.
  - "description": 1–3 sentences, clearly describing the behaviour and value.
  - "acceptance_criteria":
      * 2–5 bullet points.
      * Each item is a clear, testable condition.
  - "definition_of_ready":
      * 1–5 items describing preconditions (e.g., "UI mockups approved").

Source-of-truth and assumptions:

- Use ONLY information from the project description between <<<PROJECT>>> and <<<ENDPROJECT>>>.
- Do NOT invent new modules or flows that are not implied by the text.
- If you must introduce something that is not explicitly in the project text,
  mark it clearly with the token "[ASSUMPTION]" inside the string.

Formatting:

- Output must be a single JSON object, no markdown, no comments.
- Do NOT include the project description in the output.
- Do NOT include explanation text; ONLY the JSON object.

------------------ PROJECT DESCRIPTION ------------------

{FULL_TEXT_OF_PDF}

---------------------------------------------------------
Remember:
- Think through the project first.
- Then produce the JSON object in the required schema.
"""

# Save a concrete prompt instance so you can inspect it
FINAL_PROMPT_PATH = PROJECT_ROOT / "final_prompt_input.txt"
write_text(FINAL_PROMPT_PATH, PROMPT_TEMPLATE + SEPARATOR)

print("\nSaved full prompt (instructions + project + separator) to:")
print(FINAL_PROMPT_PATH)

print("\n--- PROMPT PREVIEW (first 800 chars) ---\n")
print((PROMPT_TEMPLATE + SEPARATOR)[:800])


BLOCK_PATH: /content/drive/MyDrive/AgileAI/Part 1/full_text_of_pdf_block.txt
PAGES_PATH: /content/drive/MyDrive/AgileAI/Part 1/detected_webpages.json

--- DETECTED UI PAGES ---
[
  "Home Page",
  "Clothing Page",
  "Order Us Page",
  "Contact Us Page",
  "About Us Page",
  "Track For Admin Page",
  "Register Page",
  "Admin Page",
  "Success Page",
  "Failed Page",
  "Login Page",
  "Page"
]

Saved full prompt (instructions + project + separator) to:
/content/drive/MyDrive/AgileAI/Part 1/final_prompt_input.txt

--- PROMPT PREVIEW (first 800 chars) ---


You are AgileAI, an assistant that helps project managers break down software projects
into an Agile backlog structure of EPIC -> FEATURES -> USER STORIES.

You will be given a SOFTWARE PROJECT DESCRIPTION, delimited by the markers:

    <<<PROJECT>>>
    (project text here)
    <<<ENDPROJECT>>>

Your job is to:
  1. Understand the project scope, goals, and main flows.
  2. Create ONE high-level EPIC that captures the overall project.
 

In [44]:
# =====================================
# Prompt Engineering - Cell 2
# JSON schema validator and repair helpers
# =====================================

from typing import Any, Dict, List, Tuple
from difflib import SequenceMatcher

def validate_epic(epic: Dict[str, Any]) -> List[str]:
    errs = []
    if not isinstance(epic, dict):
        return ["epic must be an object"]
    if "title" not in epic or not isinstance(epic["title"], str) or not epic["title"].strip():
        errs.append("epic.title must be a non-empty string")
    if "summary" not in epic or not isinstance(epic["summary"], str) or not epic["summary"].strip():
        errs.append("epic.summary must be a non-empty string")
    return errs

def validate_story(story: Dict[str, Any], feature_index: int, story_index: int) -> List[str]:
    errs = []
    prefix = f"features[{feature_index}].stories[{story_index}]"

    if not isinstance(story, dict):
        return [f"{prefix} must be an object"]

    # id
    sid = story.get("id")
    if not isinstance(sid, str) or not sid.strip():
        errs.append(f"{prefix}.id must be a non-empty string")

    # title
    title = story.get("title")
    if not isinstance(title, str) or not title.strip():
        errs.append(f"{prefix}.title must be a non-empty string")

    # description
    desc = story.get("description")
    if not isinstance(desc, str) or not desc.strip():
        errs.append(f"{prefix}.description must be a non-empty string")

    # acceptance_criteria
    ac = story.get("acceptance_criteria")
    if not isinstance(ac, list) or not ac:
        errs.append(f"{prefix}.acceptance_criteria must be a non-empty list")
    else:
        for i, item in enumerate(ac):
            if not isinstance(item, str) or not item.strip():
                errs.append(f"{prefix}.acceptance_criteria[{i}] must be a non-empty string")

    # definition_of_ready
    dor = story.get("definition_of_ready")
    if not isinstance(dor, list) or not dor:
        errs.append(f"{prefix}.definition_of_ready must be a non-empty list")
    else:
        for i, item in enumerate(dor):
            if not isinstance(item, str) or not item.strip():
                errs.append(f"{prefix}.definition_of_ready[{i}] must be a non-empty string")

    return errs

def validate_feature(feature: Dict[str, Any], idx: int, allowed_pages: List[str]) -> List[str]:
    errs = []
    prefix = f"features[{idx}]"

    if not isinstance(feature, dict):
        return [f"{prefix} must be an object"]

    # title
    if "title" not in feature or not isinstance(feature["title"], str) or not feature["title"].strip():
        errs.append(f"{prefix}.title must be a non-empty string")

    # description
    if "description" not in feature or not isinstance(feature["description"], str) or not feature["description"].strip():
        errs.append(f"{prefix}.description must be a non-empty string")

    # webpage
    wp = feature.get("webpage")
    if not isinstance(wp, str) or not wp.strip():
        errs.append(f"{prefix}.webpage must be a non-empty string")
    elif allowed_pages and wp not in allowed_pages:
        errs.append(f"{prefix}.webpage '{wp}' is not in allowed pages {allowed_pages}")

    # stories
    stories = feature.get("stories")
    if not isinstance(stories, list) or not stories:
        errs.append(f"{prefix}.stories must be a non-empty list")
    else:
        for s_idx, story in enumerate(stories):
            errs.extend(validate_story(story, idx, s_idx))

    return errs

def validate_schema(obj: Dict[str, Any], allowed_pages: List[str]) -> List[str]:
    """
    Top-level schema validator for the JSON structure.
    Returns a list of error strings (empty means 'valid').
    """
    errs = []

    if not isinstance(obj, dict):
        return ["Top-level JSON must be an object"]

    # epic
    if "epic" not in obj:
        errs.append("Missing 'epic' object")
    else:
        errs.extend(validate_epic(obj["epic"]))

    # features
    features = obj.get("features")
    if not isinstance(features, list) or not features:
        errs.append("'features' must be a non-empty list")
    else:
        for i, feat in enumerate(features):
            errs.extend(validate_feature(feat, i, allowed_pages))

    return errs


# ---------- Repair helpers ----------

def fuzzy_match_page_name(name: str, allowed_pages: List[str], threshold: float = 0.6) -> str:
    """
    If 'name' is not exactly in allowed_pages, try to fuzzy-match it.
    Returns a best match or the original name if nothing is close enough.
    """
    if not allowed_pages:
        return name
    best_score = 0.0
    best_page = name
    for p in allowed_pages:
        score = SequenceMatcher(None, name.lower(), p.lower()).ratio()
        if score > best_score:
            best_score = score
            best_page = p
    if best_score >= threshold:
        return best_page
    return name

def repair_story_ids(features: List[Dict[str, Any]]) -> None:
    """
    Ensure story 'id' fields follow F{feature_index}-S{story_index}.
    Modifies the features list in-place.
    """
    for f_idx, feat in enumerate(features, start=1):
        stories = feat.get("stories", [])
        if not isinstance(stories, list):
            continue
        for s_idx, story in enumerate(stories, start=1):
            if not isinstance(story, dict):
                continue
            story["id"] = f"F{f_idx}-S{s_idx}"

def repair_webpages(features: List[Dict[str, Any]], allowed_pages: List[str]) -> None:
    """
    Fuzzy-match feature.webpage to the closest allowed page.
    Modifies the features list in-place.
    """
    if not allowed_pages:
        return
    for feat in features:
        if not isinstance(feat, dict):
            continue
        wp = feat.get("webpage")
        if isinstance(wp, str) and wp.strip():
            feat["webpage"] = fuzzy_match_page_name(wp, allowed_pages)

def repair_json(obj: Dict[str, Any], allowed_pages: List[str]) -> Dict[str, Any]:
    """
    Apply simple, conservative repairs:
      - Fix story IDs to consistent pattern.
      - Fuzzy-match feature.webpage names to allowed_pages.
    Returns the modified object.
    """
    if not isinstance(obj, dict):
        return obj

    features = obj.get("features")
    if isinstance(features, list):
        repair_story_ids(features)
        repair_webpages(features, allowed_pages)

    return obj


def validate_and_repair(obj: Dict[str, Any], allowed_pages: List[str]) -> Tuple[Dict[str, Any], List[str], List[str]]:
    """
    Run validation, then attempt repairs, then re-validate.

    Returns:
      - repaired_obj
      - errors_before
      - errors_after
    """
    errs_before = validate_schema(obj, allowed_pages)
    if not errs_before:
        # Already valid
        return obj, [], []

    repaired = repair_json(obj, allowed_pages)
    errs_after = validate_schema(repaired, allowed_pages)
    return repaired, errs_before, errs_after

print("Validator and repair helpers defined. They will be used after generation.")


Validator and repair helpers defined. They will be used after generation.


### Training

In [50]:
!pip install -q "transformers>=4.39.0" "datasets>=2.18.0" "accelerate>=0.30.0" \
               "bitsandbytes>=0.43.0" "peft>=0.10.0"


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [51]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# TinyLlama tokenizer sometimes has no pad_token → reuse eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- 4-bit quantization config (QLoRA style) ---
compute_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# --- Load base model in 4-bit ---
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",  # spreads across GPU(s) if available
)

# Ensure embedding size matches tokenizer
model.resize_token_embeddings(len(tokenizer))

# Prepare for k-bit training (freezes norms, etc.)
model = prepare_model_for_kbit_training(model)

# --- LoRA config (only trains a few adapter weights) ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`