In [22]:
import os,json
import faiss
from tqdm import tqdm
from dotenv import load_dotenv
from collections import defaultdict
import numpy as np

# Pull variables from a .env file (if one is found) into the process environment.
load_dotenv()
# API key read from the environment; os.getenv returns None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [11]:
# Embedding model name that embed() passes to the embeddings endpoint.
EMBED_MODEL = "text-embedding-3-large"
# Default number of nearest indicators retrieve() requests from the index per query.
TOP_K = 5
# Minimum similarity score a retrieved indicator needs in compute_domain_coverage()
# before it is kept as evidence for its domain.
SIM_THRESHOLD = 0.75   # tune this later

In [12]:
def embed(texts):
    """
    Embed a batch of strings and return them as a float32 matrix.

    Sends `texts` to `client.embeddings.create` with `EMBED_MODEL` and stacks
    the returned embeddings, in response order, one row per item.

    Relies on the module-level `client`, which must already exist when this
    function is called.
    """
    response = client.embeddings.create(model=EMBED_MODEL, input=texts)
    vectors = [item.embedding for item in response.data]
    return np.array(vectors, dtype="float32")

#### Domain loading + indicator flattening

In [20]:
def load_domains(path="../data/aacn_domain_consolidated.json"):
    """
    Load the AACN domain definitions from a JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON document. flatten_indicators() iterates over it and
        reads "domain_name" and "progression_indicators" from each element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read explicitly as UTF-8 instead of the platform default encoding, which
    # mis-decodes non-ASCII characters on some systems (e.g. cp1252 on Windows).
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def flatten_indicators(domains):
    """
    Turn the nested domain structure into one flat list of indicator records.

    Each record carries the indicator text ("text"), the name of the domain it
    belongs to ("domain"), and its position inside that domain's
    "progression_indicators" list ("indicator_index"). Records keep the
    original domain order and, within a domain, the original indicator order.
    """
    records = []
    for domain in domains:
        domain_label = domain["domain_name"]
        records.extend(
            {
                "text": indicator_text,
                "domain": domain_label,
                "indicator_index": position,
            }
            for position, indicator_text in enumerate(domain["progression_indicators"])
        )
    return records

def build_faiss(indicators):
    """
    Embed every indicator's text and load the vectors into a FAISS index.

    Returns a tuple `(index, indicators)`: the populated inner-product index
    and the same `indicators` list that was passed in, so row i of the index
    corresponds to `indicators[i]`.
    """
    embeddings = embed([entry["text"] for entry in indicators])

    # In-place L2 normalisation, so the inner-product index scores by cosine similarity.
    faiss.normalize_L2(embeddings)

    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatIP(dimension)
    flat_index.add(embeddings)
    return flat_index, indicators


#### Normalized syllabi loading

In [24]:
def load_syllabi(path="../data/syllabi.json"):
    """
    Load the syllabi collection from a JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON document. The pipeline iterates over it and treats
        each element as one syllabus.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read explicitly as UTF-8 instead of the platform default encoding, which
    # mis-decodes non-ASCII characters on some systems (e.g. cp1252 on Windows).
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def extract_text_from_syllabus(syllabus):
    """
    Collect every non-blank string found anywhere inside a syllabus.

    The syllabus may be any nesting of dicts and lists. The structure is
    walked depth-first in its natural order (list order, dict value order);
    each string is stripped of surrounding whitespace and kept if anything
    remains. All other value types (None, numbers, booleans, ...) are skipped.

    Returns:
        A list of stripped strings in traversal order.
    """
    def walk(node):
        if isinstance(node, str):
            stripped = node.strip()
            if stripped:
                yield stripped
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)
        elif isinstance(node, dict):
            for child in node.values():
                yield from walk(child)
        # Anything else (None, numbers, booleans, ...) contributes nothing.

    return list(walk(syllabus))


def syllabus_to_queries(syl):
    """
    Build the list of retrieval queries for one syllabus of any structure.

    The first query is all extracted text joined with newlines (coarse
    matching). It is followed by every individual text item whose length is
    strictly between 20 and 350 characters (fine-grained matching).

    Args:
        syl: A syllabus object (any nesting of dicts/lists/strings).

    Returns:
        A list of query strings. Empty when the syllabus contains no text, so
        that callers never send an empty string to the embeddings endpoint.
    """
    text_items = extract_text_from_syllabus(syl)

    # Without this guard the combined context would be "" and would still be
    # queued as a query, which the embedding request cannot handle.
    if not text_items:
        return []

    # 1. Big combined context for coarse matching
    queries = ["\n".join(text_items)]

    # 2. Medium-length items only: long enough to carry signal, short enough
    #    to stay focused on a single idea.
    queries.extend(item for item in text_items if 20 < len(item) < 350)

    return queries


#### FAISS index + retrieval

In [None]:
def build_faiss_index(indicators):
    """
    Create a FAISS inner-product index over the indicator embeddings.

    Row i of the returned index corresponds to `indicators[i]`.
    """
    embeddings = embed([entry["text"] for entry in indicators])

    # In-place L2 normalisation, so inner-product scores act as cosine similarity.
    faiss.normalize_L2(embeddings)

    flat_index = faiss.IndexFlatIP(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

def retrieve(index, indicators, query, k=TOP_K):
    """
    Retrieve the top-k indicators from FAISS for a single query string.

    Args:
        index: FAISS index whose rows line up with `indicators`.
        indicators: List of indicator records, indexed by FAISS row id.
        query: Query text to embed and search with.
        k: Maximum number of neighbours to request.

    Returns:
        A list of `(indicator_record, score)` tuples in the order FAISS
        returned them, with `score` converted to a Python float. It may hold
        fewer than `k` items when the index has fewer than `k` vectors.
    """
    qvec = embed([query])
    # Normalise the query like the indexed vectors so scores are cosine similarities.
    faiss.normalize_L2(qvec)

    D, I = index.search(qvec, k)

    # FAISS fills unused result slots with label -1. Indexing a Python list
    # with -1 would silently return the LAST indicator, so drop those slots.
    return [
        (indicators[idx], float(score))
        for score, idx in zip(D[0], I[0])
        if idx >= 0
    ]


In [4]:
# NOTE: embed(), defined in an earlier cell, also uses this module-level `client`,
# so this cell has to be executed before any embedding call is made.
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

def llm_verify_domain_coverage(syllabus_text, domain_name, indicators):
    """
    Ask GPT to decide if the syllabus truly covers the domain,
    and justify why.

    Args:
        syllabus_text: Full syllabus text shown to the model.
        domain_name: Name of the domain being judged.
        indicators: Iterable of dicts with keys "indicator" (text) and
            "similarity" (number): the retrieved evidence for this domain.

    Returns:
        The raw message content of the first completion choice. The prompt
        asks for a JSON object with "domain", "covered", "confidence" and
        "justification"; the content is returned unparsed.
    """
    indicator_text = "\n".join(
        f"- {ind['indicator']} (sim={ind['similarity']:.2f})" for ind in indicators
    )

    prompt = f"""
You are an expert in AACN nursing competencies.

Given the following syllabus text and the retrieved AACN indicators, 
decide whether this syllabus truly covers the domain:

DOMAIN: {domain_name}

SYLLABUS TEXT:
{syllabus_text}

RETRIEVED INDICATORS:
{indicator_text}

Please respond in STRICT JSON format:

{{
  "domain": "{domain_name}",
  "covered": true/false,
  "confidence": float between 0 and 1,
  "justification": "short explanation referencing the syllabus and indicators"
}}
    """

    resp = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        # JSON mode: the caller feeds this reply straight into json.loads, so
        # ask the API to return a JSON object rather than relying on the
        # prompt wording alone (the prompt already mentions JSON, as JSON
        # mode requires).
        response_format={"type": "json_object"},
    )

    return resp.choices[0].message.content


In [9]:
def _parse_llm_json(raw):
    """Parse an LLM reply as JSON, tolerating a surrounding Markdown code fence."""
    text = raw
    if isinstance(raw, str):
        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line ("```" or "```json") ...
            first_line, newline, remainder = text.partition("\n")
            text = remainder if newline else first_line.lstrip("`")
            # ... and the closing fence, if present.
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
    return json.loads(text)


def llm_generate_syllabus_alignment(syllabus, domain_hits):
    """
    Generates a final alignment report for ONE syllabus.

    For every domain in `domain_hits` the LLM is asked whether the syllabus
    covers it; each parsed reply is expected to include:
      - domain-level decision
      - justification
      - confidence

    Args:
        syllabus: The syllabus object; all of its text is sent to the LLM.
        domain_hits: Mapping of domain name -> list of retrieved indicator
            dicts for that domain.

    Returns:
        A list with one parsed JSON value per domain, in `domain_hits` order.

    Raises:
        json.JSONDecodeError: If a reply is not valid JSON even after removing
            a surrounding Markdown code fence.
    """
    syllabus_text = "\n".join(extract_text_from_syllabus(syllabus))

    results = []

    for domain_name, indicators in domain_hits.items():
        llm_json = llm_verify_domain_coverage(
            syllabus_text=syllabus_text,
            domain_name=domain_name,
            indicators=indicators,
        )
        verdict = _parse_llm_json(llm_json)
        # Downstream analysis reads verdict["domain"]; make sure it is present
        # even if the model omitted it.
        if isinstance(verdict, dict):
            verdict.setdefault("domain", domain_name)
        results.append(verdict)

    return results


In [10]:
def compute_domain_coverage(index, indicators, syllabus, sim_threshold=None):
    """
    Retrieve indicators, group by domain, then
    use GPT to validate coverage.

    Args:
        index: FAISS index aligned with `indicators`.
        indicators: Flat list of indicator records.
        syllabus: One syllabus dict; "course_code" and "course_name" are read
            with .get(), so either may be missing.
        sim_threshold: Minimum similarity for a hit to be kept. When None
            (the default) the module-level SIM_THRESHOLD is read at call
            time, so existing callers and code that reassigns the global
            behave exactly as before.

    Returns:
        A dict with "course_code", "course_name" and "llm_validated_domains"
        (the list returned by llm_generate_syllabus_alignment).
    """
    # Resolve at call time, not at definition time, so changes to the global
    # are still honoured.
    threshold = SIM_THRESHOLD if sim_threshold is None else sim_threshold

    queries = syllabus_to_queries(syllabus)
    domain_hits = defaultdict(list)

    for q in queries:
        for doc, sim in retrieve(index, indicators, q):
            if sim >= threshold:
                domain_hits[doc["domain"]].append({
                    "indicator": doc["text"],
                    "similarity": sim,
                    "query": q
                })

    # LLM validation
    llm_results = llm_generate_syllabus_alignment(syllabus, domain_hits)

    return {
        "course_code": syllabus.get("course_code"),
        "course_name": syllabus.get("course_name"),
        "llm_validated_domains": llm_results
    }


In [16]:
def main():
    """
    Run the full pipeline once and write the results to disk.

    Loads the domains and syllabi, builds the FAISS index over the flattened
    indicators, computes LLM-validated domain coverage for every syllabus
    (with a progress bar), and dumps the list of results as indented JSON to
    llm_domain_coverage.json in the current working directory.
    """
    indicators = flatten_indicators(load_domains())
    index = build_faiss_index(indicators)
    syllabi = load_syllabi()

    coverage = [
        compute_domain_coverage(index, indicators, syllabus)
        for syllabus in tqdm(syllabi)
    ]

    with open("llm_domain_coverage.json", "w") as out_file:
        json.dump(coverage, out_file, indent=2)

    print("\nSaved LLM-validated coverage → llm_domain_coverage.json")


In [25]:
# Entry-point guard: run the pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

100%|██████████| 29/29 [04:54<00:00, 10.14s/it]


Saved LLM-validated coverage → llm_domain_coverage.json





In [26]:
def analyze_domain_coverage(results):
    """
    Given the LLM-validated RAG results (list of dicts),
    compute quantitative domain coverage statistics.

    A domain counts as covered for a course only when its entry has
    `"covered": true` (exactly the boolean True).

    Entries that share the same "course_code" are merged: the course appears
    once, with the union of the domains covered by any of its entries. This
    keeps the course count, the per-domain counts and the matrix consistent
    with each other instead of letting a later entry overwrite an earlier one.

    Args:
        results: List of dicts, each with "course_code" and, optionally,
            "llm_validated_domains" (a list of dicts with "domain" and
            "covered").

    Returns:
        A dictionary with:
          - "num_courses": number of distinct course codes
          - "all_domains": sorted list of domains covered by at least one course
          - "domain_counts": domain -> number of distinct courses covering it
          - "courses_with_no_domains": courses with no covered domain
          - "coverage_matrix": pandas DataFrame, one row per course, one 0/1
            column per domain
          - "course_domain_map": course -> list of covered domains
          - "domain_course_map": domain -> list of courses (defaultdict)

    Also prints a human-readable summary of the above.
    """

    import pandas as pd
    from collections import defaultdict

    # flatten results
    course_domain_map = {}
    domain_course_map = defaultdict(list)

    all_domains = set()

    for entry in results:
        course = entry["course_code"]
        # setdefault keeps what an earlier entry with the same code recorded.
        covered_domains = course_domain_map.setdefault(course, [])

        for d in entry.get("llm_validated_domains", []):
            if d.get("covered", False) is not True:
                continue
            dom = d["domain"]
            all_domains.add(dom)
            if dom not in covered_domains:
                covered_domains.append(dom)
                domain_course_map[dom].append(course)

    # Distinct courses in first-seen order (dicts preserve insertion order).
    courses = list(course_domain_map)

    # Summary Metrics
    num_courses = len(course_domain_map)
    domain_counts = {dom: len(domain_course_map[dom]) for dom in all_domains}
    courses_with_no_domains = [c for c, ds in course_domain_map.items() if len(ds) == 0]

    # Domain Coverage Matrix (Course × Domain)
    sorted_domains = sorted(all_domains)
    matrix = []

    for course in courses:
        row = {"course": course}
        for dom in sorted_domains:
            row[dom] = 1 if dom in course_domain_map[course] else 0
        matrix.append(row)

    coverage_df = pd.DataFrame(matrix)

    # Print Summary
    print("\n==========================")
    print(" Curriculum Domain Coverage Summary")
    print("==========================")
    print(f"Total courses analyzed: {num_courses}")
    print(f"Domains detected across curriculum: {len(all_domains)}")
    print()

    print("=== Domains Covered Frequency ===")
    for dom, count in sorted(domain_counts.items(), key=lambda x: -x[1]):
        print(f"{dom}: {count} course(s)")

    print("\n=== Courses with ZERO domain coverage ===")
    for c in courses_with_no_domains:
        print(f"- {c}")
    print()

    print("=== Course-by-Domain Coverage Matrix ===")
    print(coverage_df)

    # Return structured results for further comparison/evaluation
    return {
        "num_courses": num_courses,
        "all_domains": sorted_domains,
        "domain_counts": domain_counts,
        "courses_with_no_domains": courses_with_no_domains,
        "coverage_matrix": coverage_df,
        "course_domain_map": course_domain_map,
        "domain_course_map": domain_course_map
    }


In [None]:
# Reload the saved single-run results and compute the coverage statistics.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open("llm_domain_coverage.json", encoding="utf-8") as f:
    results = json.load(f)
analysis = analyze_domain_coverage(results)


 Curriculum Domain Coverage Summary
Total courses analyzed: 28
Domains detected across curriculum: 5

=== Domains Covered Frequency ===
Domain 9: Professionalism: 3 course(s)
Domain 6: Interprofessional Partnerships: 2 course(s)
Domain 2: Person -Centered Care: 1 course(s)
Domain 4: Scholarship for the Nursing Discipline: 1 course(s)
Domain 10: Personal, Professional, and Leadership Development: 1 course(s)

=== Courses with ZERO domain coverage ===
- N279P
- N354
- N325
- N277P
- N275
- N 344P
- N375P
- N356
- N320
- N223
- N266
- N244
- N 366P
- N127P
- N321
- N255C
- N255D
- N157P
- N273
- N355P
- N274

=== Course-by-Domain Coverage Matrix ===
    course  Domain 10: Personal, Professional, and Leadership Development  \
0    N279P                                                  0               
1     N310                                                  0               
2     N354                                                  0               
3     N325                          

### Trying different similarity thresholds (0.50, 0.60, 0.70, 0.75)

In [None]:
def main():
    """
    Run the RAG pipeline once per similarity threshold and save each run.

    NOTE: this redefines the single-run main() from the earlier cell.

    For each threshold the module-level SIM_THRESHOLD is set (that is the
    value compute_domain_coverage reads), every syllabus is processed, and
    the results are written to llm_domain_coverage_sim_<threshold>.json.
    SIM_THRESHOLD is restored to its previous value when the sweep ends,
    including when it is interrupted or fails part-way, so later cells are
    not left running with a sweep value.

    Returns:
        Dict mapping each threshold to its list of per-syllabus results.
    """
    global SIM_THRESHOLD

    # ---- define thresholds you want to sweep ----
    thresholds = [0.50, 0.60, 0.70, 0.75]

    # ---- load your fixed data once ----
    domains = load_domains()
    indicators = flatten_indicators(domains)
    index = build_faiss_index(indicators)
    syllabi = load_syllabi()

    all_results = {}  # store results in memory too

    previous_threshold = SIM_THRESHOLD
    try:
        # ---- loop over thresholds ----
        for thresh in thresholds:
            SIM_THRESHOLD = thresh

            print(f"\n==============================================")
            print(f" Running RAG pipeline with SIM_THRESHOLD = {thresh}")
            print(f"==============================================\n")

            results = []

            for syl in tqdm(syllabi):
                res = compute_domain_coverage(index, indicators, syl)
                results.append(res)

            # ---- save results for this threshold ----
            out_path = f"llm_domain_coverage_sim_{thresh:.2f}.json"
            with open(out_path, "w") as f:
                json.dump(results, f, indent=2)

            print(f"Saved → {out_path}")
            all_results[thresh] = results
    finally:
        # Undo the global mutation no matter how the sweep ended.
        SIM_THRESHOLD = previous_threshold

    print("\n=== Finished all threshold runs ===")
    return all_results


In [None]:
# Entry-point guard for the threshold sweep. The sweep takes minutes and makes
# paid API calls, so keep the dict that main() returns instead of discarding it.
if __name__ == "__main__":
    all_results = main()


 Running RAG pipeline with SIM_THRESHOLD = 0.5



 34%|███▍      | 10/29 [03:21<05:44, 18.11s/it]