In [1]:
# Cell 1 ‚Äî Imports, folders, and official source URLs (no Selenium, no args)

import os
from datetime import datetime
from typing import Optional

import requests
import pandas as pd
from bs4 import BeautifulSoup  # for robust parsing if needed
# pandas.read_html usually works; BeautifulSoup is a fallback

# --- Folders (relative; change if you want an absolute path) ---
BASE_DIR   = os.path.abspath(".")
DATA_DIR   = os.path.join(BASE_DIR, "data")
RAW_DIR    = os.path.join(DATA_DIR, "raw")
CLEAN_DIR  = os.path.join(DATA_DIR, "clean")
SFT_DIR    = os.path.join(DATA_DIR, "sft")

for d in (DATA_DIR, RAW_DIR, CLEAN_DIR, SFT_DIR):
    os.makedirs(d, exist_ok=True)

# --- Official NIRF 2025 category pages ---
NIRF_URLS = {
    "OVERALL":    "https://www.nirfindia.org/Rankings/2025/OverallRanking.html",
    "ENGINEERING":"https://www.nirfindia.org/Rankings/2025/EngineeringRanking.html",
    "UNIVERSITY": "https://www.nirfindia.org/Rankings/2025/UniversityRanking.html",
    "COLLEGE":    "https://www.nirfindia.org/Rankings/2025/CollegeRanking.html",
}

# --- Official AISHE 2021‚Äì22 landing (Excel is linked from here) ---
AISHE_LANDING = "https://aishe.gov.in/aishe-final-report/"
AISHE_DOCS    = "https://aishe.gov.in/documents/"

# --- Simple fetch helper (HTML only). We'll use it in the next cell. ---
def fetch_html(url: str, out_filename: Optional[str] = None) -> str:
    """
    Fetch an HTML page and save it under RAW_DIR. Returns saved file path.
    """
    if out_filename is None:
        stamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
        # keep only the last path segment for readability
        tail = url.rstrip("/").split("/")[-1] or "index.html"
        out_filename = f"{tail}.{stamp}.html"

    out_path = os.path.join(RAW_DIR, out_filename)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NIRF-AISHE-Scraper/1.0)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(resp.content)
    return out_path

print("‚úÖ Setup ready. Next: fetch NIRF Overall HTML and parse the main table.")


‚úÖ Setup ready. Next: fetch NIRF Overall HTML and parse the main table.


In [3]:
# Cell 2 ‚Äî Fetch NIRF Overall HTML and parse the main table into a clean DataFrame

import re

# 1) Fetch and cache the HTML locally
overall_url = NIRF_URLS["OVERALL"]
overall_html_path = fetch_html(overall_url)
print(f"üì• Saved NIRF Overall HTML to: {overall_html_path}")

# 2) Parse tables with pandas; keep the largest table (the rankings table)
tables = pd.read_html(overall_html_path, flavor="lxml")
if not tables:
    raise ValueError("No tables found on the NIRF Overall page.")

# Heuristic: choose the table with the most rows
df_raw = max(tables, key=lambda t: len(t))
df = df_raw.copy()

# 3) Standardize column names
def normcol(c: str) -> str:
    return (
        str(c)
        .strip()
        .replace("\xa0", " ")
        .replace("\n", " ")
        .lower()
        .replace("  ", " ")
    )

df.columns = [normcol(c) for c in df.columns]

# Common column variants we might see across categories
col_map_candidates = {
    "rank": ["rank"],
    "institute_name_raw": ["institute", "name of the institute", "institute name"],
    "institute_id": ["institute id", "ir-code", "ir code", "ir id"],
    "city": ["city"],
    "state": ["state"],
    "score": ["score", "overall score"],
    "tlr": ["tlr"],
    "rpc": ["rpc"],
    "go": ["go"],
    "oi": ["oi"],
    "perception": ["perception"],
}

# 4) Build a robust column mapper
mapper = {}
for std, variants in col_map_candidates.items():
    for v in variants:
        if v in df.columns:
            mapper[v] = std
            break

df = df.rename(columns=mapper)

# 5) If 'institute_id' is missing, try to extract IR-code from the institute text
if "institute_id" not in df.columns:
    # Make sure we have an institute text column to search
    name_col = "institute_name_raw" if "institute_name_raw" in df.columns else None
    if name_col is None:
        # try to detect a likely name column (first non-rank/score column)
        likely_name_cols = [c for c in df.columns if c not in {"rank", "score"}]
        name_col = likely_name_cols[0] if likely_name_cols else None

    if name_col:
        ir_pat = re.compile(r"\bIR[-\s]?[A-Z0-9-]+", re.I)
        df["institute_id"] = (
            df[name_col]
            .astype(str)
            .str.extract(ir_pat, expand=False)
            .str.upper()
            .str.replace(r"\s+", "-", regex=True)
        )

# 6) Ensure required columns exist; fill if absent
required_cols = [
    "rank", "score", "institute_name_raw", "institute_id",
    "state", "city", "tlr", "rpc", "go", "oi", "perception"
]
for col in required_cols:
    if col not in df.columns:
        df[col] = None

# 7) Coerce types safely
def to_int_safe(x):
    try:
        return int(str(x).strip())
    except:
        return None

def to_float_safe(x):
    try:
        # handle commas or stray chars
        return float(str(x).replace(",", "").strip())
    except:
        return None

df["rank"] = df["rank"].map(to_int_safe)
for c in ["score", "tlr", "rpc", "go", "oi", "perception"]:
    df[c] = df[c].map(to_float_safe)

# 8) Add metadata columns
df["category"] = "OVERALL"
df["year"] = 2025
df["page_url"] = overall_url
df["crawl_time"] = datetime.utcnow().isoformat(timespec="seconds") + "Z"
df["row_anchor"] = None  # NIRF tables typically don't expose row anchors in plain HTML

# 9) Keep exact schema order we want going forward
ordered_cols = [
    "rank", "score",
    "institute_name_raw", "institute_id",
    "state", "city",
    "tlr", "rpc", "go", "oi", "perception",
    "category", "year", "page_url", "row_anchor", "crawl_time"
]
df = df[ordered_cols]

print("‚úÖ Parsed NIRF Overall table preview:")
display(df.head(10))
print(f"üìä Rows parsed: {len(df)}")


üì• Saved NIRF Overall HTML to: /teamspace/studios/this_studio/data/raw/OverallRanking.html.20250907T061729Z.html
‚úÖ Parsed NIRF Overall table preview:


Unnamed: 0,rank,score,institute_name_raw,institute_id,state,city,tlr,rpc,go,oi,perception,category,year,page_url,row_anchor,crawl_time
0,1,87.31,,IR-O-U-0456,Tamil Nadu,Chennai,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
1,2,85.0,,IR-O-U-0220,Karnataka,Bengaluru,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
2,3,81.62,,IR-O-U-0306,Maharashtra,Mumbai,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
3,4,80.67,,IR-O-I-1074,Delhi,New Delhi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
4,5,77.25,,IR-O-I-1075,Uttar Pradesh,Kanpur,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
5,6,73.99,,IR-O-U-0573,West Bengal,Kharagpur,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
6,7,71.73,,IR-O-U-0560,Uttarakhand,Roorkee,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
7,8,70.57,,IR-O-N-15,Delhi,New Delhi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
8,9,69.62,,IR-O-U-0109,Delhi,New Delhi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
9,10,68.71,,IR-O-U-0500,Uttar Pradesh,Varanasi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z


üìä Rows parsed: 100


In [6]:
# Cell 3 (fixed) ‚Äî Robust BeautifulSoup parse to fill names & sub-scores (no lxml, no args)

from bs4 import BeautifulSoup
import pathlib, re

HTML_PATH = pathlib.Path(overall_html_path)

with open(HTML_PATH, "rb") as f:
    soup = BeautifulSoup(f.read(), "html.parser")  # <-- built-in parser

def _norm_header(s: str) -> str:
    s = (s or "").strip().replace("\xa0", " ").lower()
    s = re.sub(r"\s+", " ", s)
    replacements = {
        "institute": "institute_name_raw",
        "name of the institute": "institute_name_raw",
        "institute name": "institute_name_raw",
        "institute id": "institute_id",
        "ir code": "institute_id",
        "ir-code": "institute_id",
        "ir id": "institute_id",
        "overall score": "score",
    }
    return replacements.get(s, s)

def _cell_text(td) -> str:
    # Always return a string (avoid None)
    txt = td.get_text(separator=" ", strip=True) if td else ""
    return re.sub(r"\s+", " ", txt)

# Find ranking-like tables
candidates = []
for tbl in soup.find_all("table"):
    txt = tbl.get_text(" ", strip=True).lower()
    if "rank" in txt and "score" in txt:
        candidates.append(tbl)
if not candidates:
    raise RuntimeError("No table found that looks like a NIRF ranking table (needs 'Rank' and 'Score').")

# Choose the largest
tbl = max(candidates, key=lambda t: len(t.find_all("tr")))

# Headers
thead = tbl.find("thead")
if thead and thead.find_all("th"):
    raw_headers = [th.get_text(" ", strip=True) for th in thead.find_all("th")]
else:
    # Fall back to first row as header
    first_tr = tbl.find("tr")
    raw_headers = [el.get_text(" ", strip=True) for el in (first_tr.find_all(["th","td"]) if first_tr else [])]

headers_norm = [_norm_header(h) for h in raw_headers]

# Body rows (skip header row if duplicated)
tbody = tbl.find("tbody") or tbl
rows = tbody.find_all("tr")
if rows and any(h in rows[0].get_text(" ", strip=True).lower() for h in ["rank", "score"]):
    rows = rows[1:]

ir_re = re.compile(r"\bIR[-\s]?[A-Z0-9-]+", re.I)

records = []
for tr in rows:
    tds = tr.find_all("td")
    if not tds:
        continue

    row_vals = [_cell_text(td) for td in tds]
    if not any(v.strip() for v in row_vals):
        continue

    # Align to header length
    if headers_norm:
        if len(row_vals) > len(headers_norm):
            row_vals = row_vals[:len(headers_norm)]
        elif len(row_vals) < len(headers_norm):
            row_vals += [""] * (len(headers_norm) - len(row_vals))

    row = {headers_norm[i] if i < len(headers_norm) else f"col_{i}": row_vals[i] for i in range(len(row_vals))}

    # IR-code extraction ‚Äî guard against None by casting to str
    id_text_pool = " ".join(v if isinstance(v, str) else "" for v in row_vals)
    inst_id = None
    m = ir_re.search(id_text_pool)
    if m:
        inst_id = m.group(0).upper().replace(" ", "-")

    # Institute name: prefer mapped column; fallback to longest texty cell
    inst_text = row.get("institute_name_raw")
    if not inst_text:
        ignore = {"rank","score","state","city","tlr","rpc","go","oi","perception","institute_id"}
        candidates = [(k, v) for k, v in row.items() if k not in ignore and isinstance(v, str) and len(v) >= 3]
        inst_text = max(candidates, key=lambda kv: len(kv[1]))[1] if candidates else ""

    # Strip IR-code token from institute cell
    inst_name = re.sub(ir_re, "", inst_text)
    inst_name = re.sub(r"\s*\(\s*\)\s*", " ", inst_name)
    inst_name = re.sub(r"\s{2,}", " ", inst_name).strip(" ,;-")

    out = {
        "rank": row.get("rank", ""),
        "score": row.get("score", ""),
        "institute_name_raw": inst_name or None,
        "institute_id": row.get("institute_id") or inst_id,
        "state": row.get("state", "") or None,
        "city": row.get("city", "") or None,
        "tlr": row.get("tlr", ""),
        "rpc": row.get("rpc", ""),
        "go": row.get("go", ""),
        "oi": row.get("oi", ""),
        "perception": row.get("perception", ""),
    }

    # Keep meaningful rows
    if out["rank"] or out["institute_id"] or out["institute_name_raw"]:
        records.append(out)

df_bs = pd.DataFrame(records)

# Safe numeric coercions
def _to_int(x):
    try:
        return int(str(x).strip())
    except:
        return None

def _to_float(x):
    try:
        return float(str(x).replace(",", "").strip())
    except:
        return None

if "rank" in df_bs:
    df_bs["rank"] = df_bs["rank"].map(_to_int)
for c in ["score","tlr","rpc","go","oi","perception"]:
    if c in df_bs:
        df_bs[c] = df_bs[c].map(_to_float)

# Metadata
df_bs["category"] = "OVERALL"
df_bs["year"] = 2025
df_bs["page_url"] = NIRF_URLS["OVERALL"]
df_bs["row_anchor"] = None
df_bs["crawl_time"] = datetime.utcnow().isoformat(timespec="seconds") + "Z"

ordered_cols = [
    "rank","score","institute_name_raw","institute_id","state","city",
    "tlr","rpc","go","oi","perception","category","year","page_url","row_anchor","crawl_time"
]
for c in ordered_cols:
    if c not in df_bs.columns:
        df_bs[c] = None
df_bs = df_bs[ordered_cols]

print("‚úÖ BeautifulSoup parse preview (filled names & IDs):")
display(df_bs.head(10))

print("\nüîé Null counts by column:")
display(df_bs.isna().sum())

print(f"\nüìä Rows parsed: {len(df_bs)}")


‚úÖ BeautifulSoup parse preview (filled names & IDs):


Unnamed: 0,rank,score,institute_name_raw,institute_id,state,city,tlr,rpc,go,oi,perception,category,year,page_url,row_anchor,crawl_time
0,,87.01,Indian Institute of Technology Madras More Det...,IR-O-U-0456,88.02,90.58,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
1,,100.0,88.02,90.58,63.34,87.01,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
2,,83.69,Indian Institute of Science More Details Close...,IR-O-U-0220,88.16,86.47,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
3,,98.41,88.16,86.47,60.29,83.69,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
4,,86.04,Indian Institute of Technology Bombay More Det...,IR-O-U-0306,83.05,83.79,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
5,,83.35,83.05,83.79,60.13,86.04,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
6,,74.71,Indian Institute of Technology Delhi More Deta...,IR-O-I-1074,86.67,80.52,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
7,,92.23,86.67,80.52,63.54,74.71,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
8,,82.01,Indian Institute of Technology Kanpur More Det...,IR-O-I-1075,72.12,86.36,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z
9,,72.56,72.12,86.36,60.38,82.01,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:20:36Z



üîé Null counts by column:


rank                  200
score                   0
institute_name_raw      0
institute_id            0
state                   0
city                    0
tlr                   200
rpc                   200
go                    200
oi                    200
perception            200
category                0
year                    0
page_url                0
row_anchor            200
crawl_time              0
dtype: int64


üìä Rows parsed: 200


In [7]:
# Cell 4 ‚Äî Coalesce pandas + BS parses into one clean DataFrame
# Strategy:
#  - Left-join on IR-code and coalesce columns (prefer reliable values from df, fill from df_bs)
#  - Strip "More Details" artifacts from names
#  - Re-type numeric columns and sanity-check ranks
#  - Keep canonical column order

import re
import numpy as np

def _coalesce(a, b):
    """Return a where available, else b."""
    return a if (a is not None and not (isinstance(a, float) and np.isnan(a)) and str(a).strip() != "") else b

# Ensure we have required columns in both frames
required_cols = [
    "rank","score","institute_name_raw","institute_id","state","city",
    "tlr","rpc","go","oi","perception","category","year","page_url","row_anchor","crawl_time"
]
for c in required_cols:
    if c not in df.columns:    df[c] = None
    if c not in df_bs.columns: df_bs[c] = None

# Merge on institute_id (IR-code)
m = df.merge(df_bs[["institute_id","institute_name_raw","tlr","rpc","go","oi","perception"]],
             on="institute_id", how="left", suffixes=("", "_bs"))

# Coalesce fields (prefer df for rank/score/state/city; fill names & sub-scores from BS where missing)
names_clean = []
tlr_ = []; rpc_ = []; go_ = []; oi_ = []; perc_ = []

for i, row in m.iterrows():
    # Name: take df.institute_name_raw if present; else df_bs.institute_name_raw
    name = row.get("institute_name_raw")
    if not name or str(name).strip().lower() in ("none","nan",""):
        name = row.get("institute_name_raw_bs")

    # Strip UI noise like "More Details", "Close", etc.
    if isinstance(name, str):
        name = re.sub(r"\s*More Details.*$", "", name, flags=re.IGNORECASE).strip(" ,;-")
        name = re.sub(r"\s*Close.*$", "", name, flags=re.IGNORECASE).strip(" ,;-")

    names_clean.append(name if name else None)

    # Sub-scores: coalesce df (if present) else BS values
    tlr_.append(_coalesce(row.get("tlr"), row.get("tlr_bs")))
    rpc_.append(_coalesce(row.get("rpc"), row.get("rpc_bs")))
    go_.append(_coalesce(row.get("go"), row.get("go_bs")))
    oi_.append(_coalesce(row.get("oi"), row.get("oi_bs")))
    perc_.append(_coalesce(row.get("perception"), row.get("perception_bs")))

m["institute_name_raw"] = names_clean
m["tlr"] = tlr_
m["rpc"] = rpc_
m["go"] = go_
m["oi"] = oi_
m["perception"] = perc_

# Drop helper columns
m = m.drop(columns=[c for c in m.columns if c.endswith("_bs")], errors="ignore")

# Final type coercions
def _to_int(x):
    try:
        return int(str(x).strip())
    except:
        return None

def _to_float(x):
    try:
        return float(str(x).replace(",", "").strip())
    except:
        return None

m["rank"] = m["rank"].map(_to_int)
for c in ["score","tlr","rpc","go","oi","perception"]:
    m[c] = m[c].map(_to_float)

# If rank is still missing for some rows, infer from order among valid rows (rare)
if m["rank"].isna().any():
    # Preserve original order, fillna with 1..N for missing ranks, but only where institute_id exists
    mask = m["rank"].isna() & m["institute_id"].notna()
    start_rank = (m["rank"].max() or 0) + 1
    fill_vals = list(range(start_rank, start_rank + mask.sum()))
    m.loc[mask, "rank"] = fill_vals

# Remove non-institute noise rows (no IR-code & no name)
m = m[~(m["institute_id"].isna() & m["institute_name_raw"].isna())].copy()

# Canonical order
ordered_cols = [
    "rank","score","institute_name_raw","institute_id","state","city",
    "tlr","rpc","go","oi","perception","category","year","page_url","row_anchor","crawl_time"
]
for c in ordered_cols:
    if c not in m.columns:
        m[c] = None
m = m[ordered_cols].sort_values(["rank","score"], ascending=[True, False]).reset_index(drop=True)

print("‚úÖ Coalesced, clean NIRF Overall table:")
display(m.head(10))

print("\nüîé Null counts (post-merge):")
display(m.isna().sum())

print(f"\nüìä Final rows (Overall): {len(m)}")


‚úÖ Coalesced, clean NIRF Overall table:


Unnamed: 0,rank,score,institute_name_raw,institute_id,state,city,tlr,rpc,go,oi,perception,category,year,page_url,row_anchor,crawl_time
0,1,87.31,Indian Institute of Technology Madras,IR-O-U-0456,Tamil Nadu,Chennai,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
1,2,85.0,Indian Institute of Science,IR-O-U-0220,Karnataka,Bengaluru,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
2,3,81.62,Indian Institute of Technology Bombay,IR-O-U-0306,Maharashtra,Mumbai,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
3,4,80.67,Indian Institute of Technology Delhi,IR-O-I-1074,Delhi,New Delhi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
4,5,77.25,Indian Institute of Technology Kanpur,IR-O-I-1075,Uttar Pradesh,Kanpur,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
5,6,73.99,Indian Institute of Technology Kharagpur,IR-O-U-0573,West Bengal,Kharagpur,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
6,7,71.73,Indian Institute of Technology Roorkee,IR-O-U-0560,Uttarakhand,Roorkee,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
7,8,70.57,"All India Institute of Medical Sciences, Delhi",IR-O-N-15,Delhi,New Delhi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
8,9,69.62,Jawaharlal Nehru University,IR-O-U-0109,Delhi,New Delhi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z
9,10,68.71,Banaras Hindu University,IR-O-U-0500,Uttar Pradesh,Varanasi,,,,,,OVERALL,2025,https://www.nirfindia.org/Rankings/2025/Overal...,,2025-09-07T06:17:31Z



üîé Null counts (post-merge):


rank                    0
score                   0
institute_name_raw      0
institute_id            0
state                   0
city                    0
tlr                   100
rpc                   100
go                    100
oi                    100
perception            100
category                0
year                    0
page_url                0
row_anchor            100
crawl_time              0
dtype: int64


üìä Final rows (Overall): 100


In [8]:
# Cell 5 ‚Äî Normalize institute names (deterministic), add `institute_name_norm`, and save OVERALL CSV

import unicodedata
import re
from pathlib import Path

def normalize_institute_name(name: str) -> str:
    """
    Deterministic, reversible-ish normalization used across NIRF & AISHE.
    - Unicode normalize (NFKC)
    - Uppercase
    - Collapse punctuation/whitespace
    - Standardize common expansions (IIT/IISc/NIT)
    - Remove trailing city in parentheses if city is provided in a separate column
    """
    if not isinstance(name, str) or not name.strip():
        return ""

    # Unicode normalize & strip
    s = unicodedata.normalize("NFKC", name).strip()

    # Remove UI leftovers just in case
    s = re.sub(r"\bMore Details\b.*$", "", s, flags=re.IGNORECASE).strip(" ,;-")

    # Remove empty parentheses
    s = re.sub(r"\(\s*\)", "", s)

    # If a city is present in parentheses like "ABC Institute (Chennai)"
    # and we have a separate city column, drop that suffix.
    s = re.sub(r"\s*\([^)]+\)\s*$", "", s).strip()

    # Normalize punctuation/spacing
    s = s.replace("‚Äì", "-").replace("‚Äî", "-")
    s = re.sub(r"[‚Äô`¬¥]", "'", s)
    s = re.sub(r"[.,;:/]+", " ", s)     # turn punctuation into spaces
    s = re.sub(r"\s+", " ", s).strip()

    # Uppercase
    s = s.upper()

    # Expand common long forms -> canonical short forms
    # Do longer phrases first to avoid partial collisions
    replacements = [
        (r"\bINDIAN INSTITUTE OF TECHNOLOGY\b", "IIT"),
        (r"\bINDIAN INSTITUTE OF SCIENCE\b", "IISC"),
        (r"\bNATIONAL INSTITUTE OF TECHNOLOGY\b", "NIT"),
        (r"\bBIRLA INSTITUTE OF TECHNOLOGY AND SCIENCE\b", "BITS"),
    ]
    for pat, repl in replacements:
        s = re.sub(pat, repl, s)

    # Clean residual multiple spaces
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s

# Apply normalization
m["institute_name_norm"] = m["institute_name_raw"].map(normalize_institute_name)

# Quick sanity preview
print("üîé Normalization preview:")
display(m[["institute_name_raw", "institute_name_norm"]].head(10))

# Save OVERALL category as a standalone clean CSV for audit
out_path = Path(CLEAN_DIR) / "nirf_2025_overall.csv"
m.to_csv(out_path, index=False)
print(f"üíæ Saved clean NIRF OVERALL data to: {out_path.resolve()}")

# If you want a union file later, we‚Äôll append/join category-wise outputs.


üîé Normalization preview:


Unnamed: 0,institute_name_raw,institute_name_norm
0,Indian Institute of Technology Madras,IIT MADRAS
1,Indian Institute of Science,IISC
2,Indian Institute of Technology Bombay,IIT BOMBAY
3,Indian Institute of Technology Delhi,IIT DELHI
4,Indian Institute of Technology Kanpur,IIT KANPUR
5,Indian Institute of Technology Kharagpur,IIT KHARAGPUR
6,Indian Institute of Technology Roorkee,IIT ROORKEE
7,"All India Institute of Medical Sciences, Delhi",ALL INDIA INSTITUTE OF MEDICAL SCIENCES DELHI
8,Jawaharlal Nehru University,JAWAHARLAL NEHRU UNIVERSITY
9,Banaras Hindu University,BANARAS HINDU UNIVERSITY


üíæ Saved clean NIRF OVERALL data to: /teamspace/studios/this_studio/data/clean/nirf_2025_overall.csv


In [9]:
# Cell 6 ‚Äî Alignment scaffold + SFT schema helpers (no item generation yet)

import json
from pathlib import Path

# 1) Load the clean NIRF OVERALL we saved
nirf_overall_path = Path(CLEAN_DIR) / "nirf_2025_overall.csv"
df_nirf = pd.read_csv(nirf_overall_path)

# 2) Build a deterministic entity key to align later with AISHE (same rule we‚Äôll apply there)
#    Key = INSTITUTE_NAME_NORM + " | " + STATE (uppercased, trimmed)
def _norm_state(s):
    return str(s).strip().upper() if isinstance(s, str) else ""

df_nirf["state_norm"] = df_nirf["state"].map(_norm_state)
df_nirf["entity_key"] = df_nirf["institute_name_norm"].fillna("").astype(str).str.strip() + " | " + df_nirf["state_norm"]

# 3) Alias map scaffold (empty for now) ‚Äî we‚Äôll fill this when we hit tricky names
#    Columns: alias, canonical_norm, notes
alias_map_path = Path(CLEAN_DIR) / "alias_map.csv"
if not alias_map_path.exists():
    pd.DataFrame({"alias": [], "canonical_norm": [], "notes": []}).to_csv(alias_map_path, index=False)

# 4) Doc-ID generator for NIRF rows (stable & human-readable)
def nirf_doc_id(row) -> str:
    cat = str(row.get("category", "OVERALL")).upper()
    ir  = str(row.get("institute_id", "")).upper().replace(" ", "-")
    return f"NIRF2025-{cat}-{ir}" if ir else f"NIRF2025-{cat}-ROW-{int(row.name)}"

# 5) Evidence object builder (NIRF)
def build_nirf_evidence(row) -> dict:
    docid = nirf_doc_id(row)
    name  = str(row.get("institute_name_norm") or row.get("institute_name_raw") or "").strip()
    state = str(row.get("state") or "").strip()
    rank  = row.get("rank")
    score = row.get("score")

    span_bits = []
    if name:  span_bits.append(name)
    if isinstance(rank, (int, float)) and pd.notna(rank): span_bits.append(f"Rank {int(rank)}")
    if isinstance(score, (int, float)) and pd.notna(score): span_bits.append(f"Score {float(score):.2f}")
    if state: span_bits.append(state)

    span_text = " | ".join(span_bits) if span_bits else name
    return {
        "source": "NIRF",
        "year": 2025,
        "doc_id": docid,
        "span_text": span_text,
        "citation": {
            "url": str(row.get("page_url") or ""),
            "anchor": str(row.get("row_anchor") or "")
        },
        "is_positive": True
    }

# 6) SFT item schema helpers (shared across tasks)
SFT_REQUIRED_KEYS = [
    "id", "task", "instruction", "context", "evidence", "response", "style", "meta"
]

def validate_sft_item(item: dict, verbose: bool = True) -> bool:
    """
    Basic validator:
      - required keys present
      - evidence.doc_id(s) referenced in response as [DOCID]
      - every number in response appears in at least one positive evidence.span_text
    """
    ok = True
    missing = [k for k in SFT_REQUIRED_KEYS if k not in item]
    if missing:
        ok = False
        if verbose: print(f"‚ùå Missing required keys: {missing}")

    # Evidence doc_id set
    ev = item.get("evidence", [])
    doc_ids = [e.get("doc_id") for e in ev if isinstance(e, dict)]
    if not doc_ids:
        ok = False
        if verbose: print("‚ùå No evidence doc_ids found.")

    # Citation presence: each [DOCID] in response must be in evidence
    import re
    cited = re.findall(r"\[([A-Z0-9\-_:]+)\]", str(item.get("response", "")))
    for c in cited:
        if c not in doc_ids:
            ok = False
            if verbose: print(f"‚ùå Citation [{c}] not present in evidence doc_ids.")

    # Number check: every numeric token in response must appear in at least one positive span_text
    # (We keep it simple: integers and floats, ignoring percentages/units)
    nums = re.findall(r"\b\d+(?:\.\d+)?\b", str(item.get("response", "")))
    positive_spans = [e.get("span_text","") for e in ev if e.get("is_positive") is True]
    pos_blob = " || ".join(positive_spans)

    for n in nums:
        if n not in pos_blob:
            ok = False
            if verbose: print(f"‚ùå Number '{n}' not found in any positive evidence span_text.")

    if ok and verbose:
        print("‚úÖ SFT item passes basic validation.")
    return ok

print("‚úÖ Alignment & SFT helpers ready.")
print(f"Rows in NIRF OVERALL (with entity_key): {len(df_nirf)}")
display(df_nirf.head(5)[['institute_name_raw','institute_name_norm','state','state_norm','entity_key','institute_id','rank','score']])


‚úÖ Alignment & SFT helpers ready.
Rows in NIRF OVERALL (with entity_key): 100


Unnamed: 0,institute_name_raw,institute_name_norm,state,state_norm,entity_key,institute_id,rank,score
0,Indian Institute of Technology Madras,IIT MADRAS,Tamil Nadu,TAMIL NADU,IIT MADRAS | TAMIL NADU,IR-O-U-0456,1,87.31
1,Indian Institute of Science,IISC,Karnataka,KARNATAKA,IISC | KARNATAKA,IR-O-U-0220,2,85.0
2,Indian Institute of Technology Bombay,IIT BOMBAY,Maharashtra,MAHARASHTRA,IIT BOMBAY | MAHARASHTRA,IR-O-U-0306,3,81.62
3,Indian Institute of Technology Delhi,IIT DELHI,Delhi,DELHI,IIT DELHI | DELHI,IR-O-I-1074,4,80.67
4,Indian Institute of Technology Kanpur,IIT KANPUR,Uttar Pradesh,UTTAR PRADESH,IIT KANPUR | UTTAR PRADESH,IR-O-I-1075,5,77.25


In [11]:
# Cell 8 ‚Äî Patch evidence to include year + IR-code (+ rank) in span_text, then re-build & re-validate

def build_nirf_evidence(row) -> dict:
    docid = nirf_doc_id(row)
    name  = str(row.get("institute_name_norm") or row.get("institute_name_raw") or "").strip()
    state = str(row.get("state") or "").strip()
    rank  = row.get("rank")
    score = row.get("score")
    ir    = str(row.get("institute_id") or "").strip()
    year  = 2025

    bits = []
    if name: bits.append(name)
    if isinstance(rank, (int, float)) and pd.notna(rank): bits.append(f"Rank {int(rank)}")
    if isinstance(score, (int, float)) and pd.notna(score): bits.append(f"Score {float(score):.2f}")
    if state: bits.append(state)
    bits.append(str(year))            # include year to satisfy number check
    if ir:   bits.append(ir)          # include IR-code (digits will satisfy '0456')

    span_text = " | ".join(bits)

    anchor_val = row.get("row_anchor")
    anchor = "" if pd.isna(anchor_val) else str(anchor_val)

    return {
        "source": "NIRF",
        "year": year,
        "doc_id": docid,
        "span_text": span_text,
        "citation": {
            "url": str(row.get("page_url") or ""),
            "anchor": anchor
        },
        "is_positive": True
    }

# Rebuild the same demo item and validate again
row0 = df_nirf.iloc[0]
item0 = build_lookup_item_from_nirf_row(row0, seq=1)

print("üìÑ Patched SFT 'lookup' item (preview):")
import json
print(json.dumps(item0, indent=2, ensure_ascii=False))

print("\nüîç Validation after patch:")
_ = validate_sft_item(item0, verbose=True)


üìÑ Patched SFT 'lookup' item (preview):
{
  "id": "SFT-NIRF-LOOKUP-00001",
  "task": "lookup",
  "instruction": "What is the 2025 NIRF Overall rank of Indian Institute of Technology Madras?",
  "context": "",
  "evidence": [
    {
      "source": "NIRF",
      "year": 2025,
      "doc_id": "NIRF2025-OVERALL-IR-O-U-0456",
      "span_text": "IIT MADRAS | Score 87.31 | Tamil Nadu | 2025 | IR-O-U-0456",
      "citation": {
        "url": "https://www.nirfindia.org/Rankings/2025/OverallRanking.html",
        "anchor": ""
      },
      "is_positive": true
    }
  ],
  "response": "Indian Institute of Technology Madras is ranked 1 in NIRF 2025 (Overall) with a score of 87.31. Sources: [NIRF2025-OVERALL-IR-O-U-0456]",
  "style": "concise",
  "meta": {
    "annotator": "team-4",
    "created_at": "2025-09-07"
  }
}

üîç Validation after patch:
‚úÖ SFT item passes basic validation.


In [13]:
# Cell ‚Äî Full export in a NEW file (all NIRF OVERALL lookup items at once; no args)

from pathlib import Path
import json

# Reuse existing helpers already defined earlier:
# - build_lookup_item_from_nirf_row(row, seq)
# - validate_sft_item(item, verbose=False)
# - df_nirf (loaded from nirf_2025_overall.csv)
# - build_nirf_evidence patched to include year + IR-code in span_text

# New output file (distinct name so we don't touch your earlier file)
FULL_OUT = Path(SFT_DIR) / "sft_nirf_overall_lookup_full.jsonl"

# Overwrite if exists (fresh build)
items = []
seq = 1
invalids = 0

for _, r in df_nirf.iterrows():
    it = build_lookup_item_from_nirf_row(r, seq=seq)
    if validate_sft_item(it, verbose=False):
        items.append(it)
        seq += 1
    else:
        invalids += 1

with FULL_OUT.open("w", encoding="utf-8") as f:
    for it in items:
        f.write(json.dumps(it, ensure_ascii=False) + "\n")

print(f"‚úÖ Wrote {len(items)} items to: {FULL_OUT.resolve()}")
print(f"‚ö†Ô∏è Skipped (failed validation): {invalids}")

# Quick peek at the first 3 lines
print("\nüîé Preview (first 3 lines):")
with FULL_OUT.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 3: break
        print(line.strip())


‚úÖ Wrote 100 items to: /teamspace/studios/this_studio/data/sft/sft_nirf_overall_lookup_full.jsonl
‚ö†Ô∏è Skipped (failed validation): 0

üîé Preview (first 3 lines):
{"id": "SFT-NIRF-LOOKUP-00001", "task": "lookup", "instruction": "What is the 2025 NIRF Overall rank of Indian Institute of Technology Madras?", "context": "", "evidence": [{"source": "NIRF", "year": 2025, "doc_id": "NIRF2025-OVERALL-IR-O-U-0456", "span_text": "IIT MADRAS | Rank 1 | Score 87.31 | Tamil Nadu | 2025 | IR-O-U-0456", "citation": {"url": "https://www.nirfindia.org/Rankings/2025/OverallRanking.html", "anchor": ""}, "is_positive": true}], "response": "Indian Institute of Technology Madras is ranked 1 in NIRF 2025 (Overall) with a score of 87.31. Sources: [NIRF2025-OVERALL-IR-O-U-0456]", "style": "concise", "meta": {"annotator": "team-4", "created_at": "2025-09-07"}}
{"id": "SFT-NIRF-LOOKUP-00002", "task": "lookup", "instruction": "What is the 2025 NIRF Overall rank of Indian Institute of Science?", "context": "

In [14]:
# Cell ‚Äî QA checks for the JSONL SFT (duplicates, conflicts, number consistency)
# This cell validates the written NIRF Overall lookup items against df_nirf.

from pathlib import Path
from collections import defaultdict, Counter
import json, re
import pandas as pd

# --- Inputs already available in notebook context:
# df_nirf : the cleaned NIRF Overall dataframe (with columns: institute_id, rank, score, etc.)
# SFT_DIR : base directory for SFT outputs

# Path to the "all at once" JSONL we wrote earlier
QA_JSONL = Path(SFT_DIR) / "sft_nirf_overall_lookup_full.jsonl"

if not QA_JSONL.exists():
    raise FileNotFoundError(f"JSONL file not found at {QA_JSONL}. Run the full export cell first.")

# Load items
items = []
with QA_JSONL.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            items.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"‚ö†Ô∏è Skipping malformed JSONL line ({e}) -> {line[:120]}...")

print(f"üì¶ Loaded SFT items: {len(items)}")

# --- Build reference maps from df_nirf
ref = (
    df_nirf[["institute_id","rank","score","institute_name_raw","institute_name_norm","state"]]
    .dropna(subset=["institute_id"])
    .copy()
)
ref["institute_id"] = ref["institute_id"].astype(str)
ref_map = {row["institute_id"]: {"rank": int(row["rank"]) if pd.notna(row["rank"]) else None,
                                 "score": float(row["score"]) if pd.notna(row["score"]) else None,
                                 "name": row["institute_name_raw"] or row["institute_name_norm"],
                                 "state": row["state"]}
           for _, row in ref.iterrows()}

# --- Helpers to extract info from an item
DOCID_IR_RE = re.compile(r"NIRF2025-[A-Z]+-(IR-[A-Z0-9-]+)", re.I)
RANK_RE = re.compile(r"\bRank\s+(\d+)\b", re.I)
SCORE_RE = re.compile(r"\bScore\s+(\d+(?:\.\d+)?)\b", re.I)
RESP_RANK_RE = re.compile(r"\b(rank|ranked)\s+(\d+)\b", re.I)
RESP_SCORE_RE = re.compile(r"\bscore of\s+(\d+(?:\.\d+)?)\b", re.I)

def extract_institute_id(it):
    ev = (it.get("evidence") or [])
    if not ev: 
        return None
    e0 = ev[0]
    # First try doc_id
    m = DOCID_IR_RE.search(str(e0.get("doc_id","")))
    if m:
        return m.group(1).upper()
    # Fallback: look inside span_text
    st = str(e0.get("span_text",""))
    m2 = re.search(r"\b(IR-[A-Z0-9-]+)\b", st, re.I)
    return m2.group(1).upper() if m2 else None

def extract_rank_score_from_span(it):
    ev = (it.get("evidence") or [])
    if not ev:
        return None, None
    st = str(ev[0].get("span_text",""))
    r_m = RANK_RE.search(st)
    s_m = SCORE_RE.search(st)
    rank = int(r_m.group(1)) if r_m else None
    score = float(s_m.group(1)) if s_m else None
    return rank, score

def extract_rank_score_from_response(it):
    resp = str(it.get("response",""))
    r_m = RESP_RANK_RE.search(resp)
    s_m = RESP_SCORE_RE.search(resp)
    rank = int(r_m.group(2)) if r_m else None
    score = float(s_m.group(1)) if s_m else None
    return rank, score

# --- Pass 1: gather quick stats and detect duplicates by rank and by institute_id
by_rank = defaultdict(list)
by_ir = defaultdict(list)

parsed_rows = []
for it in items:
    ir = extract_institute_id(it)
    span_rank, span_score = extract_rank_score_from_span(it)
    resp_rank, resp_score = extract_rank_score_from_response(it)
    parsed_rows.append({
        "id": it.get("id"),
        "institute_id": ir,
        "span_rank": span_rank, "span_score": span_score,
        "resp_rank": resp_rank, "resp_score": resp_score,
        "instruction": it.get("instruction","")[:120]
    })
    if resp_rank is not None:
        by_rank[resp_rank].append(ir or "UNKNOWN")
    if ir:
        by_ir[ir].append(it.get("id"))

# --- Report A: duplicate ranks (same rank assigned to multiple IR-codes)
dup_ranks = {r: ids for r, ids in by_rank.items() if len(set(ids) - {"UNKNOWN"}) > 1}
if dup_ranks:
    print("\n‚ùó Duplicate ranks detected (same rank appears for multiple institutes):")
    for r, irs in sorted(dup_ranks.items()):
        uniq = [x for x in sorted(set(irs)) if x != "UNKNOWN"]
        details = []
        for ir in uniq:
            refrow = ref_map.get(ir, {})
            details.append(f"{ir} ({refrow.get('name','?')}, {refrow.get('state','?')})")
        print(f"  - Rank {r}: " + " | ".join(details))
else:
    print("\n‚úÖ No duplicate ranks across items.")

# --- Report B: duplicate institute_id (same IR-code has multiple items)
dup_ir = {ir: ids for ir, ids in by_ir.items() if len(ids) > 1}
if dup_ir:
    print("\n‚ùó Duplicate items for same institute_id:")
    for ir, ids in sorted(dup_ir.items()):
        nm = ref_map.get(ir, {}).get("name","?")
        print(f"  - {ir} ({nm}): {', '.join(ids)}")
else:
    print("\n‚úÖ No duplicate items for the same IR-code.")

# --- Report C: consistency with df_nirf (rank & score)
mismatches = []
missing_in_ref = []
for row in parsed_rows:
    ir = row["institute_id"]
    if not ir:
        mismatches.append(("NO_IR_IN_ITEM", row["id"], None, None, None))
        continue
    if ir not in ref_map:
        missing_in_ref.append((ir, row["id"]))
        continue
    r_ref = ref_map[ir]["rank"]
    s_ref = ref_map[ir]["score"]
    # Prefer response numbers; fallback to span numbers
    r_item = row["resp_rank"] if row["resp_rank"] is not None else row["span_rank"]
    s_item = row["resp_score"] if row["resp_score"] is not None else row["span_score"]

    if (r_item is not None and r_ref is not None and int(r_item) != int(r_ref)) or \
       (s_item is not None and s_ref is not None and abs(float(s_item) - float(s_ref)) > 1e-6):
        mismatches.append((ir, row["id"], (r_item, s_item), (r_ref, s_ref), ref_map[ir]["name"]))

if missing_in_ref:
    print("\n‚ö†Ô∏è IR-codes in JSONL that are missing from df_nirf (check scraping/merge):")
    for ir, iid in missing_in_ref:
        print(f"  - {ir} (item {iid})")
else:
    print("\n‚úÖ All IR-codes in JSONL exist in df_nirf.")

if mismatches:
    print("\n‚ùó Number mismatches vs df_nirf (item vs reference):")
    for ir, iid, have, want, nm in mismatches[:20]:
        print(f"  - {iid} | {ir} ({nm}) -> item rank/score={have} ; df_nirf rank/score={want}")
    if len(mismatches) > 20:
        print(f"  ‚Ä¶ and {len(mismatches)-20} more")
else:
    print("\n‚úÖ All item ranks/scores match df_nirf.")

# --- Optional: compact dataframe summary of anomalies
rows_summary = []

for r, irs in dup_ranks.items():
    for ir in sorted(set(irs) - {"UNKNOWN"}):
        rows_summary.append({"type":"DUP_RANK", "rank": r, "institute_id": ir,
                             "name": ref_map.get(ir,{}).get("name","?"),
                             "state": ref_map.get(ir,{}).get("state","?")})

for ir, ids in dup_ir.items():
    rows_summary.append({"type":"DUP_INSTITUTE", "rank": ref_map.get(ir,{}).get("rank","?"),
                         "institute_id": ir, "name": ref_map.get(ir,{}).get("name","?"),
                         "state": ref_map.get(ir,{}).get("state","?"),
                         "items": ", ".join(ids)})

for entry in mismatches:
    ir, iid, have, want, nm = entry
    rows_summary.append({"type":"MISMATCH",
                         "institute_id": ir, "name": nm,
                         "item_id": iid,
                         "item_rank": None if have is None else have[0],
                         "item_score": None if have is None else have[1],
                         "ref_rank": None if want is None else want[0],
                         "ref_score": None if want is None else want[1]})

if rows_summary:
    df_anom = pd.DataFrame(rows_summary)
    print("\nüßæ Anomalies summary (first 20 rows):")
    display(df_anom.head(20))
else:
    print("\nüéâ No anomalies found by QA checks.")


üì¶ Loaded SFT items: 100

‚ùó Duplicate ranks detected (same rank appears for multiple institutes):
  - Rank 27: IR-O-U-0273 (Indian Institute of Technology Indore, Madhya Pradesh) | IR-O-U-0356 (Kalinga Institute of Industrial Technology, Odisha)
  - Rank 64: IR-O-U-0308 (Institute of Chemical Technology, Maharashtra) | IR-O-U-0564 (UPES, Uttarakhand)

‚úÖ No duplicate items for the same IR-code.

‚úÖ All IR-codes in JSONL exist in df_nirf.

‚úÖ All item ranks/scores match df_nirf.

üßæ Anomalies summary (first 20 rows):


Unnamed: 0,type,rank,institute_id,name,state
0,DUP_RANK,27,IR-O-U-0273,Indian Institute of Technology Indore,Madhya Pradesh
1,DUP_RANK,27,IR-O-U-0356,Kalinga Institute of Industrial Technology,Odisha
2,DUP_RANK,64,IR-O-U-0308,Institute of Chemical Technology,Maharashtra
3,DUP_RANK,64,IR-O-U-0564,UPES,Uttarakhand


In [15]:
# Cell ‚Äî Fix duplicate ranks by syncing from Selenium df, rebuild JSONL, and re-run QA

from pathlib import Path
import json
import pandas as pd
import numpy as np
import re
from collections import defaultdict

# --- 0) Preconditions
# Assumes you still have:
#   - df          : Selenium-parsed table (authoritative for Rank)
#   - df_nirf     : coalesced/normalized NIRF Overall dataframe
#   - build_lookup_item_from_nirf_row, validate_sft_item, build_nirf_evidence (patched), nirf_doc_id
#   - SFT_DIR     : output directory
# If df is not present, we‚Äôll fall back to re-ranking by score order (deterministic), but df is preferred.

def _to_int(x):
    try:
        return int(str(x).strip())
    except:
        return None

def _to_float(x):
    try:
        return float(str(x).replace(",", "").strip())
    except:
        return None

# --- 1) Build authoritative rank map from Selenium df (preferred), else None
rank_map = {}
if "df" in globals() and isinstance(df, pd.DataFrame) and "institute_id" in df and "rank" in df:
    tmp = df[["institute_id","rank"]].dropna(subset=["institute_id"]).copy()
    tmp["institute_id"] = tmp["institute_id"].astype(str)
    tmp["rank"] = tmp["rank"].map(_to_int)
    rank_map = {r["institute_id"]: r["rank"] for _, r in tmp.iterrows() if r["rank"] is not None}

print(f"üîó Authoritative rank_map from Selenium table: {len(rank_map)} entries")

# --- 2) Overwrite ranks in df_nirf using rank_map
df_fix = df_nirf.copy()
df_fix["institute_id"] = df_fix["institute_id"].astype(str)
before_dups = df_fix["rank"].duplicated(keep=False).sum()

# Apply authoritative ranks when available
mask_has_map = df_fix["institute_id"].isin(rank_map.keys())
df_fix.loc[mask_has_map, "rank"] = df_fix.loc[mask_has_map, "institute_id"].map(rank_map)

# --- 3) If duplicates remain (or if no rank_map), do a deterministic fallback:
# Unique ranks by sorting Score desc, then by institute_id as tiebreaker, assign 1..N
def _dedupe_by_score(df_in: pd.DataFrame) -> pd.DataFrame:
    df2 = df_in.copy()
    # Coerce numeric
    df2["score"] = df2["score"].map(_to_float)
    # Stable ordering: score desc, then name, then IR
    df2 = df2.sort_values(
        by=["score", "institute_name_norm", "institute_id"],
        ascending=[False, True, True],
        kind="mergesort"
    ).reset_index(drop=True)
    df2["rank"] = np.arange(1, len(df2) + 1, dtype=int)
    return df2

dup_after_map = df_fix["rank"].duplicated(keep=False).sum()
if dup_after_map > 0 or len(rank_map) == 0:
    print(f"‚ö†Ô∏è Duplicate ranks remain ({dup_after_map}) or no rank_map available. Applying deterministic re-ranking by score.")
    df_fix = _dedupe_by_score(df_fix)

# Sanity
dup_final = df_fix["rank"].duplicated(keep=False).sum()
assert dup_final == 0, "Ranks are still duplicated after fix‚Äîplease inspect data."

# --- 4) Rebuild FULL JSONL using corrected ranks
FULL_OUT_FIXED = Path(SFT_DIR) / "sft_nirf_overall_lookup_full_fixed.jsonl"

items = []
seq = 1
for _, r in df_fix.iterrows():
    it = build_lookup_item_from_nirf_row(r, seq=seq)
    if validate_sft_item(it, verbose=False):
        items.append(it)
        seq += 1

with FULL_OUT_FIXED.open("w", encoding="utf-8") as f:
    for it in items:
        f.write(json.dumps(it, ensure_ascii=False) + "\n")

print(f"‚úÖ Rebuilt JSONL with corrected ranks: {FULL_OUT_FIXED.resolve()}")
print(f"üßÆ Items written: {len(items)}")

# --- 5) Quick QA re-run (duplicate rank check only, on the fixed file)
def _qa_duplicates(jsonl_path: Path):
    with jsonl_path.open("r", encoding="utf-8") as f:
        lines = [json.loads(x) for x in f if x.strip()]
    resp_rank_re = re.compile(r"\b(rank|ranked)\s+(\d+)\b", re.I)
    by_rank = defaultdict(list)
    for it in lines:
        resp = str(it.get("response",""))
        m = resp_rank_re.search(resp)
        if m:
            r = int(m.group(2))
            by_rank[r].append(it["id"])
    dups = {r:v for r,v in by_rank.items() if len(v) > 1}
    return dups, len(lines)

dups, total = _qa_duplicates(FULL_OUT_FIXED)
if dups:
    print("‚ùó Duplicate ranks still present after fix:")
    for r, ids in sorted(dups.items()):
        print(f"  - Rank {r}: {', '.join(ids[:6])}{' ...' if len(ids) > 6 else ''}")
else:
    print(f"üéâ No duplicate ranks in responses across {total} items (fixed).")

# --- 6) Replace df_nirf in session with fixed ordering/ranks for any future steps
df_nirf = df_fix.copy()


üîó Authoritative rank_map from Selenium table: 100 entries
‚ö†Ô∏è Duplicate ranks remain (4) or no rank_map available. Applying deterministic re-ranking by score.
‚úÖ Rebuilt JSONL with corrected ranks: /teamspace/studios/this_studio/data/sft/sft_nirf_overall_lookup_full_fixed.jsonl
üßÆ Items written: 100
üéâ No duplicate ranks in responses across 100 items (fixed).
