<a href="https://colab.research.google.com/github/ProfDee92/Cancer-3IPMLM/blob/main/At.Ranker_Weighted_Score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ranker: a model that ranks articles using researcher-defined weights for the relevance of words in the title and abstract.

Download the articles for your research from the relevant database(s) using a specific search string. Deduplicate them to remove the inherent cross-database redundancy. Then load them into Ranker and define weights that reflect the importance or relevance of specific words or phrases to your study; Ranker automatically applies those weights to score and rank every article. This makes it easier to achieve higher precision in study quality assessment.

In [None]:
import pandas as pd
import os
import re
import unicodedata
from typing import Dict, Optional, Sequence, Union, Iterable

# ---- Utility: detect likely text and year columns ----
def detect_columns(df: pd.DataFrame):
    """Guess which columns hold free text and which column holds the year.

    A column is treated as text when its dtype is object/string and the mean
    length of its non-missing values (as strings) is greater than 5
    characters; this heuristic is meant to skip short IDs and codes.  If no
    column qualifies, the first column is used as a fallback.

    The year column is the first column whose label contains "year"
    (case-insensitive).

    Args:
        df: Table of articles, one row per article.

    Returns:
        A ``(text_cols, year_col)`` tuple.  ``text_cols`` is a list of column
        labels (empty only when ``df`` has no columns at all) and ``year_col``
        is a column label or ``None`` when no label mentions "year".
    """
    text_candidates = []
    for col in df.columns:
        series = df[col]
        if series.dtype == object or pd.api.types.is_string_dtype(series):
            present = series.dropna()
            avg_len = present.astype(str).map(len).mean() if not present.empty else 0
            if avg_len > 5:  # heuristic: exclude IDs/codes
                text_candidates.append(col)

    if text_candidates:
        text_cols = text_candidates
    elif len(df.columns) > 0:
        # Nothing looked like text: fall back to the first column.
        text_cols = [df.columns[0]]
    else:
        # A frame without columns has nothing to fall back to.
        text_cols = []

    year_col = None
    for col in df.columns:
        # Labels may be ints or other non-strings (e.g. header-less files),
        # so stringify before matching.
        if re.search(r"year", str(col), flags=re.IGNORECASE):
            year_col = col
            break
    return text_cols, year_col

# ---- Weighted ranking function (tidied from above) ----
def rank_and_filter(
    df: pd.DataFrame,
    weights: Dict[str, float],
    text_cols: Union[str, Sequence[str]],
    year_col: Optional[str] = None,
    core_keywords: Optional[Iterable[str]] = None,
    score_col: str = "Weighted_Score",
    exclude_threshold: Optional[float] = 2.0,
    guard_threshold: Optional[float] = 4.0,
) -> pd.DataFrame:
    """Score each row by weighted keyword occurrences and return it ranked.

    The cells of ``text_cols`` are normalised (NFKC, lower-cased, whitespace
    collapsed) and joined with a space.  Every whole-word occurrence of a
    keyword from ``weights`` adds that keyword's weight to the row score.
    Keywords are normalised the same way as the text; keywords that are empty
    after normalisation are ignored.

    Args:
        df: Input table; it is not modified.
        weights: Mapping of keyword/phrase to the weight added per occurrence.
        text_cols: Column label, or sequence of labels, whose text is scored.
        year_col: Optional column used as a secondary sort key (newest first,
            after numeric coercion).  Ignored when absent from ``df``.
        core_keywords: Optional keywords; a row containing any of them gets
            ``CoreHit = True``.  A single string is treated as one keyword.
        score_col: Name of the score column added to the output.
        exclude_threshold: When not ``None``, a row is kept only if its score
            is strictly greater than this value.
        guard_threshold: When not ``None`` and core keywords were supplied, a
            row is kept only if its score is strictly greater than this value
            or it has a core hit.

    Returns:
        A copy of ``df`` with ``score_col`` added, ``CoreHit`` added when core
        keywords were supplied, and ``PreFilter_Keep`` added when at least one
        threshold is not ``None``.  Rows are sorted by score (then year)
        descending and the index is reset.
    """
    def _norm(value: object) -> str:
        # Missing cells (None, NaN, pd.NA, NaT) contribute no text at all.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        folded = unicodedata.normalize("NFKC", str(value)).lower()
        return re.sub(r"\s+", " ", folded).strip()

    def _keyword_pattern(keyword: object):
        # An empty keyword would compile to r"\b\b" and match at every word
        # boundary, so it is dropped instead.
        normalized = _norm(keyword)
        if not normalized:
            return None
        return re.compile(rf"\b{re.escape(normalized)}\b", re.IGNORECASE)

    if isinstance(text_cols, str):
        text_cols = [text_cols]
    columns = list(text_cols)

    # Normalise the raw cell values; stringifying first would turn missing
    # values into the literal words "nan"/"none", which could then be scored.
    if columns:
        texts = [
            " ".join(_norm(cell) for cell in row)
            for row in df[columns].itertuples(index=False, name=None)
        ]
    else:
        texts = [""] * len(df)

    compiled = []
    for keyword, weight in weights.items():
        pattern = _keyword_pattern(keyword)
        if pattern is not None:
            compiled.append((pattern, float(weight)))

    def _score(text: str) -> float:
        total = 0.0
        for pattern, weight in compiled:
            total += len(pattern.findall(text)) * weight
        return total

    df_out = df.copy()
    df_out[score_col] = pd.Series(
        [_score(text) for text in texts], index=df.index, dtype=float
    )

    # Core keyword hits
    if isinstance(core_keywords, str):
        core_keywords = [core_keywords]
    # Materialise once: an iterator is always truthy, even when it is empty.
    core_patterns = [
        pattern
        for pattern in map(_keyword_pattern, core_keywords or ())
        if pattern is not None
    ]
    core_hit = None
    if core_patterns:
        core_hit = pd.Series(
            [any(p.search(text) for p in core_patterns) for text in texts],
            index=df.index,
            dtype=bool,
        )
        df_out["CoreHit"] = core_hit

    # Deterministic filter flags
    if exclude_threshold is not None or guard_threshold is not None:
        keep_mask = pd.Series(True, index=df_out.index)
        if exclude_threshold is not None:
            keep_mask &= df_out[score_col].gt(exclude_threshold)
        if guard_threshold is not None and core_hit is not None:
            keep_mask &= df_out[score_col].gt(guard_threshold) | core_hit
        df_out["PreFilter_Keep"] = keep_mask

    # Sort
    if year_col is not None and year_col in df_out.columns:
        # Pick a temporary name that cannot clobber an existing column.
        tmp_col = "_year_sort"
        while tmp_col in df_out.columns:
            tmp_col = "_" + tmp_col
        year_key = pd.to_numeric(df_out[year_col], errors="coerce")
        df_out = (
            df_out.assign(**{tmp_col: year_key})
                  .sort_values([score_col, tmp_col], ascending=[False, False])
                  .drop(columns=tmp_col)
        )
    else:
        # A stable sort keeps tied rows in their input order.
        df_out = df_out.sort_values(score_col, ascending=False, kind="mergesort")
    return df_out.reset_index(drop=True)

# ---- End-to-end runner ----
def process_file(
    file_path: str,
    weights: Dict[str, float],
    core_keywords: Optional[Iterable[str]] = None,
    output_path: Optional[str] = None
) -> pd.DataFrame:
    """Load an article table, rank it with the given weights and save it as CSV.

    Args:
        file_path: Path to the input file.  ``.csv``/``.txt`` files are read
            with ``pd.read_csv``; ``.xlsx``/``.xls`` files with
            ``pd.read_excel``.
        weights: Keyword-to-weight mapping passed to ``rank_and_filter``.
        core_keywords: Optional core keywords passed to ``rank_and_filter``.
        output_path: Where to write the ranked CSV.  Defaults to the input
            path with its extension replaced by ``_ranked.csv``.

    Returns:
        The ranked DataFrame that was written to ``output_path``.

    Raises:
        ValueError: If the extension of ``file_path`` is not supported.
    """
    # Load: the reader is chosen from the extension of the file actually
    # passed in, not from a fixed example path.
    base, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext in [".csv", ".txt"]:
        df = pd.read_csv(file_path)
    elif ext in [".xlsx", ".xls"]:
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file type. Use CSV or Excel.")

    # Detect columns
    text_cols, year_col = detect_columns(df)
    print(f"Detected text columns: {text_cols}, year column: {year_col}")

    # Rank & filter
    ranked = rank_and_filter(df, weights=weights, text_cols=text_cols,
                             year_col=year_col, core_keywords=core_keywords)

    # Save
    if output_path is None:
        output_path = f"{base}_ranked.csv"
    ranked.to_csv(output_path, index=False)
    print(f"Saved ranked file to: {output_path}")
    return ranked

# ---- Example usage ----
if __name__ == "__main__":
    # Example weights: each whole-word occurrence of a term adds its weight
    # to an article's score.
    weights = {
        # NOTE(review): "multi-omics" used to be listed twice (weights 3 and
        # 2).  A dict literal keeps only the last value, so 2 is the weight
        # that was actually applied -- confirm which weight was intended.
        "multi-omics": 2,
        "integration": 3,
        "transcriptomic": 3,
        "Bayesian": 2,
        "deep learning": 2,
    }
    # Articles mentioning any of these are flagged as core hits.
    core_keywords = ["pan-cancer", "multi-cancer"]
    ranked_df = process_file("/content/Camera Ready Final SLR.xlsx", weights, core_keywords)
    print(ranked_df.head(30))
