In [21]:
# TF-IDF ADR builder (safe defaults), notebook-friendly (no CLI).
# ---------------------------------------------------------------
# Requirements:
#   pip install numpy pandas scipy pyarrow scikit-learn pyyaml
#
# Inputs expected in memory:
#   dti_df: columns ["rxcui", ...]
#   adr_df: columns ["rxnorm_ingredient_id", "meddra_id", "meddra_name"]
#
# Output directory (Windows path OK):
#   r"F:\Thesis Korbi na\dti-prediction-with-adr\Data\TFIDF_ADR_vectors"

from __future__ import annotations
from dataclasses import dataclass
from typing import Optional, Sequence, Dict, Tuple
import os, json
import numpy as np
import pandas as pd
from pathlib import Path
import yaml
from scipy import sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer

In [22]:
# =========================
# Options (safe defaults)
# =========================
@dataclass
class TFIDFOptions:
    """Settings bundle consumed by ``build_and_save_tfidf_parquets``.

    Groups the column names to read from the input frames, the preprocessing
    switches, the TF-IDF hyperparameters, the document-frequency filter and
    the output/saving choices. Every field has a default, so ``TFIDFOptions()``
    is a complete configuration.
    """
    # Column names
    rxcui_col_dti: str = "rxcui"                   # drug-id column in the DTI frame
    rxcui_col_adr: str = "rxnorm_ingredient_id"    # drug-id column in the ADR frame
    meddra_id_col: str = "meddra_id"               # ADR identifier column (cast to int64 by the builder)
    meddra_name_col: str = "meddra_name"  # only for preview

    # Preprocessing
    intersect_with_dti: bool = True     # keep only ADR rows whose drug id also occurs in the DTI frame
    drop_duplicates_pairs: bool = True  # drop duplicate (rxcui, meddra_id)

    # TF-IDF hyperparams (safe defaults); forwarded to sklearn's TfidfTransformer
    norm: Optional[str] = "l2"          # None, "l1", or "l2"
    sublinear_tf: bool = False
    smooth_idf: bool = True

    # Column filtering (computed on TRAIN drugs only, avoids leakage)
    # If float in (0,1] => fraction of train drugs; if int >=1 => absolute count
    min_df: float | int = 1
    max_df: float | int = 1.0           # 1.0 keeps all

    # Saving
    # NOTE: the default is a machine-specific absolute Windows path; override it
    # when running anywhere else.
    output_dir: str = r"F:\Thesis Korbi na\dti-prediction-with-adr\Data\TFIDF_ADR_vectors"
    save_long: bool = True              # write tfidf_long.parquet (one row per non-zero entry)
    save_wide: bool = True              # write tfidf_wide.parquet (one column per ADR)

    # Dtypes
    float_dtype: str = "float32"        # dtype of the stored TF-IDF / IDF values

    # Previews
    preview_n_drugs: int = 3  # set 0 to skip


In [23]:
# =========================
# Helpers
# =========================
def _ensure_str_series(s: pd.Series) -> pd.Series:
    return s.astype(str).str.strip()

def _compute_df_thresholds(min_df, max_df, n_docs: int) -> Tuple[int, int]:
    """Return absolute (min_df_abs, max_df_abs) based on number of train docs."""
    if isinstance(min_df, float) and 0 < min_df <= 1:
        min_df_abs = int(np.ceil(min_df * n_docs))
    elif isinstance(min_df, int) and min_df >= 1:
        min_df_abs = min_df
    else:
        min_df_abs = 1

    if isinstance(max_df, float) and 0 < max_df <= 1:
        max_df_abs = int(np.floor(max_df * n_docs))
    elif isinstance(max_df, int) and max_df >= 1:
        max_df_abs = max_df
    else:
        max_df_abs = n_docs

    min_df_abs = max(1, min_df_abs)
    max_df_abs = max(min_df_abs, min(n_docs, max_df_abs))
    return min_df_abs, max_df_abs

def _to_long_parquet(X: sp.spmatrix, drugs: np.ndarray, adrs: np.ndarray,
                     out_path: str, float_dtype="float32"):
    """Save sparse matrix to long (tidy) parquet with columns [rxcui, meddra_id, tfidf]."""
    X = X.tocoo(copy=False)
    df_long = pd.DataFrame({
        "rxcui": drugs[X.row],
        "meddra_id": adrs[X.col],
        "tfidf": X.data.astype(float_dtype)
    })
    df_long.to_parquet(out_path, index=False)

def _to_wide_parquet(X: sp.spmatrix, drugs: np.ndarray, adrs: np.ndarray,
                     out_path: str, float_dtype="float32", chunk_cols: int = 2000):
    """
    Save to wide parquet: index=rxcui, columns=meddra_<id>.
    Writes in column chunks to keep memory friendly.
    """
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    n_drugs, n_adrs = X.shape
    X = X.tocsr(copy=False)

    drug_index = pd.Index(drugs, name="rxcui")
    parts = []
    for start in range(0, n_adrs, chunk_cols):
        end = min(start + chunk_cols, n_adrs)
        X_chunk = X[:, start:end].toarray().astype(float_dtype, copy=False)  # dense per chunk
        cols = [f"meddra_{int(mid)}" for mid in adrs[start:end]]
        df_chunk = pd.DataFrame(X_chunk, index=drug_index, columns=cols)
        parts.append(df_chunk)

    df_wide = pd.concat(parts, axis=1)
    df_wide.reset_index().to_parquet(out_path, index=False)

In [24]:
# =========================
# Core builder
# =========================
def build_and_save_tfidf_parquets(
    dti_df: pd.DataFrame,
    adr_df: pd.DataFrame,
    options: Optional[TFIDFOptions] = None,
    train_rxcui: Optional[Sequence[str]] = None,
    val_rxcui: Optional[Sequence[str]] = None,
    test_rxcui: Optional[Sequence[str]] = None,
    per_split_subdirs: bool = True,
):
    """
    Build TF-IDF features for ADRs and save Parquet artifacts (safe defaults).

    Rows of the feature matrix are the unique drug ids of ``dti_df``; columns
    are the MedDRA ids of ``adr_df``. Document-frequency filtering and the IDF
    weights are fit on the train drugs only when splits are given, otherwise on
    all drugs.

    Parameters
    ----------
    dti_df : DataFrame containing ``options.rxcui_col_dti``.
    adr_df : DataFrame containing ``options.rxcui_col_adr`` and
        ``options.meddra_id_col`` (``options.meddra_name_col`` is optional and
        only used for the preview file).
    options : TFIDFOptions or None. ``None`` builds a fresh ``TFIDFOptions()``
        at call time, so edits to the options class are picked up without
        re-defining this function.
    train_rxcui, val_rxcui, test_rxcui : optional drug-id sequences. Ids are
        converted to stripped strings (like the vocabulary), de-duplicated in
        order, and ids absent from ``dti_df`` are ignored. If none is given a
        single "all" split is produced.
    per_split_subdirs : when True, named splits are written to
        ``<output_dir>/<split>/``. When False they are written to
        ``<output_dir>/``; if more than one split shares that directory the
        file names get a ``<split>_`` prefix so splits do not overwrite each
        other.

    Files written
    -------------
    In ``output_dir``: ``drug_index.parquet``, ``adr_index.parquet``,
    ``idf_table.parquet``, ``global_stats.json``.
    Per split: ``tfidf_long.parquet`` / ``tfidf_wide.parquet`` (depending on
    ``options.save_long`` / ``options.save_wide``), ``stats.json`` and, when
    names are available and ``options.preview_n_drugs > 0``,
    ``preview_top_tfidf.parquet``.

    Returns
    -------
    dict with keys ``adrs_kept``, ``drugs``, ``idf_table`` and ``stats``.

    Raises
    ------
    AssertionError : a required column is missing.
    ValueError : splits were given without a usable train split, a split
        contains no known drugs, or no ADR column survives the df filter.
    """
    # Build the default at call time instead of at definition time.
    if options is None:
        options = TFIDFOptions()
    os.makedirs(options.output_dir, exist_ok=True)

    # --- Validate presence of required columns ---
    for col in [options.rxcui_col_dti]:
        assert col in dti_df.columns, f"DTI is missing column: {col}"
    for col in [options.rxcui_col_adr, options.meddra_id_col]:
        assert col in adr_df.columns, f"ADR is missing column: {col}"

    # --- Normalize types (copy only the columns that are actually used) ---
    dti_rxcui = _ensure_str_series(dti_df[options.rxcui_col_dti])
    adr_cols = [options.rxcui_col_adr, options.meddra_id_col]
    if options.meddra_name_col in adr_df.columns and options.meddra_name_col not in adr_cols:
        adr_cols.append(options.meddra_name_col)
    adr = adr_df[adr_cols].copy()
    adr[options.rxcui_col_adr] = _ensure_str_series(adr[options.rxcui_col_adr])

    # --- Drug vocabulary ---
    drugs = np.array(sorted(dti_rxcui.unique()), dtype=object)

    # --- Keep only ADR rows that appear in DTI ---
    if options.intersect_with_dti:
        adr = adr[adr[options.rxcui_col_adr].isin(drugs)].copy()

    # Cast ids once, before de-duplication, so equal ids compare equal.
    adr[options.meddra_id_col] = adr[options.meddra_id_col].astype(np.int64)

    # --- Drop duplicate (drug, ADR) pairs ---
    if options.drop_duplicates_pairs:
        adr = adr.drop_duplicates(
            subset=[options.rxcui_col_adr, options.meddra_id_col],
            keep="first"
        )

    # --- ADR vocabulary ---
    adrs_unique = np.array(sorted(adr[options.meddra_id_col].unique()), dtype=np.int64)

    drug_to_row = {r: i for i, r in enumerate(drugs)}
    adr_to_col = {int(a): j for j, a in enumerate(adrs_unique)}

    # Save index maps in ROOT
    pd.DataFrame({"rxcui": drugs, "row_id": np.arange(len(drugs), dtype=int)}).to_parquet(
        os.path.join(options.output_dir, "drug_index.parquet"), index=False
    )
    pd.DataFrame({"meddra_id": adrs_unique, "col_id": np.arange(len(adrs_unique), dtype=int)}).to_parquet(
        os.path.join(options.output_dir, "adr_index.parquet"), index=False
    )

    # --- Presence matrix (CSR) ---
    rows = adr[options.rxcui_col_adr].map(drug_to_row).to_numpy()
    cols = adr[options.meddra_id_col].map(adr_to_col).to_numpy()

    # ADR rows whose drug is not in the vocabulary map to NaN and are skipped.
    mapped = ~(pd.isna(rows) | pd.isna(cols))
    rows = rows[mapped].astype(np.int64, copy=False)
    cols = cols[mapped].astype(np.int64, copy=False)
    data = np.ones(rows.shape[0], dtype=np.float32)

    n_drugs = len(drugs)
    n_adrs = len(adrs_unique)
    presence = sp.coo_matrix((data, (rows, cols)), shape=(n_drugs, n_adrs), dtype=np.float32).tocsr()

    # --- Splits ---
    splits: Dict[str, np.ndarray] = {}
    if train_rxcui is None and val_rxcui is None and test_rxcui is None:
        splits["all"] = np.arange(n_drugs, dtype=int)
        train_mask = splits["all"]
    else:
        def _map_list_to_idx(lst: Sequence[str]) -> np.ndarray:
            # Normalize ids like the vocabulary and drop repeats (order kept),
            # so a repeated id cannot inflate the train document frequencies.
            keys = dict.fromkeys(str(r).strip() for r in lst)
            return np.array([drug_to_row[k] for k in keys if k in drug_to_row], dtype=int)
        if train_rxcui is not None:
            splits["train"] = _map_list_to_idx(train_rxcui)
        if val_rxcui is not None:
            splits["val"] = _map_list_to_idx(val_rxcui)
        if test_rxcui is not None:
            splits["test"] = _map_list_to_idx(test_rxcui)
        if "train" not in splits or len(splits["train"]) == 0:
            raise ValueError("When providing splits, train_rxcui must be non-empty.")
        train_mask = splits["train"]

    # Fail early with a readable message instead of an opaque error further down.
    for split_name, mask_idx in splits.items():
        if len(mask_idx) == 0:
            raise ValueError(
                f"Split '{split_name}' contains no drugs that are present in the DTI data."
            )

    # --- Column filtering (TRAIN only) ---
    presence_train = presence[train_mask]
    df_train = np.asarray((presence_train > 0).sum(axis=0)).ravel().astype(int)
    N_train = presence_train.shape[0]
    min_df_abs, max_df_abs = _compute_df_thresholds(options.min_df, options.max_df, N_train)
    keep_col_mask = (df_train >= min_df_abs) & (df_train <= max_df_abs)
    kept_cols = np.where(keep_col_mask)[0]
    if kept_cols.size == 0:
        raise ValueError(
            "No ADR column satisfies the document-frequency filter "
            f"(min_df_abs={min_df_abs}, max_df_abs={max_df_abs}) on the training drugs."
        )

    presence = presence[:, kept_cols]
    adrs_kept = adrs_unique[kept_cols]
    df_kept = df_train[kept_cols]

    # --- Fit TF-IDF on TRAIN only ---
    tfidf_transformer = TfidfTransformer(
        norm=options.norm,
        use_idf=True,
        smooth_idf=options.smooth_idf,
        sublinear_tf=options.sublinear_tf,
    )
    tfidf_transformer.fit(presence_train[:, kept_cols])

    # Save IDF table in ROOT
    idf_vals = tfidf_transformer.idf_.astype(options.float_dtype)
    idf_table = pd.DataFrame({
        "meddra_id": adrs_kept.astype(np.int64),
        "df": df_kept.astype(np.int32),
        "idf": idf_vals
    })
    idf_table.to_parquet(os.path.join(options.output_dir, "idf_table.parquet"), index=False)

    # Optional name map for previews
    meddra_name_map = None
    if options.meddra_name_col in adr.columns:
        names = adr[[options.meddra_id_col, options.meddra_name_col]].drop_duplicates(
            subset=[options.meddra_id_col]
        )
        meddra_name_map = dict(zip(
            names[options.meddra_id_col].astype(np.int64),
            names[options.meddra_name_col].astype(str),
        ))

    # --- Transform and save per split ---
    all_stats = []

    # Several named splits written flat into output_dir would share file names;
    # in that case every file gets a "<split>_" prefix.
    splits_share_root = (not per_split_subdirs) and len(splits) > 1

    def _split_target(split_name: str) -> Tuple[str, str]:
        """Return (directory, filename prefix) for one split's artifacts."""
        if split_name == "all":
            return options.output_dir, ""
        if per_split_subdirs:
            d = os.path.join(options.output_dir, split_name)
            os.makedirs(d, exist_ok=True)
            return d, ""
        return options.output_dir, (f"{split_name}_" if splits_share_root else "")

    def _save_for_mask(mask_idx: np.ndarray, split_name: str):
        X = tfidf_transformer.transform(presence[mask_idx, :]).astype(options.float_dtype)
        out_dir, prefix = _split_target(split_name)
        split_drugs = drugs[mask_idx]

        if options.save_long:
            _to_long_parquet(
                X, drugs=split_drugs, adrs=adrs_kept,
                out_path=os.path.join(out_dir, f"{prefix}tfidf_long.parquet"),
                float_dtype=options.float_dtype
            )
        if options.save_wide:
            _to_wide_parquet(
                X, drugs=split_drugs, adrs=adrs_kept,
                out_path=os.path.join(out_dir, f"{prefix}tfidf_wide.parquet"),
                float_dtype=options.float_dtype
            )

        # Preview: top-10 ADRs by TF-IDF for the first few drugs of the split.
        if meddra_name_map is not None and options.preview_n_drugs > 0:
            n_preview = min(options.preview_n_drugs, len(mask_idx))
            previews = []
            X_csr = X.tocsr()
            # Row i of X corresponds to split_drugs[i].
            for local_row in range(n_preview):
                row = X_csr[local_row]
                if row.nnz == 0:
                    continue
                top_local = np.argsort(row.data)[::-1][:10]
                for c, s in zip(row.indices[top_local], row.data[top_local]):
                    mid = int(adrs_kept[c])
                    previews.append({
                        "split": split_name,
                        "rxcui": split_drugs[local_row],
                        "meddra_id": mid,
                        "meddra_name": meddra_name_map.get(mid, ""),
                        "tfidf": float(s)
                    })
            if previews:
                pd.DataFrame(previews).to_parquet(
                    os.path.join(out_dir, f"{prefix}preview_top_tfidf.parquet"),
                    index=False
                )

        density = X.nnz / (X.shape[0] * X.shape[1]) if X.shape[0] and X.shape[1] else 0.0
        split_stats = {"split": split_name, "n_drugs": int(X.shape[0]), "n_adrs": int(X.shape[1]),
                       "nnz": int(X.nnz), "density": float(density)}
        # also save split stats next to the split's parquet files
        with open(os.path.join(out_dir, f"{prefix}stats.json"), "w", encoding="utf-8") as f:
            json.dump(split_stats, f, indent=2)
        return split_stats

    # Process splits ("all" is the only key when no split lists were given)
    for split_name, mask_idx in splits.items():
        all_stats.append(_save_for_mask(mask_idx, split_name))

    # --- Global stats in ROOT ---
    if "all" in splits:
        note = "IDF fit on all drugs (no splits provided)."
    elif per_split_subdirs:
        note = "IDF fit on train only; per-split files saved to subfolders."
    elif splits_share_root:
        note = "IDF fit on train only; per-split files saved to output_dir with a '<split>_' filename prefix."
    else:
        note = "IDF fit on train only; split files saved to output_dir."

    global_stats = {
        "n_drugs_total": int(n_drugs),
        "n_adrs_original": int(n_adrs),
        "n_adrs_kept": int(len(adrs_kept)),
        "min_df_abs": int(min_df_abs),
        "max_df_abs": int(max_df_abs),
        "splits": all_stats,
        "norm": options.norm,
        "sublinear_tf": options.sublinear_tf,
        "smooth_idf": options.smooth_idf,
        "note": note
    }
    with open(os.path.join(options.output_dir, "global_stats.json"), "w", encoding="utf-8") as f:
        json.dump(global_stats, f, indent=2)

    return {"adrs_kept": adrs_kept, "drugs": drugs, "idf_table": idf_table, "stats": global_stats}


In [25]:
# Read the notebook configuration. YAML files are UTF-8 by convention, so the
# encoding is stated explicitly instead of relying on the platform default
# (cp1252 on many Windows setups), which could garble non-ASCII paths.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [26]:
# ---- dataset locations, taken from the "paths" section of config.yaml ----
DTI_PATH = config["paths"]["DTI_DATASET"]
ADR_PATH = config["paths"]["ADR_DATASET"]

# Only these columns are loaded from each dataset
DTI_COLS = [
    "drug_chembl_id",
    "target_uniprot_id",
    "label",
    "smiles",
    "sequence",
    "molfile_3d",
    "rxcui",
]
ADR_COLS = [
    "rxnorm_ingredient_id",
    "meddra_id",
    "meddra_name",
]

def _read_any(path: Path, usecols=None) -> pd.DataFrame:
    return pd.read_parquet(path, columns=usecols)

# Load DTI (only the columns listed in DTI_COLS)
dti_df = _read_any(
    DTI_PATH,
    usecols=DTI_COLS,
)

# Load ADR (only the columns listed in ADR_COLS)
adr_df = _read_any(
    ADR_PATH,
    usecols=ADR_COLS,
)

# Final type hygiene: drug ids as stripped strings, MedDRA ids as the smallest
# integer dtype that fits.
dti_df["rxcui"] = dti_df["rxcui"].astype("string").str.strip()
adr_df["rxnorm_ingredient_id"] = adr_df["rxnorm_ingredient_id"].astype("string").str.strip()
adr_df["meddra_id"] = pd.to_numeric(adr_df["meddra_id"], downcast="integer")

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
dti_df.info()
adr_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34741 entries, 0 to 34740
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   drug_chembl_id     34741 non-null  object
 1   target_uniprot_id  34741 non-null  object
 2   label              34741 non-null  int64 
 3   smiles             34741 non-null  object
 4   sequence           34741 non-null  object
 5   molfile_3d         34741 non-null  object
 6   rxcui              34741 non-null  string
dtypes: int64(1), object(5), string(1)
memory usage: 1.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69474 entries, 0 to 69473
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   rxnorm_ingredient_id  69474 non-null  string
 1   meddra_id             69474 non-null  int32 
 2   meddra_name           69474 non-null  object
dtypes: int32(1), object(1), string(1)
memory

A) One-shot build (no explicit splits).
Fits IDF on all drugs, saves tfidf_wide.parquet and tfidf_long.parquet:

In [17]:
# Default options, including the default output directory.
opts = TFIDFOptions()

# No train/val/test lists are passed, so the builder produces a single "all"
# split. NOTE: the split-aware build in this notebook uses the same default
# output directory, so it replaces the root-level index/IDF/stats files
# written by this call.
artifacts = build_and_save_tfidf_parquets(dti_df, adr_df, options=opts)
artifacts["stats"]


{'n_drugs_total': 1028,
 'n_adrs_original': 4817,
 'n_adrs_kept': 4817,
 'min_df_abs': 1,
 'max_df_abs': 1028,
 'splits': [{'split': 'all',
   'n_drugs': 1028,
   'n_adrs': 4817,
   'nnz': 69474,
   'density': 0.014029834349648497}],
 'norm': 'l2',
 'sublinear_tf': False,
 'smooth_idf': True,
 'note': 'IDF fit on train only if splits provided; else fit on all.'}

B) Proper split-aware build.
Fit IDF on train drugs only, and save split-specific Parquets:

In [27]:
opts = TFIDFOptions()

# Drug-level 70/15/15 split. It is reproducible for a given dataset file
# because the generator is seeded; the order of unique() follows the row order
# of dti_df, so a re-ordered file yields a different assignment.
SPLIT_SEED = 42
TRAIN_END_FRAC = 0.70   # train = first 70% of the shuffled drugs
VAL_END_FRAC = 0.85     # val = next 15%; test = the rest

rng = np.random.default_rng(SPLIT_SEED)
# Materialise as a NumPy array so the in-place shuffle always applies.
all_drugs = np.array(dti_df["rxcui"].astype(str).unique(), dtype=object)
rng.shuffle(all_drugs)
n = len(all_drugs)
train_end = int(TRAIN_END_FRAC * n)
val_end = int(VAL_END_FRAC * n)
train_drugs = all_drugs[:train_end]
val_drugs   = all_drugs[train_end:val_end]
test_drugs  = all_drugs[val_end:]

artifacts = build_and_save_tfidf_parquets(
    dti_df, adr_df, options=opts,
    train_rxcui=train_drugs, val_rxcui=val_drugs, test_rxcui=test_drugs,
    per_split_subdirs=True,   # each split gets its own subfolder of output_dir
)
artifacts["stats"]


{'n_drugs_total': 1028,
 'n_adrs_original': 4817,
 'n_adrs_kept': 4048,
 'min_df_abs': 1,
 'max_df_abs': 719,
 'splits': [{'split': 'train',
   'n_drugs': 719,
   'n_adrs': 4048,
   'nnz': 47047,
   'density': 0.01616450988692024},
  {'split': 'val',
   'n_drugs': 154,
   'n_adrs': 4048,
   'nnz': 10324,
   'density': 0.016561008161798677},
  {'split': 'test',
   'n_drugs': 155,
   'n_adrs': 4048,
   'nnz': 11222,
   'density': 0.017885375494071147}],
 'norm': 'l2',
 'sublinear_tf': False,
 'smooth_idf': True,
 'note': 'IDF fit on train only if splits provided; per-split files saved to subfolders.'}