In [1]:
# Core
import numpy as np
import pandas as pd
import json
import joblib
import warnings
warnings.filterwarnings("ignore")

# ML
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [2]:
np.random.seed(42)
N = 6000

def choice(options, probs):
    return np.random.choice(options, size=N, p=probs)

data = pd.DataFrame({
    # Symptoms
    "hematuria": choice([0,1],[0.7,0.3]),
    "flank_pain": choice([0,1,2],[0.5,0.3,0.2]),   # 0=None,1=Mild,2=Severe
    "lower_back_pain": choice([0,1],[0.6,0.4]),
    "fever": choice([0,1],[0.75,0.25]),
    "loss_of_appetite": choice([0,1],[0.7,0.3]),
    "weight_loss": choice([0,1],[0.8,0.2]),
    "fatigue": choice([0,1],[0.65,0.35]),
    "anemia": choice([0,1],[0.85,0.15]),

    # Medical history
    "hypertension": choice([0,1],[0.55,0.45]),
    "diabetes": choice([0,1],[0.7,0.3]),
    "ckd": choice([0,1],[0.8,0.2]),
    "family_kidney_tumor": choice([0,1],[0.9,0.1]),
    "family_hypertension": choice([0,1],[0.6,0.4]),

    # Lifestyle
    "smoking": choice([0,1,2],[0.5,0.25,0.25]),  # 0=Never,1=Former,2=Current
    "alcohol": choice([0,1,2],[0.45,0.35,0.2]),  # 0=None,<7,>7
    "bmi": np.clip(np.random.normal(27, 5, N), 16, 45),
    "chemical_exposure": choice([0,1],[0.85,0.15]),
    "physical_activity": choice([0,1,2],[0.3,0.4,0.3])  # 0=Low,1=Mod,2=High
})


In [3]:
def tumor_logic(row):
    score = (
        row["hematuria"]*3 +
        row["flank_pain"]*2 +
        row["weight_loss"]*2 +
        row["anemia"]*2 +
        row["smoking"] +
        row["hypertension"] +
        row["ckd"] +
        row["chemical_exposure"]*2 +
        (row["bmi"] > 30)
    )
    if score > 8:
        return "renal_cell_carcinoma"
    elif score > 5:
        return "benign_renal_mass"
    else:
        return "no_tumor"

data["tumor_type"] = data.apply(tumor_logic, axis=1)


In [4]:
data["ckd_risk"] = (
    (data["hypertension"] == 1) &
    (data["diabetes"] == 1) &
    (data["bmi"] > 30)
).astype(int)


In [5]:
data["rcc"] = (data["tumor_type"] == "renal_cell_carcinoma").astype(int)
data["benign"] = (data["tumor_type"] == "benign_renal_mass").astype(int)


In [6]:
X = data.drop(columns=["tumor_type","rcc","benign","ckd_risk"])
y_rcc = data["rcc"]
y_ckd = data["ckd_risk"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_rcc, test_size=0.25, random_state=42
)


In [7]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
    ("model", XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    ))
])


In [8]:
param_grid = {
    "model__n_estimators": [100,200],
    "model__max_depth": [3,5],
    "model__learning_rate": [0.05,0.1]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_

print("Best ROC-AUC:", grid.best_score_)


Best ROC-AUC: 0.986660879367801


In [9]:
joblib.dump(best_model, "kidney_risk_inference_model.pkl")
print("✅ Model saved")

✅ Model saved
