# In [None]:
# Import necessary libraries
import re

import numpy as np
import pandas as pd
from sdv.tabular import CTGAN
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier


# Load the aniridia and albinism Excel workbooks into DataFrames
# (same order as the target names on the left-hand side).
aniridia_df, albinism_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "2025-04-11 Aniridiia oftal'molog.xlsx",
        "2025-04-11 Al'binizm oftal'molog.xlsx",
    )
)

# Matches an unsigned integer or decimal number such as "6" or "2.5".
_AGE_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def parse_age_to_years(age_str):
    """Parse a free-text age into a float number of years.

    Recognised forms (case-insensitive, decimal comma or point):

    * years, optionally followed by months: ``"5 лет"``, ``"2,5 года"``,
      ``"1 год 6 мес"``, ``"3 года 2"``
    * months only: ``"6 мес"``, ``"6 mes"``
    * a bare number, interpreted as years: ``"7"``, ``7``, ``"7,5"``

    Args:
        age_str: Raw cell value (string, number or missing value).

    Returns:
        Age in years as ``float``, or ``np.nan`` when the value is missing
        or no number can be extracted.
    """
    if pd.isna(age_str):
        return np.nan

    text = str(age_str).lower().replace(',', '.').strip()
    # Extract numbers as tokens instead of gluing every digit together, so
    # "1 год 6 мес" yields [1.0, 6.0] rather than the single value 16.
    numbers = [float(tok) for tok in _AGE_NUMBER_RE.findall(text)]

    has_years = 'год' in text or 'лет' in text
    has_months = 'mes' in text or 'мес' in text

    if has_years:
        # The year keyword takes precedence: the first number is years and an
        # optional second number is months.
        if not numbers:
            return np.nan
        months = numbers[1] if len(numbers) >= 2 else 0.0
        return numbers[0] + months / 12.0

    if has_months:
        if not numbers:
            return np.nan
        return numbers[0] / 12.0

    try:
        return float(text)
    except ValueError:
        return np.nan

# Encoding of the questionnaire's yes/no answers.
_YES_NO_CODES = {'Да': 1, 'Нет': 0}

# disease_type -> (syndrome label, lower-case keywords searched in the notes).
_SYNDROME_RULES = {
    'aniridia': ('WAGR', ('wagr', 'вагр')),
    # NOTE(review): 'гепат' is carried over unchanged from the original
    # keyword list; confirm it is really the intended marker for HPS.
    'albinism': ('HPS', ('hps', 'hermansky', 'гепат', 'пудлак')),
}


def _find_column(df, *needles, lowercase=True):
    """Return the first column whose name contains every needle, else None."""
    for col in df.columns:
        # str() guards against non-string headers (numbers, dates, ...).
        name = str(col).lower() if lowercase else str(col)
        if all(needle in name for needle in needles):
            return col
    return None


def _yes_no_feature(df, col):
    """Return df[col] encoded as 1/0 ('Да'/'Нет'), or NaN when col is None."""
    if col is None:
        return np.nan
    return df[col].map(_YES_NO_CODES)


def preprocess_dataset(df, disease_type):
    """Build the model-ready feature frame for one questionnaire export.

    Columns are located by (mostly case-insensitive) substrings of their
    Russian headers; a feature whose column is not found is filled with NaN
    (``Photophobia`` is filled with 0 for albinism instead).

    Args:
        df: Raw questionnaire DataFrame.
        disease_type: ``'aniridia'`` or ``'albinism'``; selects the syndrome
            keywords and is copied into the ``DiseaseType`` column.

    Returns:
        DataFrame indexed like ``df`` with the columns ``Age``, ``Sex_Male``,
        ``Nystagmus``, ``Photophobia``, ``Cataract``, ``GeneticTestDone``,
        ``UsesDevice``, ``Glaucoma``, ``SyndromeLabel`` (``'None'``,
        ``'WAGR'`` or ``'HPS'``) and ``DiseaseType``.
    """
    # Share df's index up front so scalar fills and Series assignments align.
    result = pd.DataFrame(index=df.index)

    # Age in years: prefer the exact headers, then fall back to any header
    # that contains 'Возраст' (previously such a header passed the existence
    # check but was then looked up under the literal name 'Возраст').
    age_col = next(
        (name for name in ('Возраст пациента', 'Возраст') if name in df.columns),
        None,
    )
    if age_col is None:
        age_col = _find_column(df, 'Возраст', lowercase=False)
    if age_col is None:
        result['Age'] = np.nan
    else:
        result['Age'] = df[age_col].apply(parse_age_to_years)

    if 'Пол' in df.columns:
        result['Sex_Male'] = df['Пол'].map({'М': 1, 'Ж': 0})
    else:
        result['Sex_Male'] = np.nan

    result['Nystagmus'] = _yes_no_feature(df, _find_column(df, 'нистагм'))

    photo_col = _find_column(df, 'светобоязнь')
    if photo_col is None and disease_type == 'albinism':
        result['Photophobia'] = 0
    else:
        result['Photophobia'] = _yes_no_feature(df, photo_col)

    result['Cataract'] = _yes_no_feature(
        df, _find_column(df, 'катаракта', 'пациента')
    )

    # This header is matched case-sensitively and must itself contain 'Да'.
    gen_col = _find_column(df, 'Молекулярно', 'Да', lowercase=False)
    if gen_col is None:
        result['GeneticTestDone'] = np.nan
    else:
        # Anything other than an exact 'Да' (including missing) counts as 0.
        result['GeneticTestDone'] = (df[gen_col] == 'Да').astype(int)

    result['UsesDevice'] = _yes_no_feature(
        df, _find_column(df, 'пользуетесь ли средствами реабилитации')
    )
    result['Glaucoma'] = _yes_no_feature(
        df, _find_column(df, 'глаукома', 'пациента')
    )

    # Locate the free-text notes column once instead of once per row.
    notes_col = next(
        (
            col for col in df.columns
            if 'екомендации' in str(col)
            or 'заметки' in str(col)
            or 'рекомендации' in str(col).lower()
        ),
        None,
    )
    if notes_col is None:
        notes = [''] * len(df)
    else:
        notes = [str(value).lower() for value in df[notes_col]]

    label, keywords = _SYNDROME_RULES.get(disease_type, ('None', ()))
    result['SyndromeLabel'] = [
        label if any(keyword in text for keyword in keywords) else 'None'
        for text in notes
    ]

    result['DiseaseType'] = disease_type
    return result

proc_aniridia = preprocess_dataset(aniridia_df, disease_type='aniridia')
proc_albinism = preprocess_dataset(albinism_df, disease_type='albinism')

combined_df = pd.concat([proc_aniridia, proc_albinism], ignore_index=True)

# Impute missing predictors, each frame using its own statistics: Age gets the
# frame's median, the binary indicators get 0. 'Glaucoma' and 'UsesDevice' are
# not imputed here; rows missing them are masked out when the feature matrices
# are built.
#
# The filled columns are assigned back explicitly. Calling
# `frame[col].fillna(..., inplace=True)` operates on a column selection, which
# pandas 2.x deprecates and which does not modify `frame` under Copy-on-Write.
binary_features = ['Sex_Male', 'Nystagmus', 'Photophobia', 'Cataract', 'GeneticTestDone']
for frame in (combined_df, proc_aniridia, proc_albinism):
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    for feature in binary_features:
        frame[feature] = frame[feature].fillna(0)


# --- Task 1: glaucoma prediction (aniridia patients only) -------------------
glaucoma_features = ['Age','Sex_Male','Nystagmus','Cataract','GeneticTestDone']
# Keep only rows with a known glaucoma status. X and y are built from the same
# mask so they stay row-aligned (an earlier unmasked X / dropna()-ed y pair was
# dead code and has been removed).
mask = ~pd.isna(proc_aniridia['Glaucoma'])
X_glaucoma = proc_aniridia.loc[mask, glaucoma_features].values
y_glaucoma = proc_aniridia.loc[mask, 'Glaucoma'].astype(int).values

# --- Task 2: use of rehabilitation devices (albinism patients only) ---------
device_features = ['Age','Sex_Male','Nystagmus','Photophobia','GeneticTestDone']
mask2 = ~pd.isna(proc_albinism['UsesDevice'])
X_device = proc_albinism.loc[mask2, device_features].values
y_device = proc_albinism.loc[mask2, 'UsesDevice'].astype(int).values


# --- Task 3: syndromic classification (both cohorts) ------------------------
# Encode the cohort as a numeric feature: aniridia -> 0, albinism -> 1.
combined_df['DiseaseType'] = combined_df['DiseaseType'].map({'aniridia': 0, 'albinism': 1})
syndrome_features = ['Age','Sex_Male','Nystagmus','Photophobia','Cataract','GeneticTestDone','DiseaseType']
X_syndrome = combined_df[syndrome_features].values
# Target: 0=None, 1=WAGR, 2=HPS
y_syndrome = combined_df['SyndromeLabel'].map({'None':0, 'WAGR':1, 'HPS':2}).values


# Hyper-parameter search spaces, one per estimator; every combination of the
# listed values is evaluated during model selection.
logreg_params = dict(
    C=[0.1, 1, 10],
    class_weight=[None, 'balanced'],
    max_iter=[1000],
)
rf_params = dict(
    n_estimators=[100],
    max_depth=[None, 5, 10],
    min_samples_leaf=[1, 2, 5],
)
xgb_params = dict(
    n_estimators=[100],
    max_depth=[3, 6],
    learning_rate=[0.1],
    use_label_encoder=[False],
    eval_metric=['logloss'],
)

def _augment_with_ctgan(train_df, label, n_new):
    """Append ``n_new`` CTGAN-synthesised rows of class ``label`` to ``train_df``."""
    real_rows = train_df.loc[train_df['y'] == label].drop(columns='y')
    # Skip the (expensive) CTGAN fit when there is nothing to learn from or
    # nothing to generate.
    if n_new <= 0 or real_rows.empty:
        return train_df
    synthesizer = CTGAN(epochs=300, verbose=False)
    synthesizer.fit(real_rows)
    # The row count is passed positionally because its keyword name is not
    # `n_samples`.
    synthetic = synthesizer.sample(n_new)
    if synthetic.empty:
        return train_df
    synthetic['y'] = label
    return pd.concat([train_df, synthetic], ignore_index=True)


def _prepare_folds(X, y, model_type, splitter):
    """Materialise the CV folds as (X_train, y_train, X_val, y_val) tuples.

    Training parts are CTGAN-augmented according to ``model_type``; validation
    parts always contain real rows only.
    """
    # Generic column names: the synthesiser only needs string labels, so the
    # function does not depend on any module-level feature list.
    columns = [f"f{i}" for i in range(X.shape[1])]
    folds = []
    for train_idx, val_idx in splitter.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        if model_type != 'multi' and len(np.unique(y_val)) < 2:
            # ROC-AUC is undefined on a single-class validation fold; drop the
            # fold before spending time on augmentation and fitting.
            continue
        if model_type in ('multi', 'binary_imbalanced'):
            train_df = pd.DataFrame(X_train, columns=columns)
            train_df['y'] = y_train
            if model_type == 'multi':
                # Top up WAGR (1) and HPS (2) to 20 training rows each.
                for label in (1, 2):
                    n_real = int(np.sum(y_train == label))
                    train_df = _augment_with_ctgan(train_df, label, 20 - n_real)
            else:
                # Add twice as many synthetic positives as there are real ones.
                n_pos = int(np.sum(y_train == 1))
                train_df = _augment_with_ctgan(train_df, 1, 2 * n_pos)
            y_train = train_df['y'].values.astype(int)
            X_train = train_df.drop(columns='y').values
        folds.append((X_train, y_train, X_val, y_val))
    return folds


def _score_fold(model, fold, model_type):
    """Fit ``model`` on one fold and return macro-F1 ('multi') or ROC-AUC."""
    X_train, y_train, X_val, y_val = fold
    model.fit(X_train, y_train)
    if model_type == 'multi':
        return f1_score(y_val, model.predict(X_val), average='macro')
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_val)
    else:
        y_score = model.predict(X_val)  # fallback: hard labels
    return roc_auc_score(y_val, y_score)


def train_and_evaluate(X, y, model_type='binary'):
    """Grid-search LogReg, RandomForest and XGBoost with 5-fold stratified CV.

    Args:
        X: 2-D NumPy feature array.
        y: 1-D NumPy array of integer class labels.
        model_type: ``'binary'`` (ROC-AUC, no augmentation),
            ``'binary_imbalanced'`` (ROC-AUC, synthetic positives added to
            each training fold) or ``'multi'`` (macro-F1, classes 1 and 2
            topped up to 20 training rows per fold).

    Returns:
        ``(best_models, results)``: ``best_models`` maps model name to the
        estimator refitted on all of ``X, y`` with the best parameters;
        ``results`` maps model name to
        ``{'best_params': ..., 'cv_score': ...}``.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Augmentation does not depend on the estimator or its hyper-parameters,
    # so the folds are built once and reused for every candidate. This also
    # scores all candidates on identical data.
    folds = _prepare_folds(X, y, model_type, skf)

    best_models = {}
    results = {}
    models = [
        ('LogReg', LogisticRegression(), logreg_params),
        ('RandomForest', RandomForestClassifier(random_state=42), rf_params),
        ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), xgb_params)
    ]
    for name, model, param_grid in models:
        best_score = -np.inf
        best_params = None
        for params in ParameterGrid(param_grid):
            candidate = clone(model).set_params(**params)
            scores = [_score_fold(candidate, fold, model_type) for fold in folds]
            avg_score = np.mean(scores) if scores else -np.inf
            # `best_params is None` keeps the first candidate as a fallback
            # when no fold could be scored at all.
            if best_params is None or avg_score > best_score:
                best_score = avg_score
                best_params = params
        # clone() keeps constructor arguments that are not part of the grid
        # (e.g. random_state), which re-instantiating the class would drop.
        best_model = clone(model).set_params(**best_params)
        # NOTE(review): the final fit uses the real data only, without the
        # CTGAN augmentation applied inside the CV folds — confirm intended.
        best_model.fit(X, y)
        best_models[name] = best_model
        results[name] = {'best_params': best_params, 'cv_score': best_score}
        print(f"{name} best CV score: {best_score:.3f} with params {best_params}")
    return best_models, results

from sklearn.metrics import classification_report

# Hold out the test split BEFORE model selection. The selected models are
# refitted on whatever data train_and_evaluate receives, so passing it the
# full data set would make the "test" rows part of the training data.
Xg_train, Xg_test, yg_train, yg_test = train_test_split(X_glaucoma, y_glaucoma, test_size=0.2, stratify=y_glaucoma, random_state=1)
Xd_train, Xd_test, yd_train, yd_test = train_test_split(X_device, y_device, test_size=0.2, stratify=y_device, random_state=1)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X_syndrome, y_syndrome, test_size=0.2, stratify=y_syndrome, random_state=1)

print("Training models for Glaucoma Prediction...")
best_models_glaucoma, cv_results_glaucoma = train_and_evaluate(Xg_train, yg_train, model_type='binary')
print("\nTraining models for Device Need Prediction...")
best_models_device, cv_results_device = train_and_evaluate(Xd_train, yd_train, model_type='binary_imbalanced')
print("\nTraining models for Syndromic Classification...")
best_models_syndrome, cv_results_syndrome = train_and_evaluate(Xs_train, ys_train, model_type='multi')

# Held-out evaluation; the XGBoost model of each task is the one reported.
model_g = best_models_glaucoma['XGBoost']
y_proba_g = model_g.predict_proba(Xg_test)[:,1]
y_pred_g = model_g.predict(Xg_test)
print("Glaucoma Test ROC-AUC:", roc_auc_score(yg_test, y_proba_g))
print("Glaucoma Test PR-AUC:", average_precision_score(yg_test, y_proba_g))
print("Glaucoma Test Macro-F1:", f1_score(yg_test, y_pred_g, average='macro'))

model_d = best_models_device['XGBoost']
y_proba_d = model_d.predict_proba(Xd_test)[:,1]
y_pred_d = model_d.predict(Xd_test)
print("Device Test ROC-AUC:", roc_auc_score(yd_test, y_proba_d))
print("Device Test PR-AUC:", average_precision_score(yd_test, y_proba_d))
print("Device Test Macro-F1:", f1_score(yd_test, y_pred_d, average='macro'))

model_s = best_models_syndrome['XGBoost']
y_pred_s = model_s.predict(Xs_test)
print("Syndrome Test Macro-F1:", f1_score(ys_test, y_pred_s, average='macro'))
# Explicit `labels` keeps the three target names valid even when a rare class
# is absent from the test split; zero_division=0 reports 0 for such classes.
print(classification_report(ys_test, y_pred_s, labels=[0, 1, 2], target_names=['None','WAGR','HPS'], zero_division=0))

import matplotlib.pyplot as plt

# ROC curves of every selected glaucoma model on the held-out split.
plt.figure()
for name, model in best_models_glaucoma.items():
    # Pick the best available score type explicitly instead of relying on a
    # bare `except:`, which would also hide unrelated errors.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(Xg_test)[:,1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(Xg_test)
    else:
        y_score = model.predict(Xg_test)
    fpr, tpr, _ = roc_curve(yg_test, y_score)
    plt.plot(fpr, tpr, label=name)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Glaucoma Prediction")
plt.legend()
plt.show()

# Precision-recall curve of the XGBoost device-need model (uses y_proba_d
# computed during the test evaluation).
precision, recall, _ = precision_recall_curve(yd_test, y_proba_d)
plt.figure()
plt.plot(recall, precision, label="XGBoost PR curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve - Device Need Prediction")
plt.legend()
plt.show()
