In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
def fit_ctgan_synthesizer(data, epochs=300, verbose=False):
    """Fit an SDV CTGAN synthesizer on ``data`` and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to learn from. Column metadata is auto-detected with
        ``SingleTableMetadata.detect_from_dataframe``.
    epochs : int, default 300
        Forwarded unchanged to ``CTGANSynthesizer``.
    verbose : bool, default False
        Forwarded unchanged to ``CTGANSynthesizer``.

    Returns
    -------
    CTGANSynthesizer
        The synthesizer after ``fit(data)`` has been called on it.
    """
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(data)

    ctgan = CTGANSynthesizer(
        metadata=table_metadata,
        epochs=epochs,
        verbose=verbose,
    )
    ctgan.fit(data)
    return ctgan

In [None]:
# Load the two raw Excel exports (one for aniridia, one for albinism).
# The paths are relative, so both files must sit in the notebook's working directory.
aniridia_df = pd.read_excel("2025-04-11 Aniridiia oftal'molog.xlsx")
albinism_df = pd.read_excel("2025-04-11 Al'binizm oftal'molog.xlsx")

In [None]:
def parse_age_to_years(age_str):
    """Convert a free-text age with Russian units into a float number of years.

    Matching is case-insensitive and ',' is accepted as a decimal separator.
    Recognised forms:

    * years and months together, e.g. ``"1 год 6 мес"`` -> ``1.5``
    * years only (``год`` / ``года`` / ``лет``), e.g. ``"5 лет"`` -> ``5.0``;
      a second bare number is read as months (``"3 года 4"`` -> ``3 + 4/12``)
    * months only (``мес`` / ``mes``), e.g. ``"6 мес"`` -> ``0.5``
    * a plain number, e.g. ``"7"`` or ``"2,5"``

    Parameters
    ----------
    age_str : object
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        Age in years, or ``np.nan`` for missing values and for text that
        contains no parsable number.
    """
    if pd.isna(age_str):
        return np.nan

    s = str(age_str).lower().replace(',', '.').strip()
    has_years = 'год' in s or 'лет' in s
    has_months = 'mes' in s or 'мес' in s

    if has_years or has_months:
        # Pull out whole numbers (with optional decimals) instead of gluing
        # individual digits together, so "1 год 6 мес" is not read as "16".
        nums = [float(tok) for tok in re.findall(r'\d+(?:\.\d+)?', s)]
        if not nums:
            return np.nan

        if has_years and has_months:
            # Both units present: bind each number to the unit that follows it
            # so the result does not depend on the order of the two parts.
            year_hit = re.search(r'(\d+(?:\.\d+)?)\s*(?:год|лет)', s)
            month_hit = re.search(r'(\d+(?:\.\d+)?)\s*(?:мес|mes)', s)
            if year_hit and month_hit:
                return float(year_hit.group(1)) + float(month_hit.group(1)) / 12.0

        if has_years:
            # Positional reading: first number = years, optional second = months.
            months = nums[1] if len(nums) > 1 else 0.0
            return nums[0] + months / 12.0

        # Months only.
        return nums[0] / 12.0

    try:
        return float(s)
    except ValueError:
        return np.nan


In [None]:
# Encoding shared by every yes/no questionnaire column.
_YES_NO_CODES = {'Да': 1, 'Нет': 0}

# disease_type -> (syndrome label, lowercase keywords searched in the notes text).
_SYNDROME_KEYWORDS = {
    'aniridia': ('WAGR', ('wagr', 'вагр')),
    'albinism': ('HPS', ('hps', 'hermansky', 'гепат', 'пудлак')),
}


def _first_matching_column(df, predicate):
    """Return the first column label whose ``str()`` form satisfies ``predicate``, else None."""
    for col in df.columns:
        if predicate(str(col)):
            return col
    return None


def _find_age_column(df):
    """Locate the age column: exact known names first, then any name containing 'возраст'."""
    for exact_name in ('Возраст пациента', 'Возраст'):
        if exact_name in df.columns:
            return exact_name
    return _first_matching_column(df, lambda name: 'возраст' in name.lower())


def _yes_no_column(df, col, default=np.nan):
    """Encode ``df[col]`` as 1/0 via ``_YES_NO_CODES``; return ``default`` when ``col`` is None."""
    if col is None:
        return default
    return df[col].map(_YES_NO_CODES)


def _is_notes_column(name):
    """True for column names that look like a recommendations / notes free-text field."""
    return ('екомендации' in name) or ('заметки' in name) or ('рекомендации' in name.lower())


def preprocess_dataset(df, disease_type):
    """Normalise one raw questionnaire table into the shared feature schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with Russian column headers. Columns are located by name
        fragments, so a missing column yields a NaN feature rather than an error.
    disease_type : str
        ``'aniridia'`` or ``'albinism'``. It selects the syndrome keywords and is
        copied verbatim into the ``DiseaseType`` column.

    Returns
    -------
    pandas.DataFrame
        A frame sharing ``df``'s index, with columns in this order: ``Age``,
        ``Sex_Male``, ``Nystagmus``, ``Photophobia``, ``Cataract``,
        ``GeneticTestDone``, ``UsesDevice``, ``Glaucoma``, ``SyndromeLabel``,
        ``DiseaseType``.

        * Yes/no columns are encoded 'Да' -> 1, 'Нет' -> 0; any other value -> NaN.
        * ``GeneticTestDone`` is 1 only for 'Да' and 0 for everything else,
          including missing values.
        * ``Photophobia`` defaults to 0 for albinism when the column is absent.
        * ``SyndromeLabel`` is 'WAGR' (aniridia) or 'HPS' (albinism) when the
          notes text contains one of the keywords, otherwise the string 'None'.
    """
    # Start from df's index so scalar defaults (NaN / 0) broadcast to every row
    # even when they are assigned before the first real column.
    result = pd.DataFrame(index=df.index)

    # Age
    age_col = _find_age_column(df)
    result['Age'] = df[age_col].apply(parse_age_to_years) if age_col is not None else np.nan

    # Sex
    result['Sex_Male'] = df['Пол'].map({'М': 1, 'Ж': 0}) if 'Пол' in df.columns else np.nan

    # Nystagmus
    nyst_col = _first_matching_column(df, lambda name: 'нистагм' in name.lower())
    result['Nystagmus'] = _yes_no_column(df, nyst_col)

    # Photophobia (assumed absent, i.e. 0, for albinism tables lacking the column)
    photo_col = _first_matching_column(df, lambda name: 'светобоязнь' in name.lower())
    result['Photophobia'] = _yes_no_column(
        df, photo_col, default=0 if disease_type == 'albinism' else np.nan
    )

    # Cataract (only the column that refers to the patient)
    cat_col = _first_matching_column(
        df, lambda name: 'катаракта' in name.lower() and 'пациента' in name.lower()
    )
    result['Cataract'] = _yes_no_column(df, cat_col)

    # GeneticTestDone: the header match is case-sensitive and must contain 'Да'.
    gen_col = _first_matching_column(df, lambda name: 'молекулярно' in name and 'Да' in name)
    if gen_col is not None:
        # Anything other than an explicit 'Да' (including NaN) counts as "not done".
        result['GeneticTestDone'] = df[gen_col].eq('Да').astype(int)
    else:
        result['GeneticTestDone'] = np.nan

    # UsesDevice
    rehab_col = _first_matching_column(df, lambda name: 'реабилитац' in name.lower())
    result['UsesDevice'] = _yes_no_column(df, rehab_col)

    # Glaucoma (only the column that refers to the patient)
    gl_col = _first_matching_column(
        df, lambda name: 'глаукома' in name.lower() and 'пациента' in name.lower()
    )
    result['Glaucoma'] = _yes_no_column(df, gl_col)

    # SyndromeLabel: keyword search in the first notes/recommendations column.
    # The column is resolved once here rather than once per row.
    notes_col = _first_matching_column(df, _is_notes_column)
    rule = _SYNDROME_KEYWORDS.get(disease_type)
    if notes_col is None or rule is None:
        result['SyndromeLabel'] = 'None'
    else:
        label, keywords = rule
        notes = df[notes_col].map(lambda value: str(value).lower())
        result['SyndromeLabel'] = [
            label if any(word in text for word in keywords) else 'None'
            for text in notes
        ]

    result['DiseaseType'] = disease_type
    return result

In [None]:
# Normalise both questionnaires into the shared schema and stack them.
proc_aniridia = preprocess_dataset(aniridia_df, 'aniridia')
proc_albinism = preprocess_dataset(albinism_df, 'albinism')
combined_df = pd.concat([proc_aniridia, proc_albinism], ignore_index=True)

# Impute missing predictors: Age with each frame's own median, binary flags with 0.
# Results are assigned back explicitly. `frame[col].fillna(..., inplace=True)` is
# chained assignment: it is deprecated in pandas 2.2 and, under Copy-on-Write,
# updates a temporary and leaves the frame unchanged.
# 'UsesDevice' and 'Glaucoma' are deliberately left un-imputed: they are the
# targets, and rows where they are missing are filtered out by the masks below.
for frame in (combined_df, proc_aniridia, proc_albinism):
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    for feature in ['Sex_Male', 'Nystagmus', 'Photophobia', 'Cataract', 'GeneticTestDone']:
        frame[feature] = frame[feature].fillna(0)
combined_df['DiseaseType'] = combined_df['DiseaseType'].map({'aniridia':0, 'albinism':1})

# Predictor sets for the three tasks.
glaucoma_features = ['Age', 'Sex_Male', 'Nystagmus', 'Cataract', 'GeneticTestDone']
device_features   = ['Age', 'Sex_Male', 'Nystagmus', 'Photophobia', 'GeneticTestDone']
syndrome_features = ['Age', 'Sex_Male', 'Nystagmus', 'Photophobia', 'Cataract', 'GeneticTestDone', 'DiseaseType']

# Glaucoma task: aniridia rows with a known glaucoma answer.
gl_mask = proc_aniridia['Glaucoma'].notna()
X_glaucoma = proc_aniridia.loc[gl_mask, glaucoma_features].values
y_glaucoma = proc_aniridia.loc[gl_mask, 'Glaucoma'].astype(int).values

# Device task: albinism rows with a known device/rehabilitation answer.
dev_mask = proc_albinism['UsesDevice'].notna()
X_device = proc_albinism.loc[dev_mask, device_features].values
y_device = proc_albinism.loc[dev_mask, 'UsesDevice'].astype(int).values

# Syndrome task: all rows; classes are None=0, WAGR=1, HPS=2.
X_syndrome = combined_df[syndrome_features].values
y_syndrome = combined_df['SyndromeLabel'].map({'None':0, 'WAGR':1, 'HPS':2}).values

In [None]:
# Glaucoma training table: feature columns plus the target in column 'y'.
train_gl = pd.DataFrame(X_glaucoma, columns=glaucoma_features).assign(y=y_glaucoma)

# Oversample the positive class: fit CTGAN on the positive rows only and append
# twice as many synthetic positives as there are real ones.
if (y_glaucoma == 1).any():
    pos_data = train_gl.loc[train_gl['y'] == 1].drop(columns='y')
    if len(pos_data) > 0:
        ctgan_pos = fit_ctgan_synthesizer(pos_data)
        syn_pos = ctgan_pos.sample(len(pos_data) * 2)
        if not syn_pos.empty:
            syn_pos['y'] = 1
            train_gl = pd.concat((train_gl, syn_pos), ignore_index=True)

# Split the (possibly augmented) table back into arrays.
X_gl_aug = train_gl.drop(columns='y').to_numpy()
y_gl_aug = train_gl['y'].astype(int).to_numpy()

In [None]:
# Device-usage training table: feature columns plus the target in column 'y'.
train_de = pd.DataFrame(X_device, columns=device_features).assign(y=y_device)

# Oversample the positive class: fit CTGAN on the positive rows only and append
# twice as many synthetic positives as there are real ones.
if (y_device == 1).any():
    pos_data = train_de.loc[train_de['y'] == 1].drop(columns='y')
    if len(pos_data) > 0:
        ctgan_pos = fit_ctgan_synthesizer(pos_data)
        syn_pos = ctgan_pos.sample(len(pos_data) * 2)
        if not syn_pos.empty:
            syn_pos['y'] = 1
            train_de = pd.concat((train_de, syn_pos), ignore_index=True)

# Split the (possibly augmented) table back into arrays.
X_de_aug = train_de.drop(columns='y').to_numpy()
y_de_aug = train_de['y'].astype(int).to_numpy()

In [None]:
# Syndrome training table: feature columns plus the class code in column 'y'
# (0 = None, 1 = WAGR, 2 = HPS).
train_sy = pd.DataFrame(X_syndrome, columns=syndrome_features).assign(y=y_syndrome)

# WAGR (class 1): with at least two real cases, top the class up to 20 rows
# using CTGAN samples fitted on those cases.
if train_sy['y'].eq(1).any():
    wagr_data = train_sy.loc[train_sy['y'] == 1].drop(columns='y')
    if len(wagr_data) >= 2:
        ctgan_wagr = fit_ctgan_synthesizer(wagr_data)
        syn_wagr = ctgan_wagr.sample(max(0, 20 - len(wagr_data)))
        if not syn_wagr.empty:
            syn_wagr['y'] = 1
            train_sy = pd.concat((train_sy, syn_wagr), ignore_index=True)

# HPS (class 2): same procedure.
if train_sy['y'].eq(2).any():
    hps_data = train_sy.loc[train_sy['y'] == 2].drop(columns='y')
    if len(hps_data) >= 2:
        ctgan_hps = fit_ctgan_synthesizer(hps_data)
        syn_hps = ctgan_hps.sample(max(0, 20 - len(hps_data)))
        if not syn_hps.empty:
            syn_hps['y'] = 2
            train_sy = pd.concat((train_sy, syn_hps), ignore_index=True)

# Split the (possibly augmented) table back into arrays.
X_sy_aug = train_sy.drop(columns='y').to_numpy()
y_sy_aug = train_sy['y'].astype(int).to_numpy()

In [None]:
def _run_rfe(X, y, feature_names, n_keep):
    """Fit RFE around a liblinear logistic regression; return (selector, kept feature names)."""
    selector = RFE(LogisticRegression(solver='liblinear'), n_features_to_select=n_keep)
    selector.fit(X, y)
    kept = [name for name, keep in zip(feature_names, selector.support_) if keep]
    return selector, kept


# Recursive feature elimination on the CTGAN-augmented training sets.
print("RFE Feature Selection:")
rfe_gl, selected_gl = _run_rfe(X_gl_aug, y_gl_aug, glaucoma_features, 4)
print(" Glaucoma selected features:", selected_gl)
rfe_de, selected_de = _run_rfe(X_de_aug, y_de_aug, device_features, 4)
print(" Device selected features:", selected_de)
rfe_sy, selected_sy = _run_rfe(X_sy_aug, y_sy_aug, syndrome_features, 6)
print(" Syndrome selected features:", selected_sy)

In [None]:
# Fixed seed: mutual_info_classif adds small random noise to continuous features,
# so without a seed the scores change from run to run.
MI_RANDOM_STATE = 42


def _mutual_info(X, y, feature_names):
    """Mutual information between each feature and ``y``.

    Every feature except 'Age' is a 0/1 code, so it is flagged as discrete.
    With ``discrete_features='auto'`` a dense matrix is treated as all-continuous.
    """
    discrete_mask = [name != 'Age' for name in feature_names]
    return mutual_info_classif(
        X, y, discrete_features=discrete_mask, random_state=MI_RANDOM_STATE
    )


def _print_ranked(title, feature_names, scores):
    """Print ``title`` followed by one 'feature: score' line per feature, highest first."""
    print(title)
    for feat, score in sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True):
        print(f"  {feat}: {score:.3f}")


# Scores are computed on the original (non-augmented) data.
mi_gl = _mutual_info(X_glaucoma, y_glaucoma, glaucoma_features)
mi_de = _mutual_info(X_device, y_device, device_features)
mi_sy = _mutual_info(X_syndrome, y_syndrome, syndrome_features)

print("\nMutual Information Scores:")
_print_ranked(" Glaucoma MI:", glaucoma_features, mi_gl)
_print_ranked(" Device MI:", device_features, mi_de)
_print_ranked(" Syndrome MI:", syndrome_features, mi_sy)

In [None]:
def _fit_and_report_importances(model, X, y, feature_names, label):
    """Fit ``model`` on (X, y) and print its feature importances, largest first."""
    model.fit(X, y)
    ranked = sorted(
        zip(feature_names, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for feat, imp in ranked:
        print(f"  {label} - {feat}: {imp:.3f}")


# XGBoost importances, fitted on the original (non-augmented) data for each task.
print("\nFeature Importances (XGBoost):")

model_gl = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
_fit_and_report_importances(model_gl, X_glaucoma, y_glaucoma, glaucoma_features, "Glaucoma")

model_de = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
_fit_and_report_importances(model_de, X_device, y_device, device_features, "Device")

# Three-class problem (None / WAGR / HPS), hence the multi-class objective.
model_sy = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=3)
_fit_and_report_importances(model_sy, X_syndrome, y_syndrome, syndrome_features, "Syndrome")

In [None]:
# Project the (non-augmented) syndrome feature matrix onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_syndrome)
print("\nPCA explained variance ratios:", pca.explained_variance_ratio_)

# NOTE(review): X_syndrome is passed to PCA without scaling, and 'Age' is the only
# non-binary column — consider standardising before interpreting the components.
fig, ax = plt.subplots(figsize=(6, 5))
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y_syndrome, cmap='viridis', alpha=0.7)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA of Syndrome Dataset (colored by class)")
fig.colorbar(points, ax=ax, label='Class')
plt.show()