# Training

In [None]:
# Helper function for feature filtering

def filter_features(df, model_name, remove_clinical_trained=False, remove_all_tools=False):
    with open("../resources/feature_lists/tools_excluded_due_to_unavailable_training_sets.txt", "r") as f:
        no_training_set = [line.strip() for line in f]
    df = df.drop(columns=no_training_set, errors="ignore")
    
    if model_name in ["FuncVEP_CTI", "ClinVEP_CTI"]:
        return df
    
    if model_name == "FuncVEP_CTE" or model_name == "ClinVEP_CTE" or remove_clinical_trained:
        with open("../resources/feature_lists/clinical_trained_tools.txt", "r") as f:
            clinically_trained_set = [line.strip() for line in f]
        df = df.drop(columns=clinically_trained_set, errors="ignore")
    
    if model_name == "FuncVEP_SP" or model_name == "ClinVEP_SP" or remove_all_tools:
        with open("../resources/feature_lists/all_tools.txt", "r") as f:
            tools = [line.strip() for line in f]
        df = df.drop(columns=tools, errors="ignore")

    return df

In [2]:
def train_funcvep(
    model_name,
    test_size=0.1,
    n_trials=50,
    random_state=42,
    remove_clinical_trained=False,
    remove_all_tools=False
):
    import pandas as pd
    import lightgbm as lgb
    import optuna
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score, accuracy_score
    import joblib
    import json
    import warnings
    warnings.simplefilter("ignore")
    import os
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    df = pd.read_csv("../data/final/functional_labels_model_input.txt", sep="\t")

    target_column = "functional_label"

    model_dir = f"../models/{model_name}"
    os.makedirs(model_dir, exist_ok=True)

    df = filter_features(df, model_name, remove_clinical_trained, remove_all_tools)
    df = df.dropna(subset=[target_column])

    weight_info = df[["ID", "weight"]].copy()
    weight_info["weight"] = weight_info["weight"].astype(float)
    non_default_weight_ids = list(weight_info[weight_info["weight"] != 1]["ID"])

    df["weight"] = pd.to_numeric(df["weight"], errors="coerce").fillna(1.0)

    df[target_column] = df[target_column].replace({"PS3": 1, "BS3": 0})

    id_column = "ID"
    feature_columns = df.columns.difference([id_column, target_column, "weight"])

    feature_columns_df = pd.DataFrame(feature_columns, columns=["Feature"])
    feature_columns_df.to_csv(os.path.join(model_dir, "training_features.txt"), sep="\t", index=False)

    df[feature_columns] = df[feature_columns].apply(pd.to_numeric, errors='coerce')

    df = df.dropna(subset=[target_column])

    X = df[feature_columns]
    y = df[target_column]
    ids = df[id_column]
    sample_weights = df["weight"]

    X_train, X_test, y_train, y_test, ids_train, ids_test, sample_weights_train, _ = train_test_split(
        X, y, ids, sample_weights, test_size=test_size, random_state=random_state
    )

    # Keep only test samples with default weight = 1
    test_mask = sample_weights.loc[ids_test.index] == 1.0
    X_test = X_test.loc[test_mask]
    y_test = y_test.loc[test_mask]
    ids_test = ids_test.loc[test_mask]

    training_ids_df = pd.DataFrame(ids_train, columns=["ID"])
    training_ids_df.to_csv(os.path.join(model_dir, "training_variants.txt"), sep="\t", index=False)

    # Define Optuna objective function for tuning
    def objective(trial):
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 31, 256),
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "lambda_l1": trial.suggest_float("lambda_l1", 0.1, 10, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 0.1, 10, log=True),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "objective": "binary",
            "random_state": random_state,
            "verbose": -1 
        }

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train, y_train,
            sample_weight=sample_weights_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc", 
            callbacks=[lgb.log_evaluation(0)] 
        )

        y_pred_proba = model.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, y_pred_proba)

    # Run Optuna optimization
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Train final model with best parameters
    lgb_model = lgb.LGBMClassifier(**study.best_params, objective="binary", random_state=random_state)
    lgb_model.fit(X_train, y_train, sample_weight=sample_weights_train)

    # Make predictions
    y_pred = lgb_model.predict(X_test)
    y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

    # Evaluate performance
    auc_score = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"AUC Score: {auc_score}")
    print(f"Accuracy: {accuracy * 100:.2f}%")

    joblib.dump(lgb_model, os.path.join(model_dir, "model.pkl"))

    test_results_df = X_test.copy()
    test_results_df[id_column] = ids_test
    test_results_df[target_column] = y_test
    test_results_df[model_name] = y_pred_proba

    test_results_df = test_results_df[["ID", target_column, model_name]]

    # Remove rows where weight is not 1 (this step ensures the proxy benign variants are not included in downstream benchmarking)
    test_results_df = test_results_df[~test_results_df["ID"].isin(non_default_weight_ids)]

    results_dir = f"../results/predictions/functional/"
    os.makedirs(results_dir, exist_ok=True)

    test_results_df.to_csv(f"{results_dir}{model_name}.txt", sep="\t", index=False)


In [3]:
def train_clinvep(
    model_name,
    test_size=0.1,
    n_trials=50,
    random_state=42,
    remove_clinical_trained=False,
    remove_all_tools=False
):
    import pandas as pd
    import lightgbm as lgb
    import optuna
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score, accuracy_score
    import joblib
    import json
    import warnings
    warnings.simplefilter("ignore")
    import os
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    df = pd.read_csv("../data/final/clinical_labels_model_input.txt", sep="\t")

    target_column = "clinical_label"

    model_dir = f"../models/{model_name}"
    os.makedirs(model_dir, exist_ok=True)

    df = filter_features(df, model_name)
    df = df.dropna(subset=[target_column])

    df[target_column] = df[target_column].replace({"P": 1, "B": 0})

    id_column = "ID"
    feature_columns = df.columns.difference([id_column, target_column])

    feature_columns_df = pd.DataFrame(feature_columns, columns=["Feature"])
    feature_columns_df.to_csv(os.path.join(model_dir, "training_features.txt"), sep="\t", index=False)

    df[feature_columns] = df[feature_columns].apply(pd.to_numeric, errors='coerce')

    df = df.dropna(subset=[target_column])

    X = df[feature_columns]
    y = df[target_column]
    ids = df[id_column]

    X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
        X, y, ids, test_size=test_size, random_state=random_state
    )

    training_ids_df = pd.DataFrame(ids_train, columns=["ID"])
    training_ids_df.to_csv(os.path.join(model_dir, "training_variants.txt"), sep="\t", index=False)

    # Define Optuna objective function for tuning
    def objective(trial):
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 31, 256),
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "lambda_l1": trial.suggest_float("lambda_l1", 0.1, 10, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 0.1, 10, log=True),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "objective": "binary",
            "random_state": random_state,
            "verbose": -1 
        }

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc", 
            callbacks=[lgb.log_evaluation(0)] 
        )

        y_pred_proba = model.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, y_pred_proba)

    # Run Optuna optimization
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Train final model with best parameters
    lgb_model = lgb.LGBMClassifier(**study.best_params, objective="binary", random_state=random_state)
    lgb_model.fit(X_train, y_train)

    # Make predictions
    y_pred = lgb_model.predict(X_test)
    y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

    # Evaluate performance
    auc_score = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"AUC Score: {auc_score}")
    print(f"Accuracy: {accuracy * 100:.2f}%")

    joblib.dump(lgb_model, os.path.join(model_dir, "model.pkl"))

    test_results_df = X_test.copy()
    test_results_df[id_column] = ids_test
    test_results_df[target_column] = y_test
    test_results_df[model_name] = y_pred_proba

    test_results_df = test_results_df[["ID", target_column, model_name]]

    results_dir = f"../results/predictions/clinical/"
    os.makedirs(results_dir, exist_ok=True)

    test_results_df.to_csv(f"{results_dir}{model_name}.txt", sep="\t", index=False)

### FuncVEP-CTI (clinical-trained tools included)

In [4]:
train_funcvep("FuncVEP_CTI")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-13 13:33:41,661] A new study created in memory with name: no-name-4e78d149-444e-4a91-81ed-0509b9d5e7ba
[I 2025-06-13 13:33:46,846] Trial 0 finished with value: 0.9714951026290293 and parameters: {'num_leaves': 97, 'n_estimators': 189, 'subsample': 0.9369295210606873, 'colsample_bytree': 0.7943108384509181, 'learning_rate': 0.14606222834454977, 'lambda_l1': 0.16181955382312915, 'lambda_l2': 0.46215435131100113, 'min_child_weight': 10}. Best is trial 0 with value: 0.9714951026290293.
[I 2025-06-13 13:33:52,141] Trial 1 finished with value: 0.9691705815780688 and parameters: {'num_leaves': 54, 'n_estimators': 212, 'subsample': 0.9398436892755107, 'colsample_bytree': 0.5325035312690969, 'learning_rate': 0.017943435669934442, 'lambda_l1': 4.2146583850549915, 'lambda_l2': 1.2768708225160423, 'min_child_weight': 8}. Best is trial 0 with value: 0.9714951026290293.
[I 2025-06-13 13:33:58,329] Trial 2 finished with value: 0.96597080303

AUC Score: 0.9736730689377223
Accuracy: 93.40%


### FuncVEP-CTE (clinical-trained tools excluded)

In [5]:
train_funcvep("FuncVEP_CTE")

[I 2025-06-13 13:40:52,468] A new study created in memory with name: no-name-2b0b456e-0047-459f-9615-4808d879884a
[I 2025-06-13 13:40:57,949] Trial 0 finished with value: 0.9496706589264051 and parameters: {'num_leaves': 253, 'n_estimators': 63, 'subsample': 0.6176745109823558, 'colsample_bytree': 0.8943193273457939, 'learning_rate': 0.21753521328338662, 'lambda_l1': 0.1550447604576234, 'lambda_l2': 0.7079812226463329, 'min_child_weight': 1}. Best is trial 0 with value: 0.9496706589264051.
[I 2025-06-13 13:40:59,845] Trial 1 finished with value: 0.9532042566010699 and parameters: {'num_leaves': 238, 'n_estimators': 57, 'subsample': 0.5696677792070465, 'colsample_bytree': 0.7502622777879551, 'learning_rate': 0.2443586501604376, 'lambda_l1': 7.112941667784829, 'lambda_l2': 0.7935086461586226, 'min_child_weight': 9}. Best is trial 1 with value: 0.9532042566010699.
[I 2025-06-13 13:41:02,628] Trial 2 finished with value: 0.9490478012717695 and parameters: {'num_leaves': 64, 'n_estimators':

AUC Score: 0.9580487050260135
Accuracy: 91.98%


### FuncVEP-SP (single predictor)

In [6]:
train_funcvep("FuncVEP_SP")

[I 2025-06-13 13:49:35,790] A new study created in memory with name: no-name-0165b23f-c1b0-4e12-b81b-40c8e5be7cce
[I 2025-06-13 13:49:42,713] Trial 0 finished with value: 0.9420375994333217 and parameters: {'num_leaves': 126, 'n_estimators': 246, 'subsample': 0.984903631737561, 'colsample_bytree': 0.8068903180618882, 'learning_rate': 0.28049222199416657, 'lambda_l1': 0.9658530121842493, 'lambda_l2': 0.3318048235331907, 'min_child_weight': 3}. Best is trial 0 with value: 0.9420375994333217.
[I 2025-06-13 13:49:47,462] Trial 1 finished with value: 0.9320108124832073 and parameters: {'num_leaves': 91, 'n_estimators': 100, 'subsample': 0.9177373050299935, 'colsample_bytree': 0.7915530766342582, 'learning_rate': 0.015379531706146738, 'lambda_l1': 0.3158634588303593, 'lambda_l2': 6.142356161601135, 'min_child_weight': 5}. Best is trial 0 with value: 0.9420375994333217.
[I 2025-06-13 13:49:52,247] Trial 2 finished with value: 0.9489745239006359 and parameters: {'num_leaves': 178, 'n_estimator

AUC Score: 0.9504685680787488
Accuracy: 91.06%


### ClinVEP_CTI (all tools included)

In [7]:
train_clinvep("ClinVEP_CTI")

[I 2025-06-13 14:03:18,092] A new study created in memory with name: no-name-3e51a34f-5448-43e6-b268-bb55e6c44a63
[I 2025-06-13 14:03:21,945] Trial 0 finished with value: 0.9994319423202971 and parameters: {'num_leaves': 88, 'n_estimators': 285, 'subsample': 0.754360862891042, 'colsample_bytree': 0.7098350393659985, 'learning_rate': 0.1821963528101254, 'lambda_l1': 0.29317219210750606, 'lambda_l2': 0.2036759833220673, 'min_child_weight': 1}. Best is trial 0 with value: 0.9994319423202971.
[I 2025-06-13 14:03:24,219] Trial 1 finished with value: 0.998477896730027 and parameters: {'num_leaves': 188, 'n_estimators': 138, 'subsample': 0.8150606037127408, 'colsample_bytree': 0.9764291899072295, 'learning_rate': 0.010531527985510693, 'lambda_l1': 5.303730058752582, 'lambda_l2': 0.3590759492586038, 'min_child_weight': 1}. Best is trial 0 with value: 0.9994319423202971.
[I 2025-06-13 14:03:25,620] Trial 2 finished with value: 0.9993125045517443 and parameters: {'num_leaves': 199, 'n_estimators

AUC Score: 0.9994785521811957
Accuracy: 98.89%


### ClinVEP_CTE (clinical-trained tools excluded)

In [8]:
train_clinvep("ClinVEP_CTE")

[I 2025-06-13 14:08:08,951] A new study created in memory with name: no-name-ecebed50-cb6f-4046-9243-24f16beeef9b
[I 2025-06-13 14:08:14,613] Trial 0 finished with value: 0.9848255771611683 and parameters: {'num_leaves': 33, 'n_estimators': 289, 'subsample': 0.5195482329723622, 'colsample_bytree': 0.6273603869252373, 'learning_rate': 0.12232577739416184, 'lambda_l1': 0.21448907210566967, 'lambda_l2': 0.23083685166841825, 'min_child_weight': 4}. Best is trial 0 with value: 0.9848255771611683.
[I 2025-06-13 14:08:18,955] Trial 1 finished with value: 0.9842487801325467 and parameters: {'num_leaves': 228, 'n_estimators': 249, 'subsample': 0.5735526593760883, 'colsample_bytree': 0.6375671578048947, 'learning_rate': 0.09061321914909153, 'lambda_l1': 4.776189668485952, 'lambda_l2': 0.111713675303899, 'min_child_weight': 9}. Best is trial 0 with value: 0.9848255771611683.
[I 2025-06-13 14:08:23,318] Trial 2 finished with value: 0.984251693248853 and parameters: {'num_leaves': 181, 'n_estimator

AUC Score: 0.9863403976403757
Accuracy: 94.71%


### ClinVEP_SP (single predictor)

In [9]:
train_clinvep("ClinVEP_SP")

[I 2025-06-13 14:12:45,309] A new study created in memory with name: no-name-b7e3fe6d-b411-4744-af8d-8ffb13486e37
[I 2025-06-13 14:12:50,418] Trial 0 finished with value: 0.9601718738620639 and parameters: {'num_leaves': 88, 'n_estimators': 236, 'subsample': 0.64641665768032, 'colsample_bytree': 0.666335716351102, 'learning_rate': 0.011021721375631791, 'lambda_l1': 3.2174849644241954, 'lambda_l2': 6.42329341653818, 'min_child_weight': 9}. Best is trial 0 with value: 0.9601718738620639.
[I 2025-06-13 14:12:54,054] Trial 1 finished with value: 0.9602388755371057 and parameters: {'num_leaves': 242, 'n_estimators': 87, 'subsample': 0.6618343377722153, 'colsample_bytree': 0.611110233643366, 'learning_rate': 0.010925155690522246, 'lambda_l1': 0.1927913632049286, 'lambda_l2': 1.7883664221866342, 'min_child_weight': 5}. Best is trial 1 with value: 0.9602388755371057.
[I 2025-06-13 14:12:56,711] Trial 2 finished with value: 0.9623829291384459 and parameters: {'num_leaves': 64, 'n_estimators': 9

AUC Score: 0.9683460782171729
Accuracy: 90.87%
