## Run inference on new variants

In [1]:
import os
import joblib
import pandas as pd

def impute_missing_values(df):
    imputer_dir = "../models/imputation"

    with open("../resources/feature_lists/columns_to_impute.txt", "r") as f:
        columns_to_impute = [line.strip() for line in f.readlines()]

    df.columns = df.columns.str.replace(" ", "_")

    id_column = df["ID"].copy()

    df = df.apply(lambda col: pd.to_numeric(col, errors='coerce') if col.name != "ID" else col)

    df["ID"] = id_column

    for col in columns_to_impute:
        model_path = os.path.join(imputer_dir, f"{col}_imputer.pkl")

        if col not in df.columns:
            print(f"Skipping {col}: not found in dataframe.")
            continue

        if not os.path.exists(model_path):
            print(f"Skipping {col}: imputation model not found.")
            continue

        missing_mask = df[col].isna()
        if missing_mask.sum() == 0:
            continue

        imputer = joblib.load(model_path)
        predictors = imputer.feature_name_

        # Subset the DataFrame to rows with missing target values and the required predictors
        available_predictors = [p for p in predictors if p in df.columns]
        X_pred = df.loc[missing_mask, available_predictors].copy()

        # Reorder columns to match the model's expected input
        X_pred = X_pred.reindex(columns=predictors)

        if not X_pred.empty:
            df.loc[missing_mask, col] = imputer.predict(X_pred)
            print(f"Imputed {missing_mask.sum()} missing values in {col}.")

    return df


In [2]:
def run_inference(model_name, df, dataset):
    model_dir = f"../models/{model_name}"
    output_path = f"../results/predictions/inference/{dataset}/{model_name}.txt"

    trained_on = pd.read_csv(os.path.join(model_dir, "training_variants.txt"), sep="\t")
    df = df[~df["ID"].isin(trained_on["ID"])].copy()

    lgb_model = joblib.load(os.path.join(model_dir, "model.pkl"))
    trained_features = lgb_model.feature_name_

    df[trained_features] = df[trained_features].apply(pd.to_numeric, errors='coerce')

    X = df[trained_features]
    df[model_name] = lgb_model.predict_proba(X)[:, 1]

    result_df = df[["ID", model_name]]
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    result_df.to_csv(output_path, sep="\t", index=False)

    return result_df


In [4]:
dataset = "IEI_variants"
df_input = pd.read_csv("../data/inference/IEI_variants_model_input.txt", sep="\t").drop_duplicates(subset=["ID"])

df_imputed = impute_missing_values(df_input.copy())

funcvep_cti = run_inference("FuncVEP_CTI", df_imputed, dataset)
funcvep_cte = run_inference("FuncVEP_CTE", df_imputed, dataset)
funcvep_sp = run_inference("FuncVEP_SP", df_imputed, dataset)
clinvep = run_inference("ClinVEP", df_imputed, dataset)

models_combined = funcvep_cti.merge(funcvep_cte, on="ID", how="outer").merge(funcvep_sp, on="ID", how="outer").merge(clinvep, on="ID", how="outer")

models_combined.to_csv(f"../results/predictions/inference/{dataset}/models_combined.txt", sep="\t", index=False)

  df_input = pd.read_csv("../data/inference/IEI_variants_model_input.txt", sep="\t").drop_duplicates(subset=["ID"])


Imputed 18765 missing values in MetaLR_score.
Imputed 20015 missing values in MutScore_score.
Imputed 16688 missing values in fathmm_XF_coding_score.
Imputed 21818 missing values in MutFormer_score.
Imputed 18765 missing values in MetaSVM_score.
Imputed 41505 missing values in VEST4_score.
Imputed 159607 missing values in EVH_epistatic.
Imputed 159607 missing values in EVH_independent.
Imputed 30136 missing values in M_CAP_score.
Imputed 45483 missing values in DEOGEN2_score.
Imputed 139401 missing values in glm_CaddDeogenRevel.
Imputed 63247 missing values in glm_AlphDeogenRevel.
Imputed 63132 missing values in glm_AlphCaddDeogen.
Imputed 62812 missing values in glm_AlphRevelCadd.
Imputed 62499 missing values in glm_AlphRevel.
Imputed 43838 missing values in glm_DeogenRevel.
Imputed 42308 missing values in glm_RevelCadd.
Imputed 64109 missing values in MutPred_score.
Imputed 63360 missing values in REVEL_score.
Imputed 45779 missing values in MetaRNN_score.
Imputed 52184 missing value

  df[model_name] = lgb_model.predict_proba(X)[:, 1]
  df[model_name] = lgb_model.predict_proba(X)[:, 1]
  df[model_name] = lgb_model.predict_proba(X)[:, 1]
  df[model_name] = lgb_model.predict_proba(X)[:, 1]
