### Merge gene-level features

In [None]:
import pandas as pd

# Load the core per-variant feature table (tab-separated) and cast the two
# columns used as join keys by the gene-level merges ('ensg', 'gene') to str.
variant_features = pd.read_csv(
    '../data/intermediate/variant_features_core.txt',
    sep='\t',
    low_memory=False,
).astype({'ensg': str, 'gene': str})

# Helper function to merge gene-level features
def merge_gene_metric(df_main, filepath, key, suffix='_2'):
    """Left-join one gene-level feature table onto ``df_main``.

    The table at ``filepath`` is read as tab-separated text. Rows with no
    value in ``key`` are discarded, the key is cast to ``str``, and only the
    first row per key is kept, so the join never adds rows to ``df_main``.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Table to annotate; must contain the column ``key``.
    filepath : str
        Path to the tab-separated gene-level table.
    key : str
        Name of the join column, present in both tables.
    suffix : str, default '_2'
        Appended to columns of the gene-level table whose names collide
        with columns already present in ``df_main``.

    Returns
    -------
    pandas.DataFrame
        ``df_main`` with the gene-level columns appended; rows whose key has
        no match get NaN in those columns.
    """
    df = pd.read_csv(filepath, sep='\t', low_memory=False)
    df = (
        # A missing key would become the literal string 'nan' under
        # astype(str) and then match every df_main row whose key is 'nan',
        # attaching one arbitrary gene's metrics to variants with no gene.
        df.dropna(subset=[key])
        .astype({key: str})
        .drop_duplicates(subset=[key], keep='first')
    )
    return pd.merge(df_main, df, how='left', on=key, suffixes=('', suffix))

# Gene-level resource tables as (file name, join column) pairs.
# They are merged onto variant_features one after another, in list order.
gene_metrics = [
    ('gnomad_constraint_metrics.txt', 'ensg'),
    ('phi.txt', 'gene'),
    ('rvis.txt', 'gene'),
    ('s_het.txt', 'gene'),
    ('geckov2_gene_knockout_effects.txt', 'gene'),
    ('wang2017_gene_knockout_effects.txt', 'gene'),
    ('avana_gene_knockout_effects.txt', 'gene'),
    ('gtex.txt', 'ensg'),
]

for filename, key in gene_metrics:
    # Each pass replaces variant_features with its annotated version.
    variant_features = merge_gene_metric(
        variant_features,
        f'../resources/gene_level_features/{filename}',
        key,
    )


### Merge protein language model-derived features

In [None]:
# Per-variant ESM2 wild-type vs. mutant comparison metrics, joined on "ID".
# De-duplicate on the join key rather than on whole rows: two rows for the
# same ID that differ in any metric value would both survive a whole-row
# drop_duplicates() and each would duplicate that variant in the left merge.
# Keeping the first row per ID matches what merge_gene_metric does.
esm2_metrics = (
    pd.read_csv("../data/intermediate/esm2_wt_mut_comparison_metrics.txt", sep="\t")
    .drop_duplicates(subset="ID", keep="first")
)

# Left join: variants without ESM2 metrics are kept and get NaN in those columns.
variant_features = variant_features.merge(esm2_metrics, on="ID", how="left")

In [16]:
# Display the merged feature table as this cell's output.
variant_features

Unnamed: 0,ID,ensg,enst,spliceai,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,MutationTaster_score,MutationAssessor_score,PROVEAN_score,...,SS_G,SS_I,SS_T,SS_S,SS_-,FoldX_dG,esm2_mean_log_likelihood_difference,esm2_perplexity_difference,esm2_cosine_similarity,esm2_avg_abs_difference
0,10-100057090-C-T,ENSG00000120054,ENST00000370418,,0.18,0.002,0.003,0.24,0.82,-1.72,...,0.0,0.0,0.0,0.0,0.0,135.7490,-0.001133,0.001560,0.999975,0.001065
1,10-100069757-C-T,ENSG00000120054,ENST00000370418,,0.171,0.863,.,0.24,2.275,-4.19,...,1.0,0.0,0.0,0.0,0.0,135.7490,0.000762,-0.001011,0.999961,0.001273
2,10-100076062-G-A,ENSG00000120054,ENST00000370418,,1.0,0.005,0.002,0.1,-1.62,1.49,...,0.0,0.0,0.0,0.0,0.0,135.7490,0.006985,-0.008888,0.999914,0.001779
3,10-100081405-G-A,ENSG00000120054,ENST00000370418,,0.662,0.001,0.008,0.05,0.33,-1.43,...,0.0,0.0,0.0,0.0,0.0,135.7490,-0.009512,0.012117,0.999850,0.002473
4,10-100152307-T-C,ENSG00000107566,ENST00000421367,,0.138,.,.,0.02,.,-0.79,...,0.0,1.0,0.0,0.0,0.0,193.6820,-0.002086,0.002513,0.999942,0.001543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423445,X-154419608-C-T,ENSG00000102125,ENST00000601016,,.,.,.,.,3.725,.,...,0.0,0.0,0.0,0.0,0.0,87.6349,-0.019937,0.027467,0.999907,0.002163
423446,X-154546061-T-C,ENSG00000160211,ENST00000393562,,.,.,.,0.67,2.125,.,...,,,,,,,,,,
423447,X-2936766-C-T,ENSG00000157399,ENST00000381134,,0.098,0.963,0.715,0.93,.,-2.96,...,0.0,0.0,0.0,0.0,0.0,143.7240,-0.005589,0.007478,0.999922,0.001887
423448,X-2938158-G-A,ENSG00000157399,ENST00000381134,,0.001,1.0,1.0,0.75,.,-5.63,...,0.0,0.0,1.0,0.0,0.0,143.7240,-0.012496,0.016540,0.999956,0.001315


### Imputation

Train one imputation model (a LightGBM regressor) per feature with informative missingness, save each model, and fill that feature's missing values with the model's predictions.

In [18]:
import os
import joblib
import pandas as pd
import lightgbm as lgb

# Names of the columns to impute, one per line.
with open("../resources/feature_lists/columns_to_impute.txt", "r") as list_file:
    columns_to_impute = [line.strip() for line in list_file]

imputed_file = "../data/intermediate/variant_features_extended_imputed.txt"
model_dir = "../models/imputation"
os.makedirs(model_dir, exist_ok=True)

# Identifier columns: never used as predictors and never cast to numeric.
non_predictors = {"ID", "gene", "ensg", "enst"}

# Work on a copy so variant_features itself is left untouched by this cell.
df = variant_features.copy()
feature_columns = [name for name in df.columns if name not in non_predictors]
# Values that do not parse as numbers become NaN.
df[feature_columns] = df[feature_columns].apply(pd.to_numeric, errors="coerce")

# Hyperparameters shared by every per-column imputation model.
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` > 0, and `subsample_freq` is not set here, so row
# subsampling is probably inactive. Confirm whether that is intended.
lgbm_params = dict(
    objective="regression",
    random_state=42,
    n_estimators=180,
    num_leaves=155,
    subsample=0.8,
    colsample_bytree=0.7,
    learning_rate=0.11,
    verbose=-1,
)

# Predictions are collected here and written into df only after every model
# has been fitted, so each model is trained on the un-imputed table.
imputed_values = {}

for col in columns_to_impute:
    # Listed columns that are absent from the table are ignored.
    if col not in df.columns:
        continue

    print(f"Imputing {col}...")

    is_missing = df[col].isnull()
    # Need at least one observed row to train on and one missing row to fill.
    if is_missing.all() or not is_missing.any():
        print(f"  Skipped {col} (no train/predict data).")
        continue

    observed_rows = df[~is_missing]
    missing_rows = df[is_missing]

    # Predictors are all numeric feature columns except the target itself;
    # those that are entirely NaN in the training rows are dropped.
    candidate_predictors = [name for name in feature_columns if name != col]
    X_train = observed_rows[candidate_predictors].dropna(axis=1, how="all")
    y_train = observed_rows[col]
    X_pred = missing_rows[X_train.columns]

    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(X_train, y_train)

    # Persist the fitted model as <column>_imputer.pkl inside model_dir.
    joblib.dump(model, os.path.join(model_dir, f"{col}_imputer.pkl"))

    imputed_values[col] = model.predict(X_pred)

# df has not been modified since the predictions were made, so the missing
# rows selected here are the same rows, in the same order, as X_pred.
for col, preds in imputed_values.items():
    df.loc[df[col].isnull(), col] = preds

df.to_csv(imputed_file, sep="\t", index=False)
print(f"\nImputed variant_features saved to: {imputed_file}")


Imputing glm_CaddDeogenRevel...
Imputing glm_AlphDeogenRevel...
Imputing glm_AlphCaddDeogen...
Imputing glm_AlphRevelCadd...
Imputing glm_AlphRevel...
Imputing MutPred_score...
Imputing glm_DeogenRevel...
Imputing glm_RevelCadd...
Imputing REVEL_score...
Imputing MetaRNN_score...
Imputing M_CAP_score...
Imputing EVH_epistatic...
Imputing EVH_independent...
Imputing VARITY_ER_LOO_score...
Imputing VARITY_R_LOO_score...
Imputing VARITY_ER_score...
Imputing VARITY_R_score...
Imputing MutFormer_score...
Imputing MetaLR_score...
Imputing MetaSVM_score...
Imputing glm_AlphDeogen...
Imputing VEST4_score...
Imputing fathmm_XF_coding_score...
Imputing MutScore_score...
Imputing glm_CaddDeogen...
Imputing DEOGEN2_score...
Imputing ClinPred_score...
Imputing MPC_score...
Imputing sigma_score...
Imputing glm_AlphCadd...
Imputing gMVP_score...
Imputing EWSIM...
Imputing ESM1v...
Imputing GERP_91_mammals...
Imputing popEVE...
Imputing Polyphen2_HVAR_score...
Imputing PHACT...
Imputing LIST_S2_score.

### Create functional and clinical training inputs

In [None]:
# To start from the saved table instead of re-running the imputation cell:
# df = pd.read_csv("../data/intermediate/variant_features_extended_imputed.txt", sep="\t")
variant_labels = pd.read_csv("../data/intermediate/variant_labels.txt", sep="\t")

# Columns excluded from the model input tables ("ID" stays as the join key).
# errors="ignore" tolerates columns that are already gone, e.g. on a re-run.
# Rebinding leaves `df` in the same final state as the in-place drop did.
df = df.drop(columns=["spliceai", "gene", "ensg", "enst"], errors="ignore")

# One table per label type: keep only variants that carry that label, then
# attach the features. The left join keeps labelled variants that have no
# feature row; their feature columns are NaN.
functional_input = (
    variant_labels[["ID", "weight", "functional_label"]]
    .dropna(subset=["functional_label"])
    .merge(df, how="left", on="ID")
)
clinical_input = (
    variant_labels[["ID", "clinical_label"]]
    .dropna(subset=["clinical_label"])
    .merge(df, how="left", on="ID")
)

# to_csv does not create missing directories, so make sure the target exists
# (as is already done for the imputation model directory).
os.makedirs("../data/final", exist_ok=True)
functional_input.to_csv("../data/final/functional_labels_model_input.txt", sep="\t", index=False)
clinical_input.to_csv("../data/final/clinical_labels_model_input.txt", sep="\t", index=False)