In [1]:
import os
import numpy as np
import pandas as pd
import argparse
import pickle
import tqdm
import json
from sklearn import mixture, linear_model, svm, gaussian_process
import h5py

In [2]:
data = pd.read_csv('/home/scratch/wpotosna/disease_variant_prediction_language_model/data/labels/PTEN_ClinVar_labels.csv')

In [3]:
data.protein_name.unique()[:100]

array(['1433G_HUMAN', '2A5D_HUMAN', '2AAA_HUMAN', '3BP2_HUMAN',
       'A1AT_HUMAN', 'A4_HUMAN', 'AAKG2_HUMAN', 'AATM_HUMAN',
       'ABCBB_HUMAN', 'ABCD1_HUMAN', 'ABD12_HUMAN', 'ABHD5_HUMAN',
       'ACAD9_HUMAN', 'ACHA_HUMAN', 'ACHB_HUMAN', 'ACHD_HUMAN',
       'ACHE_HUMAN', 'ACHG_HUMAN', 'ACOX1_HUMAN', 'ACTG_HUMAN',
       'ACY2_HUMAN', 'ADA_HUMAN', 'ADA2_HUMAN', 'ADAT3_HUMAN',
       'ADRO_HUMAN', 'ADT1_HUMAN', 'AGAL_HUMAN', 'AGM1_HUMAN',
       'AHI1_HUMAN', 'AICDA_HUMAN', 'AIFM1_HUMAN', 'AIRE_HUMAN',
       'ALAT2_HUMAN', 'ALDOB_HUMAN', 'ALG1_HUMAN', 'ALG11_HUMAN',
       'ALG12_HUMAN', 'ALG3_HUMAN', 'ALG6_HUMAN', 'ALR_HUMAN',
       'ALS_HUMAN', 'AMACR_HUMAN', 'AMHR2_HUMAN', 'AMPD3_HUMAN',
       'ANAG_HUMAN', 'ANGT_HUMAN', 'ANKL2_HUMAN', 'ANO10_HUMAN',
       'ANO5_HUMAN', 'ANT3_HUMAN', 'ANTR2_HUMAN', 'AOFA_HUMAN',
       'AP2A_HUMAN', 'AP2B_HUMAN', 'AP4B1_HUMAN', 'AP4M1_HUMAN',
       'AP4S1_HUMAN', 'APC_HUMAN', 'APOE_HUMAN', 'APTX_HUMAN',
       'AQP2_HUMAN', 'ARF_HUMAN', 'AR

In [7]:
data_dir = '/home/scratch/wpotosna/disease_variant_prediction_language_model/data'
GMM_stats_log_location = '/home/scratch/wpotosna/gmm/gmm_stats.csv'
GMM_stats_path = '/home/scratch/wpotosna/gmm'

In [8]:
mapping_file = pd.read_csv(data_dir+'/labels/ClinVar_labels_P53_PTEN_RASH_SCN5A.csv',low_memory=False)
protein_list = np.unique(mapping_file['protein_name'])
list_variables_to_keep=['protein_name','mutations','evol_indices']

In [28]:
path = '/home/scratch/wpotosna/data/logodds.h5'
h5 = h5py.File(path,'r')

logodds = []
protein_names = []
for seq in h5.keys():
    seq_array = np.array(h5[seq])
    logodds.extend(seq_array)
    protein_names.extend(np.repeat(np.array(seq), seq_array.shape[0])) 
    
all_evol_indices = pd.DataFrame(logodds, index=protein_names)
all_evol_indices.index.name = 'protein_name'
all_evol_indices.reset_index(inplace=True, drop=False)

X_train = all_evol_indices.iloc[:, 1:].values.reshape(-1, 1)
X_test = all_evol_indices.iloc[:, 1:].values

In [12]:
dict_models = {}
dict_pathogenic_cluster_index = {}

main_GMM = mixture.GaussianMixture(n_components=2, covariance_type='full',max_iter=1000,n_init=30,tol=1e-4)
main_GMM.fit(X_train)
        
dict_models['main'] = main_GMM
pathogenic_cluster_index = np.argmax(np.array(main_GMM.means_).flatten()) #The pathogenic cluster is the cluster with higher mean value
dict_pathogenic_cluster_index['main'] = pathogenic_cluster_index

with open(GMM_stats_log_location, "a") as logs:
    logs.write(",".join(str(x) for x in [
        'main', np.array(main_GMM.weights_).flatten()[dict_pathogenic_cluster_index['main']], np.array(main_GMM.means_).flatten()[dict_pathogenic_cluster_index['main']],
        np.array(main_GMM.means_).flatten()[1 - dict_pathogenic_cluster_index['main']], np.sqrt(np.array(main_GMM.covariances_).flatten()[dict_pathogenic_cluster_index['main']]),
        np.sqrt(np.array(main_GMM.covariances_).flatten()[1 - dict_pathogenic_cluster_index['main']])])+"\n")
    

for protein in tqdm.tqdm(protein_list, "Training all protein GMMs"):
    X_train_protein = all_evol_indices[all_evol_indices.protein_name=='seq1'].iloc[:, 1:].values.reshape(-1, 1)
    if len(X_train_protein) > 0: #We have evol indices computed for protein on file
        protein_GMM = mixture.GaussianMixture(n_components=2,covariance_type='full',max_iter=1000,tol=1e-4,weights_init=main_GMM.weights_,means_init=main_GMM.means_,precisions_init=main_GMM.precisions_)
        protein_GMM.fit(X_train_protein)
        dict_models[protein] = protein_GMM
        dict_pathogenic_cluster_index[protein] = np.argmax(np.array(protein_GMM.means_).flatten())
        with open(GMM_stats_log_location, "a") as logs:
            logs.write(",".join(str(x) for x in [
                protein, np.array(protein_GMM.weights_).flatten()[dict_pathogenic_cluster_index[protein]], np.array(protein_GMM.means_).flatten()[dict_pathogenic_cluster_index[protein]],
                np.array(protein_GMM.means_).flatten()[1 - dict_pathogenic_cluster_index[protein]], np.sqrt(np.array(protein_GMM.covariances_).flatten()[dict_pathogenic_cluster_index[protein]]),
                np.sqrt(np.array(protein_GMM.covariances_).flatten()[1 - dict_pathogenic_cluster_index[protein]])
                ])+"\n")
    else:
        if args.verbose:
            print("No evol indices for the protein: "+str(protein)+". Skipping.")
        
pickle.dump(dict_models, open(GMM_stats_path+'/GMM_model_dictionary.pkl', 'wb'))
pickle.dump(dict_pathogenic_cluster_index, open(GMM_stats_path+'/GMM_pathogenic_cluster_index_dictionary.pkl', 'wb'))




Training all protein GMMs: 100%|████████████████████████████████████████████████████| 4/4 [00:00<00:00, 50.62it/s]


In [36]:
main_GMM.get_params()

{'covariance_type': 'full',
 'init_params': 'kmeans',
 'max_iter': 1000,
 'means_init': None,
 'n_components': 2,
 'n_init': 30,
 'precisions_init': None,
 'random_state': None,
 'reg_covar': 1e-06,
 'tol': 0.0001,
 'verbose': 0,
 'verbose_interval': 10,
 'warm_start': False,
 'weights_init': None}

In [38]:
main_GMM.weights_

array([0.41935484, 0.58064516])

# Compute EVE scores


In [43]:


def compute_weighted_score_two_GMMs(X_pred, main_model, protein_model, cluster_index_main, cluster_index_protein, protein_weight):
    return protein_model.predict_proba(X_pred)[:,cluster_index_protein] * protein_weight + (main_model.predict_proba(X_pred)[:,cluster_index_main]) * (1 - protein_weight)

def compute_weighted_class_two_GMMs(X_pred, main_model, protein_model, cluster_index_main, cluster_index_protein, protein_weight):
    """By construct, 1 is always index of pathogenic, 0 always that of benign"""
    proba_pathogenic = protein_model.predict_proba(X_pred)[:,cluster_index_protein] * protein_weight + (main_model.predict_proba(X_pred)[:,cluster_index_main]) * (1 - protein_weight)
    return (proba_pathogenic > 0.5).astype(int)

def compute_EVE_scores(test_data_protein, all_evol_indices, dict_models, dict_pathogenic_cluster_index, protein_GMM_weight):
    if protein_GMM_weight > 0.0:
        all_scores = all_evol_indices.copy()
        all_scores['EVE_scores'] = np.nan
        all_scores['EVE_classes_100_pct_retained'] = ""
        for protein in tqdm.tqdm(protein_list,"Scoring all protein mutations"):
            try:
                test_data_protein = all_scores[all_scores.protein_name==protein]
                X_test_protein = np.array(test_data_protein).reshape(-1, 1)
                mutation_scores_protein = compute_weighted_score_two_GMMs(X_pred=X_test_protein, 
                                                                            main_model = dict_models['main'], 
                                                                            protein_model=dict_models[protein], 
                                                                            cluster_index_main = dict_pathogenic_cluster_index['main'], 
                                                                            cluster_index_protein = dict_pathogenic_cluster_index[protein], 
                                                                            protein_weight = protein_GMM_weight)
                gmm_class_protein = compute_weighted_class_two_GMMs(X_pred=X_test_protein, 
                                                                            main_model = dict_models['main'], 
                                                                            protein_model=dict_models[protein], 
                                                                            cluster_index_main = dict_pathogenic_cluster_index['main'], 
                                                                            cluster_index_protein = dict_pathogenic_cluster_index[protein], 
                                                                            protein_weight = protein_GMM_weight)
                print(mutation_scores_protein)
                gmm_class_label_protein = pd.Series(gmm_class_protein).map(lambda x: 'Pathogenic' if x == 1 else 'Benign')
                    
                all_scores.loc[all_scores.protein_name==protein, 'EVE_scores'] = np.array(mutation_scores_protein)
                all_scores.loc[all_scores.protein_name==protein, 'EVE_classes_100_pct_retained'] = np.array(gmm_class_label_protein)
            except:
                print("Issues with protein: "+str(protein)+". Skipping.")
    else:
        all_scores = all_evol_indices.copy()
        mutation_scores = dict_models['main'].predict_proba(np.array(all_scores['evol_indices']).reshape(-1, 1))
        all_scores['EVE_scores'] = mutation_scores[:,dict_pathogenic_cluster_index['main']]
        gmm_class = dict_models['main'].predict(np.array(all_scores['evol_indices']).reshape(-1, 1))
        all_scores['EVE_classes_100_pct_retained'] = np.array(pd.Series(gmm_class).map(lambda x: 'Pathogenic' if x == dict_pathogenic_cluster_index['main'] else 'Benign'))
        
    len_before_drop_na = len(all_scores)
    all_scores = all_scores.dropna(subset=['EVE_scores'])
    len_after_drop_na = len(all_scores)

In [44]:
compute_EVE_scores(X_test, all_evol_indices, dict_models, dict_pathogenic_cluster_index, main_GMM.weights_.mean())

Scoring all protein mutations: 100%|██████████████████████████████████████████████| 4/4 [00:00<00:00, 1243.40it/s]

Issues with protein: P53_HUMAN. Skipping.
Issues with protein: PTEN_HUMAN. Skipping.
Issues with protein: RASH_HUMAN. Skipping.
Issues with protein: SCN5A_HUMAN. Skipping.



