In this notebook we examine whether mutations at positions with extreme MFE z-scores are more often pathogenic than mutations at other positions. We use ClinVar data and keep only variants labeled Pathogenic or Benign.

## Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from typing import Tuple
from scipy.stats import hypergeom

## Functions

In [2]:
def get_scores_parameter_set(measure: str, permutation: str, gene_id: str, mut_start_pos: int) -> Tuple[float, float]:
    '''
    Return the z-score and corrected p-value stored for one position of a gene.

    The values are read from the pickled table
    "../Results/z-scores/{gene_id}_{permutation}.pickle", from the row whose
    index label is `mut_start_pos` (columns "z-score" and "corrected p-value").

    NOTE(review): `measure` is accepted but is not used to build the path -
    confirm that the per-gene files are not measure-specific.
    '''
    scores_table = pd.read_pickle(f"../Results/z-scores/{gene_id}_{permutation}.pickle")

    # fetch the row of the requested position, then pick the two fields from it
    position_row = scores_table.loc[mut_start_pos]

    return (position_row["z-score"], position_row["corrected p-value"])
   

In [8]:
def get_scores_and_pvals(mut_info: dict, gene_protein_dict: dict, cds_chrom_dict: dict) -> Tuple[float, float, float, float]:
    '''
    Get the MFE z-scores and p-values of a single ClinVar variant.

    Parameters:
        mut_info: record of the variant; the keys "Start_Position" (1-based,
            relative to the chromosome) and "Gene stable ID" are read.
        gene_protein_dict: maps a gene id to the protein id used for that gene.
        cds_chrom_dict: maps (gene id, protein id) to a dictionary of
            cds position -> chromosome position (both 0-based).

    Returns:
        (z-score, p-value) for the "vertical" permutation followed by
        (z-score, p-value) for the "column" permutation. If anything needed for
        the lookup is missing (unknown gene, position outside the CDS, no
        score file, ...) the result is (None, None, None, None).
    '''
    try:
        # basic info of the mutation
        mut_start = mut_info["Start_Position"]  # 1-based start position, relative to the chromosome
        gene_id = mut_info["Gene stable ID"]
        protein_id = gene_protein_dict[gene_id]

        # map from a 1-based position relative to the chromosome to a 0-based position relative to the cds
        cds2chr = cds_chrom_dict[gene_id, protein_id]  # cds_pos -> chrm_pos of the current gene, 0-based
        chr2cds = {chrom_pos: cds_pos for cds_pos, chrom_pos in cds2chr.items()}  # chrm_pos -> cds_pos, 0-based
        cds_mut_start = chr2cds[mut_start - 1]  # 0-based start position relative to the CDS

        # each gene has a separate table that holds the scores of each of its cds positions
        mfe_ver_zscore, mfe_ver_pval = get_scores_parameter_set("mfe", "vertical", gene_id, cds_mut_start)
        mfe_col_zscore, mfe_col_pval = get_scores_parameter_set("mfe", "column", gene_id, cds_mut_start)
    except Exception:
        # Best effort: a variant that cannot be scored gets empty values.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt / SystemExit
        # through, so a long run over all variants can still be stopped.
        return (None, None, None, None)

    return (mfe_ver_zscore, mfe_ver_pval,
            mfe_col_zscore, mfe_col_pval)

In [None]:
def calc_hg_pval(counts_df: pd.DataFrame) -> Tuple[float, float]:
    '''
    Calculate hypergeometric enrichment p-values for the low and the high extreme groups.

    `counts_df` holds variant counts: one row per class, one column per group.
    Two layouts are accepted:
      * columns "original", "ratio_low", "ratio_high" with a "Pathogenic" row
        (the table built by get_patho_benign_ratio in this notebook);
      * columns "Original", "Percentile_low", "Percentile_high" with a `False` row
        (the layout this function was originally written for).

    Terms of the test:
      "population_size"  - total number of variants (sum of the whole-set column)
      "total_success"    - number of "success" variants in the whole set
                           (the "Pathogenic" row, or the `False` row in the older layout)
      "num_draws"        - number of variants in the extreme group (low / high)
      "observed_success" - number of "success" variants inside that extreme group

    Returns (p-value of the low group, p-value of the high group), each being
    P(X >= observed_success) under the hypergeometric distribution.

    Raises KeyError if none of the expected columns / rows is present.
    '''
    def _first_present(candidates, available, what):
        # return the first candidate label that exists in `available`
        for candidate in candidates:
            if candidate in available:
                return candidate
        raise KeyError(f"counts_df has no {what}; expected one of {candidates}")

    all_col = _first_present(("Original", "original"), counts_df.columns, "whole-set column")
    low_col = _first_present(("Percentile_low", "ratio_low"), counts_df.columns, "low-group column")
    high_col = _first_present(("Percentile_high", "ratio_high"), counts_df.columns, "high-group column")
    success_row = _first_present((False, "Pathogenic"), counts_df.index, "success row")

    population_size = counts_df[all_col].sum()
    total_success = counts_df.loc[success_row, all_col]
    num_draws_low = counts_df[low_col].sum()
    num_draws_high = counts_df[high_col].sum()

    observed_success_low = counts_df.loc[success_row, low_col]
    observed_success_high = counts_df.loc[success_row, high_col]

    # sf(k - 1) = P(X > k - 1) = P(X >= k)
    hg_p_low = hypergeom.sf(observed_success_low - 1, population_size, total_success, num_draws_low)
    hg_p_high = hypergeom.sf(observed_success_high - 1, population_size, total_success, num_draws_high)

    return (hg_p_low, hg_p_high)

In [4]:
def get_patho_benign_ratio(df: pd.DataFrame, measure: str, low_percentile: float, high_percentile: float) -> Tuple[pd.DataFrame, float, float, pd.DataFrame]:
    '''
    Compare the pathogenic/benign composition of all ClinVar variants with that of
    the variants that have extreme scores.

    Parameters:
        df: variants table; the columns `measure`, "mut_id" and
            "Clinical significance" are read.
        measure: name of the score column (for example an mfe z-score column).
        low_percentile, high_percentile: percentiles (0-100) that define the
            low group (score <= low percentile value) and the high group
            (score >= high percentile value).

    Returns:
        ratios_df: fraction of each clinical-significance class in the columns
            "original", "ratio_low", "ratio_high".
        hg_p_low, hg_p_high: p-values returned by calc_hg_pval for the two groups.
        nums_df: the variant counts behind ratios_df (same rows and columns).

    A class that does not occur in an extreme group gets a count (and ratio) of 0.
    '''
    df = df[~df[measure].isna()]  # remove mutations that don't have this score
    df = df.drop_duplicates(subset=['mut_id'])  # duplicated mutations shouldn't count more than once

    bottom_percentile = np.percentile(df[measure], low_percentile)
    top_percentile = np.percentile(df[measure], high_percentile)

    df_low = df[df[measure] <= bottom_percentile]
    df_high = df[df[measure] >= top_percentile]

    # Count the classes directly. Rebuilding counts as ratio * group size and
    # truncating to int can lose one through floating point round-off.
    labels = df["Clinical significance"].value_counts().index
    nums_df = pd.DataFrame(index=labels)
    for column, group in (("original", df), ("ratio_low", df_low), ("ratio_high", df_high)):
        class_counts = group["Clinical significance"].value_counts()
        nums_df[column] = class_counts.reindex(labels, fill_value=0)
    nums_df = nums_df.astype(int)

    # each column is divided by the size of its own group
    ratios_df = nums_df / [df.shape[0], df_low.shape[0], df_high.shape[0]]

    hg_p_low, hg_p_high = calc_hg_pval(nums_df)

    return (ratios_df, hg_p_low, hg_p_high, nums_df)

In [6]:
def plot_bar_chart(ratios_df: pd.DataFrame, nums_df: pd.DataFrame, measure: str, low_percentile: float, high_percentile: float) -> None:
    '''
    Draw one stacked bar (pathogenic below, benign on top) for each of the three
    groups - all variants, the low-percentile group and the high-percentile group -
    so an enrichment can be seen by eye.

    `ratios_df` and `nums_df` are read at the rows "Pathogenic" / "Benign" and the
    columns "original", "ratio_low", "ratio_high". `measure` only affects the title.
    '''
    group_columns = ['original', 'ratio_low', 'ratio_high']

    # one row per group: [pathogenic ratio, benign ratio]
    group_data = np.array([
        [ratios_df.loc['Pathogenic', column], ratios_df.loc['Benign', column]]
        for column in group_columns
    ])

    # normalize every group so its two parts sum to 1
    group_totals = group_data.sum(axis=1)
    patho_ratio = group_data[:, 0] / group_totals
    benign_ratio = group_data[:, 1] / group_totals

    # tick labels, including the number of variants in each group
    group_sizes = {column: int(nums_df[column].sum()) for column in group_columns}
    x_labels = [
        f'original\n(n = {group_sizes["original"]})',
        f'{low_percentile} percentile\n(n = {group_sizes["ratio_low"]})',
        f'{high_percentile} percentile\n(n = {group_sizes["ratio_high"]})',
    ]
    x_pos = np.arange(len(x_labels))

    fig, ax = plt.subplots()
    bar_width = 0.5

    # lower part of each bar: pathogenic ratio
    patho_bars = ax.bar(x_pos, patho_ratio, bar_width, label='Pathogenic', color='blue')

    # write the pathogenic ratio inside each bar, at a fixed height
    dist_from_bottom = 0.3
    for patho_bar in patho_bars:
        label_x = patho_bar.get_x() + patho_bar.get_width() / 2
        ax.text(label_x, dist_from_bottom, f'{patho_bar.get_height():.2f}', ha='center', color='white')

    # upper part of each bar: benign ratio, stacked on the pathogenic part
    benign_bars = ax.bar(x_pos, benign_ratio, bar_width, label='Benign', color='orange', bottom=patho_ratio)

    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_labels)
    ax.set_ylabel('Ratio')
    ax.legend([patho_bars[0], benign_bars[0]], ['Pathogenic', 'Benign'])

    # the title is derived from the name of the measure column
    m = "MFE" if "mfe" in measure else "CAI"
    perm = "vertical" if "v" in measure else "column"
    if "z" in measure:
        title = f"Z-scores obtained with {m}, {perm} permutations"
    else:
        title = f"Z-scores obtained with {m}"
    ax.set_title(title)

    plt.show()



In [7]:
def calc_emperical_pval(clinvar_df: pd.DataFrame, measure: str, num_random_groups: int, low_percentile: float, high_percentile: float) -> Tuple[float, float]:
    '''
    Obtain empirical p-values for the number of pathogenic variants in the two
    extreme-score groups, by comparing them with randomly chosen groups of
    ClinVar variants of the same size.

    Parameters:
        clinvar_df: variants table; the columns `measure`, "mut_id" and
            "Clinical significance" are read.
        measure: name of the score column.
        num_random_groups: number of random groups drawn for each extreme group.
        low_percentile, high_percentile: percentiles (0-100) defining the low
            group (score <= low percentile value) and the high group
            (score >= high percentile value).

    Returns:
        (p_val_low, p_val_high): for each extreme group, the fraction of random
        groups with at least as many pathogenic variants as the real group.
    '''
    clinvar_df = clinvar_df[~clinvar_df[measure].isna()]  # remove mutations that don't have a score for the current measure
    clinvar_df = clinvar_df.drop_duplicates(subset=['mut_id'])  # remove duplicate mutations

    scores = clinvar_df[measure]
    # A boolean mask is used for counting instead of value_counts()["Pathogenic"],
    # which raises KeyError when a group happens to contain no pathogenic variant.
    is_pathogenic = clinvar_df["Clinical significance"] == "Pathogenic"

    # percentile 1 and 99 for example
    bottom_percentile = np.percentile(scores, low_percentile)
    top_percentile = np.percentile(scores, high_percentile)

    # the mutations below bottom_percentile and above top_percentile for the current score
    in_low = scores <= bottom_percentile
    in_high = scores >= top_percentile

    # sizes of the two groups (should be very similar but not necessarily identical)
    num_muts_low, num_muts_high = int(in_low.sum()), int(in_high.sum())
    # number of pathogenic mutations in those groups
    num_patho_low = int((is_pathogenic & in_low).sum())
    num_patho_high = int((is_pathogenic & in_high).sum())

    # number of pathogenic mutations in randomly chosen groups of the same sizes
    num_patho_rand_low = np.zeros(num_random_groups)
    num_patho_rand_high = np.zeros(num_random_groups)

    for i in range(num_random_groups):
        # only the labels are needed, so sample the label mask rather than whole rows
        num_patho_rand_low[i] = is_pathogenic.sample(n=num_muts_low).sum()
        num_patho_rand_high[i] = is_pathogenic.sample(n=num_muts_high).sum()

    p_val_low = 1 - np.sum(num_patho_rand_low < num_patho_low) / (num_random_groups)
    p_val_high = 1 - np.sum(num_patho_rand_high < num_patho_high) / (num_random_groups)

    return (p_val_low, p_val_high)
 

## Main

In [3]:
# Get the variants from ClinVar (read from the pickled, already processed table)
clinvar_df = pd.read_pickle(
    "../co_trans_data/ClinVar_processed_df.pickle"
)

In [6]:
# Load the lookups needed to map a variant's "mut_start" to a position on the gene's cds sequence

# gene id -> protein id (the protein id that we used for each gene)
gene_protein_dict = pd.read_pickle(
    "../co_trans_data/gene_protein_dict.pickle"
)

# cds positions -> chromosome positions of our genes
cds_to_chrom_dict = pd.read_pickle(
    "../co_trans_data/cds_to_chrom_dict_with_protein_id.pickle"
)


In [155]:
# Get the z-scores and p-values of every ClinVar variant, store them as four new
# columns of clinvar_df, and save the resulting table.
clinvar_df[["mfe_v_z", "mfe_v_p", "mfe_c_z", "mfe_c_p"]] = clinvar_df.apply(
    lambda variant: pd.Series(
        get_scores_and_pvals(variant, gene_protein_dict, cds_to_chrom_dict)
    ),
    axis=1,
)

# the table is pickled with protocol 3
with open('../Results/validation/clinvar_df_with_zscores_pvals', 'wb') as handle:
    pickle.dump(clinvar_df, handle, protocol=3)
