# Notebook for making feature selection tables (TableS16), metric tables (TableS22) and preparing for feature selection table (Table3)

In [None]:
# Configuration part #
# All paths below are plain strings interpolated into file names by the
# cells that follow.

# Directory with matrix after deduplication
# (not referenced by the cells visible in this notebook)
data_dir = "../db/cv_bess_files.2"
# Directory with matrix before deduplication; the per-drug position tables,
# genotype matrices and phenotype files are read from here
data_dir_pre = "../db/cv_bess_files"

# Output from elastic_net.py: per-drug sub-directories holding auc_<gamma>.csv
# and the per-fold coefficient tables; the fisher_* tables are written back here
output_elastic_net = './output_elastic_net'

# Directory receiving the final tables (TableS22.csv, TableS16.csv, one CSV
# per drug, elastic_net_feature_metrics.csv)
final_output_elastic_net = './final_output_elastic_net'

# Annotation file (read as a headerless, tab-separated table)
annotation_file = '../db/AL123456_rev.gff'
# Resistance genes file (tab-separated with a header row; the 'drug' and
# 'gene' columns are used)
resistance_gene_file = '../db/resistance_genes.csv'

# List of gamma parameters for elastic net regularization; one
# auc_<gamma>.csv is expected per value and per drug
gammas = [0.25, 0.5, 0.75]

######################

In [1]:
import os

from copy import deepcopy

import pandas as pd
import numpy as np

from tqdm import tqdm

from scipy.stats import fisher_exact

In [2]:

# Make sure the directory for the final tables exists. exist_ok=True replaces
# the separate os.path.exists() check and its check-then-create race.
os.makedirs(final_output_elastic_net, exist_ok=True)

# NOTE(review): not referenced by the visible cells; presumably the threshold
# that appears as ".3." in the input file names - confirm.
thr = '3'

# Drugs processed by every cell below; each one has its own input files and
# its own sub-directory under output_elastic_net.
drugs = ['Kanamycin', 'Amikacin', 'Streptomycin', 'Ofloxacin',
         'Moxifloxacin', 'Isoniazid', 'Rifampicin', 'Ethambutol',
         'Pyrazinamide', 'Capreomycin', 'Ethionamide', 'Prothionamide',
         'Ciprofloxacin']

# The annotation has no header row, so columns are addressed by position:
# column 2 is compared against "CDS", column 8 is the ';'-separated attribute
# string that is parsed below.
annotation = pd.read_csv(annotation_file, sep='\t', index_col=None, header=None)
# .copy() turns the CDS subset into an independent frame, so the column
# assignments below do not write into a slice of the full table.
annotation = annotation[annotation[2] == "CDS"].copy()
# gene: last whitespace-separated token of the third ';' field.
annotation['gene'] = [x.split(';')[2].split()[-1] for x in annotation[8]]
# seq: second token of the last ';' field with its first character dropped.
annotation['seq'] = [x.split(';')[-1].split()[1][1:] for x in annotation[8]]
annotation = annotation[['gene', 'seq']]

# Known resistance genes per drug (tab-separated, header in the first row).
res_data = pd.read_csv(resistance_gene_file, sep='\t',
                       index_col=None, header=0)

## Support Functions

In [3]:
def represents_int(s):
    """Tell whether ``int(s)`` succeeds.

    Returns ``True`` when ``int(s)`` does not raise ``ValueError`` and
    ``False`` when it does. Any other exception raised by ``int`` is not
    caught and propagates to the caller.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

In [4]:
def conj_matrix(X):
    """Count the rows of ``X`` for each 0/1 combination of its first two columns.

    Returns a list ``[n00, n01, n10, n11]`` where ``nab`` is the number of
    rows whose first column equals ``a`` and whose second column equals ``b``.
    Rows holding any other value in either column are not counted.
    """
    first = X.iloc[:, 0]
    second = X.iloc[:, 1]
    return [len(X[(first == a) & (second == b)])
            for a in (0, 1)
            for b in (0, 1)]

def fisher(X):
    """Return the Fisher exact test p-value for the first two columns of ``X``.

    The 2x2 contingency table is built from the counts returned by
    ``conj_matrix(X)``: rows follow the value (0/1) of the first column,
    columns the value (0/1) of the second column. Only the p-value (second
    element of the ``fisher_exact`` result) is returned.
    """
    value00, value01, value10, value11 = conj_matrix(X)
    return fisher_exact([[value00, value01], [value10, value11]])[1]

In [5]:
def get_ind_by_name(pos_info, name, drug):
    """Find the row of ``pos_info`` that a feature ``name`` refers to.

    ``name`` is an underscore-separated string whose last token selects the
    lookup rule:

    * ``<gene>_domain``: row with that gene and ``pos == 'changed'``
    * ``<gene>_broken``: row with that gene and ``pos == 'broken'``
    * ``<gene>_<pos>_<ind>_snp`` / ``..._ins``: row matching gene, pos,
      ``ind2 == ind`` and ``act`` equal to the suffix
    * ``<gene>_<pos>_<ind>_del``: if ``ind`` parses as an integer the row
      must have ``ind1 == ind`` and ``ind2 == 'del'``; otherwise
      ``ind2 == ind`` and ``act == 'del'``

    Parameters:
        pos_info: DataFrame with columns 'gene', 'pos', 'ind1', 'ind2', 'act'.
        name: feature name as described above.
        drug: accepted for signature compatibility; not used.

    Returns:
        The index label of the first matching row, or -1 when nothing
        matches. A name with an unrecognised suffix or too few tokens also
        yields -1 (previously this raised UnboundLocalError / IndexError).
    """
    parts = name.split('_')
    kind = parts[-1]

    if kind in ('domain', 'broken'):
        gene = "_".join(parts[:-1])
        # Domain features are stored with the marker 'changed' in 'pos'.
        marker = 'changed' if kind == 'domain' else 'broken'
        mask = (pos_info['gene'] == gene) & (pos_info['pos'] == marker)
    elif kind in ('snp', 'del', 'ins'):
        if len(parts) < 3:
            # Not enough tokens to hold <pos>_<ind>_<kind>.
            return -1
        gene = "_".join(parts[:-3])
        pos, ind = parts[-3], parts[-2]
        located = (pos_info['gene'] == gene) & (pos_info['pos'] == pos)
        if kind == 'del' and represents_int(ind):
            # Deletions named by a number keep it in 'ind1' and carry the
            # literal 'del' in 'ind2'.
            mask = located & (pos_info['ind1'] == ind) & (pos_info['ind2'] == 'del')
        else:
            mask = located & (pos_info['ind2'] == ind) & (pos_info['act'] == kind)
    else:
        return -1

    temp = pos_info[mask]
    if len(temp) == 0:
        return -1
    return temp.index[0]
    
def get_name_by_ind(pos_info, j, drug):
    """Build the feature name for the ``j``-th row (positional) of ``pos_info``.

    The row's 'gene', 'pos', 'ind1', 'ind2' and 'act' fields are combined by
    the first rule that applies, checked in this order:

    * ``pos == 'changed'`` gives ``<gene>_domain``
    * ``pos == 'broken'`` gives ``<gene>_broken``
    * ``act == 'snp'`` gives ``<gene>_<pos>_<ind2>_snp``
    * ``ind2 == 'del'`` gives ``<gene>_<pos>_<ind1>_del``
    * ``act`` is 'del' or 'ins' gives ``<gene>_<pos>_<ind2>_<act>``

    Returns ``None`` when no rule applies. ``drug`` is accepted but not used.
    """
    row = pos_info.iloc[j]
    gene = row['gene']
    pos = row['pos']
    ind1 = row['ind1']
    ind2 = row['ind2']
    act = row['act']

    if pos == 'changed':
        return "{}_domain".format(gene)
    if pos == 'broken':
        return "{}_broken".format(gene)
    if act == 'snp':
        return "{}_{}_{}_snp".format(gene, pos, ind2)
    if ind2 == 'del':
        return "{}_{}_{}_{}".format(gene, pos, ind1, ind2)
    if act == 'del' or act == 'ins':
        return "{}_{}_{}_{}".format(gene, pos, ind2, act)
    return None
    
def get_description(pos_info, ind):
    """Describe the feature stored in row ``ind`` (index label) of ``pos_info``.

    The row is unpacked positionally into gene, position, "from", "to" and
    action. The result is a five-element list
    ``[gene, pos, event, from, to]``; the first matching rule wins:

    * action 'snp' / 'ins' / 'del': event 'SNV' / 'Insertion' / 'Deletion'
      with the row's own from/to values
    * "to" equal to 'del': the deleted sequence is cut out of the gene's
      sequence in the module-level ``annotation`` table
    * position 'changed': the gene field is split into a gene name and a
      domain name and reported as '<domain> is changed'
    * position 'broken': event 'Broken'

    Returns ``None`` when no rule applies.
    """
    gene, pos, fr, to, act = pos_info.loc[ind].values

    event_by_action = {'snp': 'SNV', 'ins': 'Insertion', 'del': 'Deletion'}
    if act in event_by_action:
        return [gene, pos, event_by_action[act], fr, to]

    if to == 'del':
        # Here "from" holds a number; the deleted bases are taken from the
        # annotated gene sequence.
        # NOTE(review): the slice spans int(fr) + 1 characters - confirm that
        # this is the intended length.
        seq = annotation[annotation['gene'] == gene]['seq'].to_numpy()[0]
        fr = seq[int(pos)-1: int(pos)+int(fr)]
        return [gene, pos, 'Deletion', fr, "-"]

    if pos == 'changed':
        parts = gene.split('_')
        # The gene name is every non-integer token except the last one; the
        # domain is whatever follows the first len(gene-name tokens) tokens.
        non_numeric = [p for p in parts if not represents_int(p)]
        new_gene = "_".join(non_numeric[:-1])
        domain = "_".join(parts[len(new_gene.split('_')):])
        return [new_gene, '-', f'{domain} is changed', '-', '-']

    if pos == 'broken':
        return [gene, '-', 'Broken', '-', '-']

    return None

## Add statistical info

In [6]:
# For every drug: pick the best (l, gamma) by mean AUC, then annotate each
# fold's selected features with a readable name, a 2x2 contingency table
# against the phenotype and a Fisher exact test p-value.
for drug in tqdm(drugs):
    # Fold-0 position table. The row labels returned by get_ind_by_name for
    # this table are used below as column numbers of the fold-0 genotype
    # matrices.
    pos_info = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.0", sep='\t', 
                              index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
    pos_info.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']
    
    # Collect the per-gamma AUC tables and concatenate once (avoids growing a
    # frame from an empty DataFrame inside the loop).
    auc_frames = []
    for gamma in gammas:
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/auc_{gamma}.csv", sep='\t')
        temp['gamma'] = gamma
        auc_frames.append(temp)
    auc_data = pd.concat(auc_frames, axis=0)

    # After the reset the labels are 0..n-1, so the position returned by
    # np.argmax is also a valid .loc label.
    auc_data.reset_index(inplace=True)

    l_max, gamma_max = auc_data.loc[np.argmax(auc_data['mean_auc'])][['l', 'gamma']].values

    # The phenotype vector comes from the fold-0 files only and does not
    # depend on the fold index, so it is read once per drug.
    y_train= pd.read_csv(f"{data_dir_pre}/{drug}.phen.domains.3.train.0", sep=' ', 
                         index_col=None, header=None)
    y_test = pd.read_csv(f"{data_dir_pre}/{drug}.phen.domains.3.test.0", sep=' ', 
                         index_col=None, header=None)
    y = pd.concat([y_train, y_test])

    for i in range(5):
        pos_info_temp = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.{i}", sep='\t', 
                              index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
        pos_info_temp.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']
        
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/{l_max:.2f}_{gamma_max}_{i}.csv", 
                       sep='\t', index_col=0, header=0)
        # An index entry may list several fold-i feature numbers separated by
        # ', '; every one of them is translated to a name.
        temp['name'] = [", ".join([get_name_by_ind(pos_info_temp, int(k), drug) for k in str(x).split(', ')])
                        for x in temp.index]
        # Only the first feature number of each entry is used for the
        # statistics; it is mapped from fold i to fold 0 through its name
        # (-1 when fold 0 has no such feature).
        fold_indeces = [int(str(x).split(', ')[0]) for x in temp.index]
        indeces = [get_ind_by_name(pos_info, get_name_by_ind(pos_info_temp, k, drug), drug)
                   for k in fold_indeces]
        
        # Columns to load: every feature that exists in fold 0.
        sel_indeces = [k for k in indeces if k != -1]
        
        X_test = pd.read_csv(f"{data_dir_pre}/{drug}.gt.domains.3.test.0", sep=' ', 
                             index_col=None, header=None, usecols=sel_indeces)
        X_train = pd.read_csv(f"{data_dir_pre}/{drug}.gt.domains.3.train.0", sep=' ', 
                             index_col=None, header=None, usecols=sel_indeces)
        X = pd.concat([X_train, X_test])
        X['y'] = y.to_numpy().ravel()

        pvalues = []
        values00 = []
        values01 = []
        values10 = []
        values11 = []

        for j in indeces:
            if j == -1: 
                # Feature absent from fold 0: no statistics available.
                pvalues.append('-')
                [value00, value01, value10, value11] = ['-', '-', '-', '-']
            else: 
                pair = X[[j, 'y']]
                pvalues.append(fisher(pair))
                [value00, value01, value10, value11] = conj_matrix(pair)
            values00.append(value00)
            values01.append(value01)
            values10.append(value10)
            values11.append(value11)
        temp['0_0'] = values00
        temp['0_1'] = values01
        temp['1_0'] = values10
        temp['1_1'] = values11

        temp['pvalue'] = pvalues
        temp.to_csv(f"{output_elastic_net}/{drug}/fisher_{l_max:.2f}_{gamma_max}_{i}.csv", 
                    sep='\t', header=True, index=True)

100%|██████████| 13/13 [10:19<00:00, 47.65s/it]


## Elastic net metrics (TableS22)

In [14]:
# One row per drug holding the metrics of the (l, gamma) setting with the
# highest mean AUC.
elastic_auc = pd.DataFrame(index=drugs, columns=['auc', 'sensitivity', 
                                                 'specificity', 'npv', 'ppv'])
gammas = [0.25, 0.5, 0.75]
for drug in drugs:
    # Read every per-gamma table and concatenate once instead of growing a
    # frame that starts out empty.
    auc_frames = []
    for gamma in gammas:
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/auc_{gamma}.csv", sep='\t')
        temp['gamma'] = gamma
        auc_frames.append(temp)
    auc_data = pd.concat(auc_frames, axis=0)
    auc_data = auc_data[['mean_auc', 'mean_sensitivity', 
                        'mean_specificity', 'mean_npv', 'mean_ppv']]
    auc_data.columns = ['auc', 'sensitivity', 'specificity', 'npv', 'ppv']
    # drop=True keeps the old per-file row numbers out of the table, so the
    # selected row carries exactly the five metric columns.
    auc_data = auc_data.reset_index(drop=True)
    # np.argmax returns a position, hence iloc.
    elastic_auc.loc[drug] = auc_data.iloc[np.argmax(auc_data['auc'])]
elastic_auc.to_csv(f"{final_output_elastic_net}/TableS22.csv", sep='\t',
                  index=True, header=True)

## Form feature selection table 

In [16]:
# Build the per-drug feature selection tables and their concatenation
# (TableS16): one row per feature name with its description, mean
# coefficient, Fisher statistics and the number of folds that selected it.
result_columns = ['gene', 'pos', 'act', 'from', 'to', 'num', 
                  'name', 'coef', 'pvalue', '0_0', 
                  '0_1', '1_0', '1_1', 'count']
per_drug_results = []

for drug in tqdm(drugs):
    auc_frames = []
    for gamma in gammas:
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/auc_{gamma}.csv", sep='\t')
        temp['gamma'] = gamma
        auc_frames.append(temp)
    auc_data = pd.concat(auc_frames, axis=0)

    # After the reset the labels are 0..n-1, so the position returned by
    # np.argmax is also a valid .loc label.
    auc_data.reset_index(inplace=True)

    l_max, gamma_max = auc_data.loc[np.argmax(auc_data['mean_auc'])][['l', 'gamma']].values

    # Stack the five per-fold tables; 'count' then sums to the number of
    # folds in which a name occurs.
    fold_tables = []
    for i in range(5):
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/fisher_{l_max:.2f}_{gamma_max}_{i}.csv", 
                           sep='\t', index_col=0, header=0)
        fold_tables.append(temp)
    count_features = pd.concat(fold_tables, axis=0)
    count_features['count'] = 1    

    # 'mean' / 'sum' are the string spellings of np.mean / np.sum. The
    # remaining columns keep the value of the first occurrence of each name.
    count_features = count_features.groupby(['name']).agg({'coef': 'mean', 
                                                                  'count': 'sum',
                                                                  'pvalue': lambda x: x.iloc[0],
                                                                  '0_0': lambda x: x.iloc[0],
                                                                  '0_1': lambda x: x.iloc[0],
                                                                  '1_0': lambda x: x.iloc[0],
                                                                  '1_1': lambda x: x.iloc[0]})
    
    count_features.sort_values('count', ascending=False, inplace=True)

    # Read the per-fold position tables once per drug instead of once per
    # feature name.
    fold_pos_info = []
    for i in range(5):
        pos_info = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.{i}", sep='\t', 
                          index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
        pos_info.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']
        pos_info.fillna('Nope', inplace=True)
        fold_pos_info.append(pos_info)

    rows = []
    # The agg dict above fixes the column order that is unpacked here.
    for j, (names, values) in enumerate(zip(count_features.index,
                                            count_features.to_numpy())):
        coef, count, pvalue, value00, value01, value10, value11 = values
        for name in names.split(', '):
            # Use the first fold whose position table contains the feature.
            for pos_info in fold_pos_info:
                ind = get_ind_by_name(pos_info, name, drug)
                if ind != -1:
                    break
            else:
                # Previously this surfaced as "KeyError: -1" from inside
                # get_description.
                raise KeyError(f"{drug}: feature {name!r} not found in any "
                               f"fold position table")

            rows.append([*get_description(pos_info, ind), j, name, coef, pvalue, value00,
                         value01, value10, value11, count])

    result = pd.DataFrame(rows, columns=result_columns)
    result.drop(['name'], axis=1, inplace=True)
    result['drug'] = drug
    result.to_csv(f"{final_output_elastic_net}/{drug}.csv", sep='\t', 
                  index=False, header=True)
    
    per_drug_results.append(result)

all_data = pd.concat(per_drug_results) if per_drug_results else pd.DataFrame()
all_data.to_csv(f"{final_output_elastic_net}/TableS16.csv", sep='\t', 
                index=False, header=True)


100%|██████████| 13/13 [01:32<00:00,  7.08s/it]


## Elastic net feature metrics 

In [20]:
# Per-drug summary of how the stably selected features (chosen in at least
# three of the five folds) relate to the known resistance genes.
elastic_result = pd.DataFrame(index=drugs, 
                            columns=['res_gen', 'jacard', 
                                     'sel_true_gene', 'sel_false_gene'])
for drug in drugs:
    auc_frames = []
    for gamma in gammas:
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/auc_{gamma}.csv", sep='\t')
        temp['gamma'] = gamma
        auc_frames.append(temp)
    auc_data = pd.concat(auc_frames, axis=0)

    # After the reset the labels are 0..n-1, so the position returned by
    # np.argmax is also a valid .loc label.
    auc_data.reset_index(inplace=True)

    l_max, gamma_max = auc_data.loc[np.argmax(auc_data['mean_auc'])][['l', 'gamma']].values
    
    fold_features = []
    for i in range(5):
        pos_info = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.{i}", sep='\t',
                                  index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
        pos_info.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']
        pos_info.fillna('Nope', inplace=True)
        
        temp = pd.read_csv(f"{output_elastic_net}/{drug}/fisher_{l_max:.2f}_{gamma_max}_{i}.csv", 
                           sep='\t', index_col=0, header=0)
        
        # An index entry may list several feature numbers separated by ', '.
        description = [", ".join([get_name_by_ind(pos_info, int(k), drug) 
                                 for k in str(x).split(", ")]) for x in temp.index]
        
        temp['Event description'] = description
        # drop() returns a new frame; the original call discarded it, so the
        # 'name' column was never actually removed.
        temp = temp.drop('name', axis=1)
        
        fold_features.append(temp)

    features = pd.concat(fold_features, axis=0)
    features['count'] = 1

    # 'mean' / 'sum' are the string spellings of np.mean / np.sum.
    features = features.groupby(['Event description']).agg({'coef': 'mean', 
                                               'count': 'sum', 
                                               'pvalue': lambda x: x.iloc[0]})
    features.sort_values('count', ascending=False, inplace=True)
    features = features[features['count'] >= 3]

    # The gene is taken as the text before the first '_' of each description.
    sel_genes = list({x.split('_')[0] for x in features.index})
    sel_genes = pd.DataFrame(sel_genes, columns=['gene'])
    # 'true': listed as a resistance gene for this drug.
    sel_genes['true'] = sel_genes['gene'].isin(res_data[res_data['drug'] == drug]['gene'])
    # 'false': listed for some other drug but not for this one.
    sel_genes['temp'] = sel_genes['gene'].isin(res_data[res_data['drug'] != drug]['gene'])
    sel_genes['false'] = ~sel_genes['true'] & sel_genes['temp']

    res_gene = len(res_data[res_data['drug'] == drug])
    sel_true_gene = np.sum(sel_genes['true'])
    sel_false_gene = np.sum(sel_genes['false'])
    # NOTE(review): the denominator uses len(features) (number of selected
    # features), not the number of selected genes - confirm this is the
    # intended set size for the Jaccard index.
    jacard = sel_true_gene/(res_gene + len(features) - sel_true_gene)

    elastic_result.loc[drug] = [res_gene, jacard, sel_true_gene, sel_false_gene]
elastic_result.to_csv(f"{final_output_elastic_net}/elastic_net_feature_metrics.csv", 
                      sep='\t', index=True, header=True)
