# Notebook to form Table4, TableS11, TableS17, TableS29 preparation for feature matrix (Table 3)

In [3]:
# Configuration part #
# NOTE: data_dir and data_dir_pre are empty here and must be filled in before
# running; the cells below build paths as f"{data_dir}/..." and
# f"{data_dir_pre}/...", so an empty value yields a path starting with "/".

# Directory with matrix after deduplication
data_dir = ""
# Directory with matrix before deduplication
data_dir_pre = ""
# Output directory of abess_lr.py
output_abess_lr = './output_abess_lr/lr'
# Output directory
final_output_abess = './final_output_abess'
# Annotation file
annotation_file = '../db/AL123456_rev.gff'
# Resistance genes file
resistance_gene_file = '../db/resistance_genes.csv'

######################

In [1]:
import os

from copy import deepcopy

import pandas as pd
import numpy as np

from tqdm import tqdm

from scipy.stats import fisher_exact

In [2]:
# Create the output directory if it is missing (no error when it already exists).
os.makedirs(final_output_abess, exist_ok=True)

# Threshold tag that is embedded in the names of the feature files read below.
thr = '3'

drugs = ['Kanamycin', 'Amikacin', 'Streptomycin', 'Ofloxacin',
         'Moxifloxacin', 'Isoniazid', 'Rifampicin', 'Ethambutol',
         'Pyrazinamide', 'Capreomycin', 'Ethionamide', 'Prothionamide',
         'Ciprofloxacin']

# Annotation table: keep only CDS rows and reduce them to a gene name and a
# sequence, both parsed out of the ';'-separated attribute column (column 8).
annotation = pd.read_csv(annotation_file, sep='\t', index_col=None, header=None)
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not write into a slice of the original table
# (which pandas reports as SettingWithCopyWarning).
annotation = annotation[annotation[2] == "CDS"].copy()
annotation['gene'] = [x.split(';')[2].split()[-1] for x in annotation[8]]
annotation['seq'] = [x.split(';')[-1].split()[1][1:] for x in annotation[8]]
annotation = annotation[['gene', 'seq']]

# Tab-separated table with a header row; later cells read its 'drug' and 'gene' columns.
res_data = pd.read_csv(resistance_gene_file, sep='\t',
                       index_col=None, header=0)

## Support functions

In [3]:
def represents_int(s):
    """Tell whether ``int(s)`` succeeds.

    Returns True when ``int(s)`` parses and False when it raises ValueError.
    Any other exception raised by ``int`` is not caught.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

In [4]:
def conj_matrix(X):
    """Count the four 0/1 combinations of the first two columns of ``X``.

    Returns ``[n00, n01, n10, n11]`` where ``nAB`` is the number of rows whose
    first column equals A and whose second column equals B. Rows holding any
    other value in either column are not counted.
    """
    first = X.iloc[:, 0]
    second = X.iloc[:, 1]
    # Nested iteration order yields (0,0), (0,1), (1,0), (1,1).
    return [int(((first == a) & (second == b)).sum())
            for a in (0, 1)
            for b in (0, 1)]

def fisher(X):
    """Return the Fisher exact test p-value for the first two columns of ``X``.

    The 2x2 table is built with ``conj_matrix``; ``fisher_exact`` is called
    with its default arguments and only the p-value (second element of its
    result) is returned.
    """
    value00, value01, value10, value11 = conj_matrix(X)
    return fisher_exact([[value00, value01], [value10, value11]])[1]

In [1]:
def get_ind_by_name(pos_info, name, drug):
    """Find the row of ``pos_info`` that corresponds to the feature ``name``.

    ``name`` is split on '_' and interpreted by its tokens:

    * ``<gene>_domain``            -> row with that gene and pos == 'changed'
    * ``<gene>_broken``            -> row with that gene and pos == 'broken'
    * ``<gene>_<pos>_<x>_snp``     -> gene, pos, ind2 == x, act == 'snp'
    * ``<gene>_<pos>_<x>_ins``     -> gene, pos, ind2 == x, act == 'ins'
    * ``<gene>_<pos>_<x>_del``     -> if x parses as an integer: gene, pos,
      ind1 == x, ind2 == 'del'; otherwise gene, pos, ind2 == x, act == 'del'
    * ``structure_<pos>_<ind1>``   -> gene == 'structure', pos, ind1
      (the layout produced by ``get_name_by_ind``)

    Parameters:
        pos_info: DataFrame with columns 'gene', 'pos', 'ind1', 'ind2', 'act'.
            Values are compared with the string tokens of ``name``, so the
            columns are presumably expected to hold strings - verify against
            how the table is loaded.
        name: feature name.
        drug: unused; kept so existing calls keep working.

    Returns:
        The index label of the first matching row, or -1 when no row matches
        or ``name`` follows none of the layouts above.
    """
    parts = name.split('_')
    kind = parts[-1]

    if kind == 'domain':
        gene = "_".join(parts[:-1])
        mask = (pos_info['gene'] == gene) & (pos_info['pos'] == 'changed')
    elif kind == 'broken':
        gene = "_".join(parts[:-1])
        mask = (pos_info['gene'] == gene) & (pos_info['pos'] == 'broken')
    elif kind in ('snp', 'del', 'ins'):
        if len(parts) < 3:
            # Too few tokens to hold a position and an allele.
            return -1
        gene = "_".join(parts[:-3])
        pos, ind = parts[-3], parts[-2]
        located = (pos_info['gene'] == gene) & (pos_info['pos'] == pos)
        if kind == 'del' and represents_int(ind):
            # Deletion written as a number: the number sits in ind1 and the
            # 'del' marker in ind2.
            mask = located & (pos_info['ind1'] == ind) & (pos_info['ind2'] == 'del')
        else:
            mask = located & (pos_info['ind2'] == ind) & (pos_info['act'] == kind)
    elif parts[0] == 'structure':
        if len(parts) < 3:
            return -1
        # 'structure' is the first token of these names, and the value stored
        # in the 'gene' column of such rows.
        mask = ((pos_info['gene'] == 'structure')
                & (pos_info['pos'] == parts[1])
                & (pos_info['ind1'] == parts[2]))
    else:
        # Unknown layout: report "not found" like any other miss.
        return -1

    temp = pos_info[mask]
    if len(temp) == 0:
        return -1
    return temp.index[0]
    
def get_name_by_ind(pos_info, j, drug):
    """Build the feature name for the row at position ``j`` of ``pos_info``.

    The row's 'gene', 'pos', 'ind1', 'ind2' and 'act' values are checked in a
    fixed order and the first matching rule wins:

    * pos == 'changed'   -> ``<gene>_domain``
    * pos == 'broken'    -> ``<gene>_broken``
    * act == 'snp'       -> ``<gene>_<pos>_<ind2>_snp``
    * ind2 == 'del'      -> ``<gene>_<pos>_<ind1>_del``
    * act == 'del'/'ins' -> ``<gene>_<pos>_<ind2>_<act>``
    * gene == 'structure'-> ``structure_<pos>_<ind1>``

    Returns None when no rule applies. ``drug`` is not used.
    """
    row = pos_info.iloc[j]
    gene = row['gene']
    pos = row['pos']
    ind1 = row['ind1']
    ind2 = row['ind2']
    act = row['act']

    if pos == 'changed':
        return f"{gene}_domain"
    if pos == 'broken':
        return f"{gene}_broken"
    if act == 'snp':
        return f"{gene}_{pos}_{ind2}_snp"
    # Must be tested before the act-based deletion rule below.
    if ind2 == 'del':
        return f"{gene}_{pos}_{ind1}_{ind2}"
    if act == 'del' or act == 'ins':
        return f"{gene}_{pos}_{ind2}_{act}"
    if gene == 'structure':
        return f"{gene}_{pos}_{ind1}"
    return None
    
def get_description(pos_info, ind):
    """Describe the feature stored in row ``ind`` of ``pos_info``.

    The row must hold exactly five values, read in column order as
    gene, pos, fr, to, act.

    Returns a five-item list ``[gene, pos, kind, from, to]``:

    * act 'snp' / 'ins' / 'del' -> kind 'SNV' / 'Insertion' / 'Deletion' with
      the row's own from/to values;
    * to == 'del' -> 'Deletion' whose "from" is sliced out of the gene's
      sequence in the module-level ``annotation`` table;
    * pos == 'changed' -> ``[gene_part, '-', '<domain> is changed', '-', '-']``;
    * pos == 'broken' -> ``[gene, '-', 'Broken', '-', '-']``;
    * gene == 'structure' -> ``[fr, '-', 'TreeBreaker clade', '-', '-']``.

    Returns None when none of these rules applies.
    """
    gene, pos, fr, to, act = pos_info.loc[ind].values

    act_labels = {'snp': 'SNV', 'ins': 'Insertion', 'del': 'Deletion'}
    if act in act_labels:
        return [gene, pos, act_labels[act], fr, to]

    if to == 'del':
        # fr is used as a number here (presumably the deletion length - confirm);
        # the reported bases are seq[pos-1 : pos+fr].
        seq = annotation[annotation['gene'] == gene]['seq'].to_numpy()[0]
        fr = seq[int(pos)-1: int(pos)+int(fr)]
        return [gene, pos, 'Deletion', fr, "-"]

    if pos == 'changed':
        # Drop the tokens that parse as integers, then treat every remaining
        # token but the last as the gene name; the rest of ``gene`` (starting
        # after that many tokens) is reported as the domain.
        temp = [i for i in gene.split('_') if not represents_int(i)]
        new_gene = "_".join(temp[:-1])
        domain = "_".join(gene.split('_')[len(new_gene.split('_')):])
        return [new_gene, '-', f'{domain} is changed', '-', '-']

    if pos == 'broken':
        return [gene, '-', 'Broken', '-', '-']

    if gene == 'structure':
        return [fr, '-', 'TreeBreaker clade', '-', '-']

    return None

## Add statistical info

In [36]:
# For every drug and split, extend the ABESS feature table (gic_<split>.csv)
# with the 2x2 contingency matrix and the Fisher exact test p-value of each
# selected feature, and write it out as fisher_gic_<split>.csv.

pos_columns = ['gene', 'pos', 'ind1', 'ind2', 'act']

for drug in tqdm(drugs):
    # Position table of split 0: selected features are mapped onto its rows,
    # and the test matrix read below is the split-0 one as well.
    pos_info = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.0", sep='\t',
                           index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
    pos_info.columns = pos_columns

    for i in range(5):
        gic_path = f"{output_abess_lr}/{drug}/gic_{i}.csv"
        # No ABESS output for this split: skip it before reading any input.
        if not os.path.exists(gic_path):
            continue

        pos_info_temp = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.{i}", sep='\t',
                                    index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
        pos_info_temp.columns = pos_columns

        # One line per feature, holding ', '-separated integers. rstrip keeps
        # the last number intact even if the file has no trailing newline.
        with open(f"{data_dir}/{drug}.gt.features.domains.{thr}.train.{i}", 'r') as f:
            features_info = [[int(y) for y in line.rstrip('\n').split(', ')] for line in f]

        temp = pd.read_csv(gic_path, sep='\t', index_col=0, header=0)
        # Row label of the gic table -> first integer on that line of the
        # features file -> feature name in split i -> row of the split-0 table.
        indeces = [features_info[int(x)][0] for x in temp.index]
        indeces = [get_ind_by_name(pos_info, get_name_by_ind(pos_info_temp, k, drug), drug)
                   for k in indeces]

        # Features missing from the split-0 table (-1) cannot be read from the matrix.
        sel_indeces = [k for k in indeces if k != -1]

        X = pd.read_csv(f"{data_dir_pre}/{drug}.gt.domains.3.test.0", sep=' ',
                        index_col=None, header=None, usecols=sel_indeces)
        y = pd.read_csv(f"{data_dir_pre}/{drug}.phen.domains.3.test.0", sep=' ',
                        index_col=None, header=None)
        X['y'] = y.to_numpy().ravel()

        pvalues = []
        values00 = []
        values01 = []
        values10 = []
        values11 = []

        for j in indeces:
            if j == -1:
                # Keep the row aligned with temp, but mark the statistics as unavailable.
                pvalues.append('-')
                value00, value01, value10, value11 = '-', '-', '-', '-'
            else:
                pair = X[[j, 'y']]
                pvalues.append(fisher(pair))
                value00, value01, value10, value11 = conj_matrix(pair)

            values00.append(value00)
            values01.append(value01)
            values10.append(value10)
            values11.append(value11)

        temp['0_0'] = values00
        temp['0_1'] = values01
        temp['1_0'] = values10
        temp['1_1'] = values11
        temp['pvalue'] = pvalues
        temp.to_csv(f"{output_abess_lr}/{drug}/fisher_gic_{i}.csv", sep='\t', header=True, index=True)

  0%|          | 0/13 [00:01<?, ?it/s]


## Feature table for ABESS (Table 4 and Table S11) and population structure table (TableS29)

In [7]:
result_columns = ['gene', 'pos', 'act', 'from', 'to', 'num',
                  'name', 'coef', 'pvalue', '0_0',
                  '0_1', '1_0', '1_1', 'count']


def _load_pos_info(drug, split):
    """Read the position table of one split; missing values become 'Nope'."""
    table = pd.read_csv(f"{data_dir_pre}/{drug}.gt.pos.domains.3.{split}", sep='\t',
                        index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
    table.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']
    table.fillna('Nope', inplace=True)
    return table


# For every drug: merge the five per-split feature tables, aggregate them per
# feature group and write one described row per feature name.
for drug in drugs:
    # Read the five per-split tables and stack them in a single concat.
    count_features = pd.concat(
        [pd.read_csv(f"{output_abess_lr}/{drug}/fisher_gic_{i}.csv",
                     sep='\t', index_col=0, header=0)
         for i in range(5)],
        axis=0)
    count_features['count'] = 1

    # Mean coefficient and number of splits per feature group; the statistics
    # are taken from the group's first row. The string aliases 'mean'/'sum'
    # are what pandas already substitutes for np.mean/np.sum.
    count_features = count_features.groupby(['ext_feature']).agg({'coef': 'mean',
                                                                  'count': 'sum',
                                                                  'pvalue': lambda x: x.iloc[0],
                                                                  '0_0': lambda x: x.iloc[0],
                                                                  '0_1': lambda x: x.iloc[0],
                                                                  '1_0': lambda x: x.iloc[0],
                                                                  '1_1': lambda x: x.iloc[0]})

    count_features.sort_values('count', ascending=False, inplace=True)

    pos_tables = {}  # split -> position table, read on first use only
    rows = []
    feature_values = count_features.to_numpy()
    for j, names in enumerate(count_features.index):
        coef, count, pvalue, value00, value01, value10, value11 = feature_values[j]
        # A group label may hold several space-separated feature names.
        for name in names.split(' '):
            # Take the first split whose position table knows this feature.
            for i in range(5):
                if i not in pos_tables:
                    pos_tables[i] = _load_pos_info(drug, i)
                pos_info = pos_tables[i]
                ind = get_ind_by_name(pos_info, name, drug)
                if ind != -1:
                    break
            else:
                raise KeyError(f"feature {name!r} of {drug} was not found "
                               f"in the position table of any split")

            rows.append([*get_description(pos_info, ind), j, name, coef, pvalue, value00,
                         value01, value10, value11, count])

    result = pd.DataFrame(rows, columns=result_columns)
    result.drop(['name'], axis=1, inplace=True)
    result['drug'] = drug
    result.to_csv(f"{final_output_abess}/{drug}.csv", sep='\t', index=False, header=True)

In [5]:
# TableS11
# Don't use this code block in the experiment with population structure
#
# Stacks the per-drug feature tables into one file. The empty leading frame
# fixes the column order ('drug' first); the file is written without a header
# row and without the index.
s11_columns = ['drug', 'gene', 'pos', 'act', 'from', 'to',
               'num', 'coef', 'pvalue', '0_0',
               '0_1', '1_0', '1_1', 'count']
frames = [pd.DataFrame(columns=s11_columns)]
for drug in drugs:
    per_drug = pd.read_csv(f"{final_output_abess}/{drug}.csv", sep='\t', index_col=None, header=0)
    per_drug['drug'] = drug
    frames.append(per_drug)
all_data = pd.concat(frames, axis=0)
all_data.to_csv(f"{final_output_abess}/TableS11.csv", sep='\t', header=None, index=None)

In [6]:
# Table 4
# If you want to get selected features table for matrix with TreeBreaker features
# Change name of written table to TableS29
# NOTE(review): the heading says "Table 4" but the file is named TableS4.csv - confirm.
table_name = 'TableS4.csv'

# Short form of the per-drug tables: only the descriptive columns and the
# count, stacked for all drugs and written without header and index.
table_columns = ['drug', 'gene', 'pos', 'act', 'from', 'to', 'count']
frames = [pd.DataFrame(columns=table_columns)]
for drug in drugs:
    per_drug = pd.read_csv(f"{final_output_abess}/{drug}.csv", sep='\t', index_col=None, header=0)
    per_drug['drug'] = drug
    frames.append(per_drug[table_columns])
all_data = pd.concat(frames, axis=0)
all_data.to_csv(f"{final_output_abess}/{table_name}", sep='\t', header=None, index=None)

## Abess metrics (TableS17)

In [44]:
# Keep only the mean metric columns of auc_result.csv and store them as TableS17.
metric_columns = ['mean_auc', 'mean_sensitivity', 'mean_specivity', 'mean_npv', 'mean_ppv']
metrics = pd.read_csv(f'{output_abess_lr}/auc_result.csv', index_col=0, header=0, sep='\t')
metrics = metrics[metric_columns]
metrics.to_csv(f'{final_output_abess}/TableS17.csv', sep='\t', index=True, header=True)

## Abess feature metrics

In [50]:
# Per drug: compare the genes of features selected in at least 3 splits with
# the resistance gene list and store the counts plus a Jaccard-style score.
abess_result = pd.DataFrame(index=drugs,
                            columns=['res_gen', 'jacard',
                                     'sel_true_gene', 'sel_false_gene'])
for drug in drugs:
    # The first column of the per-drug table (gene) becomes the index.
    features = pd.read_csv(f"{final_output_abess}/{drug}.csv", sep='\t', index_col=0, header=0)
    features = features[features['count'] >= 3]

    is_this_drug = res_data['drug'] == drug
    genes_this_drug = res_data[is_this_drug]['gene']
    genes_other_drugs = res_data[~is_this_drug]['gene']

    # Distinct gene tokens: everything before the first '_' of each index entry.
    unique_genes = list({entry.split('_')[0] for entry in features.index})
    sel_genes = pd.DataFrame(unique_genes, columns=['gene'])
    sel_genes['true'] = sel_genes['gene'].isin(genes_this_drug)
    sel_genes['temp'] = sel_genes['gene'].isin(genes_other_drugs)
    # "false": listed for another drug only, not for this one.
    sel_genes['false'] = sel_genes['temp'] & ~sel_genes['true']

    res_gene = len(genes_this_drug)
    sel_true_gene = sel_genes['true'].sum()
    sel_false_gene = sel_genes['false'].sum()
    # NOTE(review): the denominator uses the number of feature rows
    # (len(features)), not the number of distinct selected genes - confirm
    # this is the intended set size.
    union_size = res_gene + len(features) - sel_true_gene
    jacard = sel_true_gene/union_size

    abess_result.loc[drug] = [res_gene, jacard, sel_true_gene, sel_false_gene]
abess_result.to_csv(f"{final_output_abess}/abess_feature_metrics.csv", sep='\t', index=True, header=True)