# Notebook to build the aggregated result table for iterative ABESS (Table S31) and the feature tables (Table S32 and Table S33)

In [2]:
# Configuration part #

# Directory with the first-iteration matrices before deduplication.
# NOTE(review): left empty here; presumably it has to be set to a real path before running.
data_dir = ''
# Directory with the second-iteration matrices before deduplication.
data_dir_second_iter_pre = '../db/cv_hhs_files_iter_2'
# Directory with the second-iteration matrices after deduplication.
data_dir_second_iter = '../db/cv_hhs_files.2_iter_2'

# Output of the first run of make_db_next_hhs_iter.py
output_hhs_exp_iter_1 = './output_iterative_hhs/explained_iter_1'
# Output of the second run of make_db_next_hhs_iter.py
output_hhs_exp_iter_2 = './output_iterative_hhs/explained_iter_2'

# Output of hhs_lr.py run on the second-iteration matrices
output_hhs_lr_iter_2 = './output_hhs_lr_iter_2'

# Output directory for the generated Table S31, S32 and S33
result_dir = './result_iterative_hhs'

# Annotation file (parsed below as a headerless, tab-separated GFF table)
annotation_file = '../db/AL123456_rev.gff'

######################

In [3]:
import os

from copy import deepcopy

import pandas as pd
import numpy as np

from tqdm import tqdm

from scipy.stats import fisher_exact

In [13]:
# Drugs for which the tables are built; also used as the row index of Table S31.
drugs = [
    'Rifampicin',
    'Isoniazid',
    'Pyrazinamide',
    'Ethambutol',
    'Streptomycin',
    'Kanamycin',
    'Amikacin',
    'Capreomycin',
    'Ofloxacin',
    'Moxifloxacin',
    'Ciprofloxacin',
    'Ethionamide',
    'Prothionamide',
]

thr = '3'

# Load the annotation as a headerless tab-separated table and keep CDS records only.
annotation = pd.read_csv(annotation_file, sep='\t', index_col=None, header=None)
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the original frame (SettingWithCopyWarning).
annotation = annotation[annotation[2] == "CDS"].copy()
# gene: last whitespace-separated token of the third ';'-separated attribute.
annotation['gene'] = [x.split(';')[2].split()[-1] for x in annotation[8]]
# seq: second whitespace-separated token of the last attribute, first character dropped.
annotation['seq'] = [x.split(';')[-1].split()[1][1:] for x in annotation[8]]
annotation = annotation[['gene', 'seq']]

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(result_dir, exist_ok=True)

## Support functions

In [4]:
def represents_int(s):
    """Tell whether ``int(s)`` succeeds without raising ``ValueError``."""
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

In [5]:
def conj_matrix(X):
    """Count rows of ``X`` for each 0/1 combination of its first two columns.

    Returns the four counts as ``[n00, n01, n10, n11]``, where the first digit
    is the value in column 0 and the second digit the value in column 1.
    """
    first = X.iloc[:, 0]
    second = X.iloc[:, 1]
    return [
        len(X[(first == a) & (second == b)])
        for a in (0, 1)
        for b in (0, 1)
    ]

def fisher(X):
    """Return the p-value of Fisher's exact test for the first two columns of ``X``.

    The 2x2 table passed to ``fisher_exact`` is built from the counts returned
    by ``conj_matrix``: ``[[n00, n01], [n10, n11]]``.
    """
    value00, value01, value10, value11 = conj_matrix(X)
    # fisher_exact returns a pair; element [1] is the p-value.
    return fisher_exact([[value00, value01], [value10, value11]])[1]

In [6]:
def get_ind_by_name(pos_info, name, drug):
    """Find the index label of the ``pos_info`` row described by a feature name.

    The last ``_``-separated token of ``name`` selects the lookup:

    * ``<gene>_domain``            -> row with that gene and ``pos == 'changed'``
    * ``<gene>_broken``            -> row with that gene and ``pos == 'broken'``
    * ``<gene>_<pos>_<ind>_snp``   -> ``ind2 == ind`` and ``act == 'snp'``
    * ``<gene>_<pos>_<ind>_ins``   -> ``ind2 == ind`` and ``act == 'ins'``
    * ``<gene>_<pos>_<ind>_del``   -> if ``ind`` parses as an integer:
      ``ind1 == ind`` and ``ind2 == 'del'``; otherwise ``ind2 == ind`` and
      ``act == 'del'``

    Parameters
    ----------
    pos_info : DataFrame with columns ``gene``, ``pos``, ``ind1``, ``ind2``, ``act``.
    name : feature name in one of the formats above.
    drug : unused; kept for interface compatibility.

    Returns
    -------
    The index label of the first matching row, or ``-1`` when nothing matches
    or the name has an unrecognised suffix.
    """
    parts = name.split('_')
    kind = parts[-1]

    if kind in ('domain', 'broken'):
        gene = "_".join(parts[:-1])
        marker = 'changed' if kind == 'domain' else 'broken'
        mask = (pos_info['gene'] == gene) & (pos_info['pos'] == marker)
    elif kind in ('snp', 'del', 'ins') and len(parts) >= 3:
        gene = "_".join(parts[:-3])
        pos, ind = parts[-3], parts[-2]
        mask = (pos_info['gene'] == gene) & (pos_info['pos'] == pos)
        if kind == 'del' and represents_int(ind):
            # Deletion encoded by its length in ind1 with the literal 'del' in ind2.
            mask = mask & (pos_info['ind1'] == ind) & (pos_info['ind2'] == 'del')
        else:
            mask = mask & (pos_info['ind2'] == ind) & (pos_info['act'] == kind)
    else:
        # Unknown suffix: previously fell through with no selection bound and
        # crashed with UnboundLocalError; report "not found" like any other miss.
        return -1

    matches = pos_info.index[mask]
    if len(matches) == 0:
        return -1
    return matches[0]
    
def get_name_by_ind(pos_info, j, drug):
    """Build the feature name for the row at position ``j`` of ``pos_info``.

    The row is taken positionally (``iloc``). The checks run in this order:
    ``pos == 'changed'``, ``pos == 'broken'``, ``act == 'snp'``,
    ``ind2 == 'del'``, ``act == 'del'``, ``act == 'ins'``. ``None`` is returned
    when none of them applies. ``drug`` is not used.
    """
    row = pos_info.iloc[j]
    gene = row['gene']
    pos = row['pos']
    ind1 = row['ind1']
    ind2 = row['ind2']
    act = row['act']

    if pos == 'changed':
        return f"{gene}_domain"
    if pos == 'broken':
        return f"{gene}_broken"
    if act == 'snp':
        return f"{gene}_{pos}_{ind2}_snp"
    if ind2 == 'del':
        return f"{gene}_{pos}_{ind1}_{ind2}"
    if act == 'del' or act == 'ins':
        return f"{gene}_{pos}_{ind2}_{act}"
    return None
    
def get_description(pos_info, ind):
    """Describe the ``pos_info`` row labelled ``ind`` as a 5-item list.

    The row must hold exactly five values, unpacked as
    ``gene, pos, fr, to, act``. The result is
    ``[gene, pos, kind, from, to]`` where ``kind`` is ``'SNV'``,
    ``'Insertion'``, ``'Deletion'``, ``'<domain> is changed'`` or ``'Broken'``.
    ``None`` is returned when no case applies.

    The ``to == 'del'`` case reads the module-level ``annotation`` table to
    recover the deleted sequence.
    """
    gene, pos, fr, to, act = pos_info.loc[ind].values
    if act == 'snp':
        return [gene, pos, 'SNV', fr, to]
    elif act == 'ins':
        return [gene, pos, 'Insertion', fr, to]
    elif act == 'del':
        return [gene, pos, 'Deletion', fr, to]
    elif to == 'del':
        # Here `fr` holds a number; the deleted bases are sliced from the gene sequence.
        # NOTE(review): the slice spans int(fr) + 1 characters - confirm this is intended.
        seq = annotation[annotation['gene'] == gene]['seq'].to_numpy()[0]
        fr = seq[int(pos)-1: int(pos)+int(fr)]
        return [gene, pos, 'Deletion', fr, "-"]
    elif pos == 'changed':
        # Split the combined identifier: drop integer tokens, then everything but
        # the last remaining token is the gene; the rest of the original string
        # (from that token count onward) is the domain.
        temp = [i for i in gene.split('_') if not represents_int(i)]
        new_gene = "_".join(temp[:-1])
        domain = "_".join(gene.split('_')[len(new_gene.split('_')):])
        return [new_gene, '-', f'{domain} is changed', '-', '-']
    elif pos == 'broken':
        return [gene, '-', 'Broken', '-', '-']
    return None

## Info about result of iterative ABESS (Table S31)

In [7]:
# Per-drug summary of the iterative runs; counts are averaged over the 5 folds.
result = pd.DataFrame(index=drugs, columns=['N_res_samples', 'N_samples',
                                            'per_unexp_iter_1', 'N_unexp_iter_1',
                                            'N_samples_iter_2',
                                            'per_unexp_iter_2', 'N_unexp_iter_2'])
n_folds = 5
for drug in drugs:

    N_samples = 0
    N_res_samples = 0
    N_samples_iter_2 = 0

    for ind in range(n_folds):
        # First-iteration phenotypes live in `data_dir` (the configuration cell
        # defines no `data_dir_pre`, which this cell used to reference).
        pheno = pd.read_csv(f'{data_dir}/{drug}.phen.domains.3.train.{ind}', sep=' ',
                            index_col=None, header=None)
        pheno_iter_2 = pd.read_csv(f'{data_dir_second_iter_pre}/{drug}.phen.domains.3.train.{ind}',
                                   sep=' ', index_col=None, header=None)
        N_samples += len(pheno)
        N_samples_iter_2 += len(pheno_iter_2)
        # Sum of the first phenotype column (presumably 1 marks a resistant sample).
        N_res_samples += np.sum(pheno[0])
    N_samples /= n_folds
    N_res_samples /= n_folds
    N_samples_iter_2 /= n_folds

    temp_1 = pd.read_csv(f"{output_hhs_exp_iter_1}/{drug}.csv", sep='\t',
                         index_col=None, header=0, dtype=int)
    temp_2 = pd.read_csv(f"{output_hhs_exp_iter_2}/{drug}.csv", sep='\t',
                         index_col=None, header=0, dtype=int)

    # Mean of the 'new' column of each explained-samples file.
    N_unexp_iter_1 = np.mean(temp_1['new'])
    N_unexp_iter_2 = np.mean(temp_2['new'])
    # NOTE(review): the 'per_unexp_*' columns store 1 - N_unexp / N_res_samples;
    # confirm the column name matches the intended quantity.
    result.loc[drug] = [N_res_samples, N_samples,
                        1 - N_unexp_iter_1 / N_res_samples, N_unexp_iter_1,
                        N_samples_iter_2,
                        1 - N_unexp_iter_2 / N_res_samples, N_unexp_iter_2]
result.to_csv(f'{result_dir}/TableS31.csv', sep='\t', index=True, header=True)

## Add statistical info

In [1]:
# Collect the AUC tables of the three filter settings and tag each row with its filter.
auc_frames = []
for filter_value in (1, 3, 5):
    auc_frame = pd.read_csv(f"{output_hhs_lr_iter_2}/{filter_value}/auc.csv",
                            sep='\t', index_col=0, header=0)
    auc_frame['filter'] = filter_value
    auc_frames.append(auc_frame)
hhs_auc = pd.concat(auc_frames, axis=0)

# Keep, per index label, the row(s) with the highest AUC. The string 'max' is
# used because passing the builtin `max` to transform is deprecated in pandas.
hhs_auc = hhs_auc[hhs_auc.groupby(level=0)['auc'].transform('max') == hhs_auc['auc']]
# If several filters tie, keep the first one (lowest filter value) so that
# `.loc[drug, 'filter']` is a scalar and not a Series.
hhs_auc = hhs_auc[~hhs_auc.index.duplicated(keep='first')]

for drug in tqdm(drugs):
    f = hhs_auc.loc[drug, 'filter']

    # Reference position table (fold 0): feature indices of every fold are
    # translated to this table so they can address columns of the fold-0 test matrix.
    pos_info = pd.read_csv(f"{data_dir_second_iter_pre}/{drug}.gt.pos.domains.3.0", sep='\t',
                           index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
    pos_info.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']

    # The fold-0 test phenotypes do not depend on the fold loop; read them once.
    y_test = pd.read_csv(f"{data_dir_second_iter_pre}/{drug}.phen.domains.3.test.0", sep=' ',
                         index_col=None, header=None)

    for i in range(5):
        pos_info_temp = pd.read_csv(f"{data_dir_second_iter_pre}/{drug}.gt.pos.domains.3.{i}", sep='\t',
                                    index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
        pos_info_temp.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']

        temp = pd.read_csv(f"{output_hhs_lr_iter_2}/{f}/{drug}.gt.domains.3.final.{i}",
                           sep='\t', index_col=0, header=0)
        # Each index entry may list several indices separated by ', '; use the first.
        fold_indeces = [int(str(x).split(', ')[0]) for x in temp.index]
        # Fold-i index -> feature name -> fold-0 index (-1 when absent in fold 0).
        indeces = [get_ind_by_name(pos_info, get_name_by_ind(pos_info_temp, k, drug), drug)
                   for k in fold_indeces]

        sel_indeces = [k for k in indeces if k != -1]

        X = pd.read_csv(f"{data_dir_second_iter_pre}/{drug}.gt.domains.3.test.0", sep=' ',
                        index_col=None, header=None, usecols=sel_indeces)
        X['y'] = y_test.to_numpy()

        pvalues = []
        values00 = []
        values01 = []
        values10 = []
        values11 = []

        for j in indeces:
            if j == -1:
                # Feature not present in fold 0: no statistics available.
                pvalues.append('-')
                [value00, value01, value10, value11] = ['-', '-', '-', '-']
            else:
                pvalues.append(fisher(X[[j, 'y']]))
                [value00, value01, value10, value11] = conj_matrix(X[[j, 'y']])
            values00.append(value00)
            values01.append(value01)
            values10.append(value10)
            values11.append(value11)
        temp['0_0'] = values00
        temp['0_1'] = values01
        temp['1_0'] = values10
        temp['1_1'] = values11
        temp['pvalue'] = pvalues
        # The feature file is rewritten in place with the added statistics columns.
        temp.to_csv(f"{output_hhs_lr_iter_2}/{f}/{drug}.gt.domains.3.final.{i}",
                    sep='\t', header=True, index=True)
        

NameError: name 'pd' is not defined

## Feature tables (Table S32 and Table S33)

In [25]:
# Collect the AUC tables of the three filter settings and tag each row with its filter.
auc_frames = []
for filter_value in (1, 3, 5):
    auc_frame = pd.read_csv(f"{output_hhs_lr_iter_2}/{filter_value}/auc.csv",
                            sep='\t', index_col=0, header=0)
    auc_frame['filter'] = filter_value
    auc_frames.append(auc_frame)
hhs_auc = pd.concat(auc_frames, axis=0)

# Keep, per index label, the row(s) with the highest AUC. The string 'max' is
# used because passing the builtin `max` to transform is deprecated in pandas.
hhs_auc = hhs_auc[hhs_auc.groupby(level=0)['auc'].transform('max') == hhs_auc['auc']]
# If several filters tie, keep the first one (lowest filter value) so that
# `.loc[drug, 'filter']` is a scalar and not a Series.
hhs_auc = hhs_auc[~hhs_auc.index.duplicated(keep='first')]

result_columns = ['gene', 'pos', 'act', 'from', 'to', 'num',
                  'name', 'coef', 'pvalue', '0_0',
                  '0_1', '1_0', '1_1', 'count']

for drug in drugs:
    filt = hhs_auc.loc[drug, 'filter']

    # Stack the selected-feature tables of the 5 folds in one concat call
    # (no empty seed frame, whose handling by concat is deprecated).
    fold_tables = [
        pd.read_csv(f"{output_hhs_lr_iter_2}/{filt}/{drug}.gt.domains.3.final.{i}",
                    sep='\t', index_col=0, header=0)
        for i in range(5)
    ]
    count_features = pd.concat(fold_tables, axis=0)
    count_features['count'] = 1

    # Per feature name: mean coefficient, number of folds it appears in, and the
    # statistics of its first occurrence. 'mean'/'sum' replace np.mean/np.sum,
    # whose use as agg callables is deprecated in pandas.
    count_features = count_features.groupby(['name']).agg({'coef': 'mean',
                                                           'count': 'sum',
                                                           'pvalue': lambda x: x.iloc[0],
                                                           '0_0': lambda x: x.iloc[0],
                                                           '0_1': lambda x: x.iloc[0],
                                                           '1_0': lambda x: x.iloc[0],
                                                           '1_1': lambda x: x.iloc[0]})

    count_features.sort_values('count', ascending=False, inplace=True)

    # The position tables do not depend on the feature; load all 5 folds once
    # per drug instead of once per feature name.
    pos_infos = []
    for i in range(5):
        pos_info = pd.read_csv(f"{data_dir_second_iter_pre}/{drug}.gt.pos.domains.3.{i}", sep='\t',
                               index_col=None, header=None, usecols=[0, 1, 2, 3, 4])
        pos_info.columns = ['gene', 'pos', 'ind1', 'ind2', 'act']
        pos_info.fillna('Nope', inplace=True)
        pos_infos.append(pos_info)

    # Convert once; column order follows the agg dict above.
    feature_values = count_features.to_numpy()
    rows = []
    for j, names in enumerate(count_features.index):
        coef, count, pvalue, value00, value01, value10, value11 = feature_values[j]
        # A group name may hold several feature names separated by ', '.
        for name in names.split(', '):
            # Use the first fold whose position table contains the feature.
            for pos_info in pos_infos:
                ind = get_ind_by_name(pos_info, name, drug)
                if ind != -1:
                    break
            else:
                raise KeyError(f"feature {name!r} of {drug} not found in any fold position table")

            rows.append([*get_description(pos_info, ind), j, name, coef, pvalue, value00,
                         value01, value10, value11, count])

    result = pd.DataFrame(rows, columns=result_columns)
    result.drop(['name'], axis=1, inplace=True)
    result['drug'] = drug
    result.to_csv(f"{result_dir}/{drug}.csv", sep='\t', index=False, header=True)
    

In [19]:
# TableS32
# Table S32: per-drug features selected in at least 3 of the folds.
s32_columns = ['drug', 'gene', 'pos', 'act', 'from', 'to', 'num', 'count']
s32_frames = []
for drug in drugs:
    result = pd.read_csv(f"{result_dir}/{drug}.csv", sep='\t', index_col=None, header=0)
    result['drug'] = drug
    result = result[s32_columns]
    result = result[result['count'] >= 3]
    # Skip empty frames: how concat treats empty inputs when choosing dtypes is
    # deprecated behaviour, and they contribute no rows anyway.
    if not result.empty:
        s32_frames.append(result)
if s32_frames:
    all_data = pd.concat(s32_frames, axis=0)
else:
    all_data = pd.DataFrame(columns=s32_columns)
# Written without header row and without index column.
all_data.to_csv(f"{result_dir}/TableS32.csv", sep='\t', header=False, index=False)

In [26]:
# TableS33
all_data = pd.DataFrame(columns=['drug', 'gene', 'pos', 'act', 'from', 'to', 
                                       'num', 'coef', 'pvalue', '0_0', 
                                       '0_1', '1_0', '1_1', 'count'])
for drug in drugs:
    result = pd.read_csv(f"{result_dir}/{drug}.csv", sep='\t', index_col=None, header=0)
    result['drug'] = drug
    all_data = pd.concat([all_data, result], axis=0)
all_data.to_csv(f"{result_dir}/TableS33.csv", sep='\t', header=None, index=None)