In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
#reading alignment table
data = pd.read_csv("/home/is6/ASVs_comparison/Blast_results/alignments.outfmt6", sep="\t", header=None)

In [3]:
# necessary functions
# explode a dataframe column
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res



In [4]:
#Set column name to the table
data.columns = ['query_id', 'subject_id', 'query_identity', 'alignment_length', 'mismatch', 'gap_open', 'qstart', 'qend', 'sstart','send', 'e_value', 'bitscore']

In [5]:
data.head()


Unnamed: 0,query_id,subject_id,query_identity,alignment_length,mismatch,gap_open,qstart,qend,sstart,send,e_value,bitscore
0,seq3,GCA.28_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396
1,seq3,GCA.27_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396
2,seq3,GCA.26_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396
3,seq3,GCA.25_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396
4,seq3,GCA.24_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396


In [6]:
# reading taxa files as dataframes
sxt_df = pd.read_csv("/home/is6/16s_preprocessing/ksenia/taxonomy.tsv", sep='\t', header=None)

sxt_df.columns = ['query_id', 'root', 'kingdom', 'phylum', 'order', 'class', 'family', 'genus', 'species']

In [7]:
sxt_df.head()

Unnamed: 0,query_id,root,kingdom,phylum,order,class,family,genus,species
0,seq1,Root,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Bacillus,unclassified_Bacillus
1,seq2,Root,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Bacillus,unclassified_Bacillus
2,seq3,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,unclassified_Chitinophagaceae,
3,seq4,Root,Bacteria,Bacteroidota,Bacteroidia,Cytophagales,Microscillaceae,unclassified_Microscillaceae,
4,seq5,Root,Bacteria,Proteobacteria,Alphaproteobacteria,Azospirillales,Inquilinaceae,Inquilinus,unclassified_Inquilinus


In [8]:
# merge 16S dataframe on 'seq' to blast results dataframe to get annotations
j_data = pd.merge(data, sxt_df, on="query_id")
j_data.head()

Unnamed: 0,query_id,subject_id,query_identity,alignment_length,mismatch,gap_open,qstart,qend,sstart,send,e_value,bitscore,root,kingdom,phylum,order,class,family,genus,species
0,seq3,GCA.28_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,unclassified_Chitinophagaceae,
1,seq3,GCA.27_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,unclassified_Chitinophagaceae,
2,seq3,GCA.26_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,unclassified_Chitinophagaceae,
3,seq3,GCA.25_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,unclassified_Chitinophagaceae,
4,seq3,GCA.24_010093065.1,94.862,253,13,0,1,253,527,779,1.61e-112,396,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,unclassified_Chitinophagaceae,


In [9]:
#filtering cover 100% of ASV sequences and 100 identity to reference sequences
N_data = j_data[(j_data[['query_identity']] >= 100).all(axis=1)]
N_data = N_data[(N_data[['mismatch']] == 0).all(axis=1)]

In [10]:
len(N_data)

28

In [11]:
# seperating sseqid column into gca and s-id, then renaming GCAxx-
# format into just GCA and combining the columns back together
# to get the original id as nsseqid column

N_data[['gca', 's_id']] = N_data["subject_id"].str.split("_", n = 1, expand = True) 
N_data['gca'] = 'GCA'
N_data['nsseqid'] = N_data[['gca', 's_id']].agg('_'.join, axis=1)

N_data = N_data.drop('gca', 1)
N_data = N_data.drop('s_id', 1)
N_data.head()

Unnamed: 0,query_id,subject_id,query_identity,alignment_length,mismatch,gap_open,qstart,qend,sstart,send,...,bitscore,root,kingdom,phylum,order,class,family,genus,species,nsseqid
203,seq15,GCA.21_000024005.1,100.0,253,0,0,1,253,520,772,...,468,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,unclassified_Chitinophaga,GCA_000024005.1
204,seq15,GCA.20_000024005.1,100.0,253,0,0,1,253,520,772,...,468,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,unclassified_Chitinophaga,GCA_000024005.1
205,seq15,GCA.19_000024005.1,100.0,253,0,0,1,253,520,772,...,468,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,unclassified_Chitinophaga,GCA_000024005.1
206,seq15,GCA.18_000024005.1,100.0,253,0,0,1,253,520,772,...,468,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,unclassified_Chitinophaga,GCA_000024005.1
207,seq15,GCA.17_000024005.1,100.0,253,0,0,1,253,520,772,...,468,Root,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Chitinophaga,unclassified_Chitinophaga,GCA_000024005.1


In [12]:
# getting all sample annotations as a dataframe
dir_path = "/home/is6/ASVs_comparison/samples"
dir_name = []
file_name = []
for path, dirs,files in os.walk(dir_path):
    dir_name.append(dirs)
    file_name.append(files)
    
new_file_list = list(filter(None,file_name))
new_dir_list = list(dir_name[0])
all_samples = list(zip(new_dir_list,new_file_list))

sample_df = pd.DataFrame(all_samples)
sample_df.columns = ["genera", "samples"]

sample_df['samples'] = sample_df['samples'].astype(str)
new_df = pd.DataFrame(explode(sample_df.assign(samples=sample_df.samples.str.split(',')), 'samples'))

new_df.samples = new_df.samples.str.replace("'", '')
new_df.samples = new_df.samples.str.replace("[", '')
new_df.samples = new_df.samples.str.replace("]", '')
new_df.samples = new_df.samples.str.replace(" ", '')

new_df.samples = new_df["samples"].str.rsplit("_", n = 2, expand = True)
new_df.columns = ['subject_taxonomy', 'nsseqid']

new_df.head()

Unnamed: 0,subject_taxonomy,nsseqid
0,Herbidospora,GCA_005233835.1
1,Herbidospora,GCA_009901565.1
2,Herbidospora,GCA_001570525.1
3,Herbidospora,GCA_001570605.1
4,Herbidospora,GCA_001570585.1


In [13]:
# merge final dataframe on 'sseqid' to 16s_extracted dataframe to get species name
final_data = pd.merge(new_df,N_data, on="nsseqid")
final_data.head()

Unnamed: 0,subject_taxonomy,nsseqid,query_id,subject_id,query_identity,alignment_length,mismatch,gap_open,qstart,qend,...,e_value,bitscore,root,kingdom,phylum,order,class,family,genus,species
0,Herbidospora,GCA_005233835.1,seq62,GCA.32_005233835.1,100.0,253,0,0,1,253,...,3.36e-134,468,Root,Bacteria,Actinobacteriota,Actinobacteria,Streptosporangiales,Streptosporangiaceae,unclassified_Streptosporangiaceae,
1,Sorangium,GCA_004135755.1,seq32,GCA.52_004135755.1,100.0,253,0,0,1,253,...,3.36e-134,468,Root,Bacteria,Myxococcota,Polyangia,Polyangiales,Polyangiaceae,Sorangium,unclassified_Sorangium
2,Sorangium,GCA_004135755.1,seq32,GCA.51_004135755.1,100.0,253,0,0,1,253,...,3.36e-134,468,Root,Bacteria,Myxococcota,Polyangia,Polyangiales,Polyangiaceae,Sorangium,unclassified_Sorangium
3,Sorangium,GCA_004135755.1,seq32,GCA.50_004135755.1,100.0,253,0,0,1,253,...,3.36e-134,468,Root,Bacteria,Myxococcota,Polyangia,Polyangiales,Polyangiaceae,Sorangium,unclassified_Sorangium
4,Sorangium,GCA_004135755.1,seq32,GCA.49_004135755.1,100.0,253,0,0,1,253,...,3.36e-134,468,Root,Bacteria,Myxococcota,Polyangia,Polyangiales,Polyangiaceae,Sorangium,unclassified_Sorangium


In [14]:
len(final_data)

28

In [16]:
#query_id subject_id query_coverage query_identity subject_taxonomy

final_data = final_data[['query_id','nsseqid', 'mismatch', 'query_identity','subject_taxonomy']]
final_data.head()

Unnamed: 0,query_id,nsseqid,mismatch,query_identity,subject_taxonomy
0,seq62,GCA_005233835.1,0,100.0,Herbidospora
1,seq32,GCA_004135755.1,0,100.0,Sorangium
2,seq32,GCA_004135755.1,0,100.0,Sorangium
3,seq32,GCA_004135755.1,0,100.0,Sorangium
4,seq32,GCA_004135755.1,0,100.0,Sorangium


In [25]:
final_data = final_data.rename(columns={'mismatch':'mismatch/query_coverage','nsseqid':'subject_id'})
final_data.head()

Unnamed: 0,query_id,subject_id,mismatch/query_coverage,query_identity,subject_taxonomy
0,seq62,GCA_005233835.1,0,100.0,Herbidospora
1,seq32,GCA_004135755.1,0,100.0,Sorangium
2,seq32,GCA_004135755.1,0,100.0,Sorangium
3,seq32,GCA_004135755.1,0,100.0,Sorangium
4,seq32,GCA_004135755.1,0,100.0,Sorangium


In [26]:
final_data.to_csv ("/home/is6/ASVs_comparison/16s_database/BLAST_results.csv", index = False)