In [1]:
import os
# Switch the working directory to the sp CAT input folder.
# NOTE(review): every path visible in this notebook is absolute, so this presumably only
# matters for relative paths used elsewhere — TODO confirm it is still needed.
os.chdir("/home/is6/SolidBin/CAT_taxonomic_classification/sp/input_sp")

In [2]:
import typing as t 
from glob import glob

import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

%matplotlib inline



In [3]:
# Tab-separated annotation table; it is split by its `sample` column below.
gh_path = '/home/is6/glyco/annotations.tsv'

# One sub-table per sample (`sample` is either 'chz' or 'sp').
# iter() is required: a GroupBy object exposes a `.keys` attribute, so dict() must be
# handed the (key, group) pairs explicitly.
gh_matches = dict(iter(pd.read_csv(gh_path, sep='\t').groupby('sample')))
gh_matches.keys()

dict_keys(['chz', 'sp'])

In [4]:
annotation_root = '/home/is6/annotation'


def _load_annotation(sample):
    """Parse all records of the GenBank file stored for `sample`.

    The file is looked up as ``{annotation_root}/{sample}/*.gbk``. When several files
    match, the lexicographically first one is used so the choice is reproducible
    (glob() itself returns matches in arbitrary order).

    Raises:
        FileNotFoundError: if no ``.gbk`` file exists for the sample.
    """
    gbk_paths = sorted(glob(f'{annotation_root}/{sample}/*.gbk'))
    if not gbk_paths:
        raise FileNotFoundError(
            f'no .gbk file found in {annotation_root}/{sample}'
        )
    return list(SeqIO.parse(gbk_paths[0], 'gb'))


# sample -> list of parsed GenBank records
annotations = {sample: _load_annotation(sample) for sample in gh_matches}

# sample -> set of values found in the `query` column of that sample's match table
gh_loci = {
    sample: set(grp['query']) for sample, grp in gh_matches.items()
}

In [5]:
def extract_cds_loci(contig: "SeqRecord") -> t.Set[str]:
    """Return the set of locus tags carried by the CDS features of `contig`.

    Args:
        contig: a record exposing ``.features``; each feature must provide ``.type``
            and a dict-like ``.qualifiers``.

    Returns:
        The first ``locus_tag`` value of every feature whose type is ``'CDS'``.
        CDS features with a missing or empty ``locus_tag`` qualifier are skipped
        rather than raising ``KeyError``/``IndexError``: a feature without a tag
        cannot match any known locus anyway.
    """
    loci = set()
    for feat in contig.features:
        if feat.type != 'CDS':
            continue
        tags = feat.qualifiers.get('locus_tag')
        if tags:
            loci.add(tags[0])
    return loci


# For every sample, record id and length of each contig whose CDS locus tags
# intersect the known GH loci of that sample.
contig_features = {
    sample: pd.DataFrame.from_records(
        [
            {'label': contig.id,
             'length': len(contig)}
            for contig in contigs
            if extract_cds_loci(contig) & gh_loci[sample]
        ],
        # Explicit columns keep `.label` / `.length` available even when no contig
        # of a sample matches (an empty record list otherwise yields a column-less frame).
        columns=['label', 'length'],
    )
    for sample, contigs in annotations.items()
}

In [6]:
# Labels of the chz contigs kept above, i.e. contigs with at least one CDS locus tag
# present among the chz GH loci.
print(contig_features['chz'].label)

0     contig_1004
1     contig_1027
2     contig_1106
3     contig_1128
4     contig_1167
5     contig_1168
6     contig_1180
7     contig_1198
8     contig_1229
9     contig_1238
10    contig_1311
11    contig_1332
12    contig_1381
13    contig_1387
14    contig_1431
15    contig_1434
16     contig_144
17    contig_1457
18     contig_146
19    contig_1460
20    contig_1464
21    contig_1500
22     contig_156
23    contig_1645
24    contig_1673
25    contig_1681
26    contig_1693
27    contig_1723
28    contig_1764
29    contig_1903
30    contig_1916
31    contig_1941
32    contig_1949
33    contig_1951
34       contig_2
35     contig_200
36    contig_2012
37    contig_2094
38    contig_2098
39      contig_22
40    contig_2251
41    contig_2253
42    contig_2327
43    contig_2353
44    contig_2383
45    contig_2418
46    contig_2426
47    contig_2492
48    contig_2535
49    contig_2627
50    contig_2891
51     contig_293
52    contig_3070
53     contig_308
Name: label, dtype: object


In [7]:

# Per-contig taxonomy table for the sp sample (tab-separated).
# read_csv(sep='\t') replaces read_table: it matches how the annotation table is loaded
# earlier in this notebook and avoids the deprecation warning read_table emits on
# pandas 0.24.
annotation_file_sp = pd.read_csv(
    "/home/is6/SolidBin/CAT_taxonomic_classification/sp/contigs_taxonomy_noscores.tsv",
    sep='\t',
)

  """Entry point for launching an IPython kernel.


In [8]:
# Single-column ('label') frame holding the sp contigs kept in `contig_features`.
GH_contigs = contig_features['sp'][['label']]

In [9]:
# Preview the first rows of the sp contig label table.
GH_contigs.head()

Unnamed: 0,label
0,contig_1014
1,contig_1089
2,contig_1109
3,contig_1147
4,contig_1149


In [10]:
# Preview the sp taxonomy table; the contig identifier column is named '# contig' here.
annotation_file_sp.head()


Unnamed: 0,# contig,classification,number of ORFs on contig,number of ORFs classification is based on,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species
0,contig_1,classified,27.0,14.0,1;131567;2;1224,1.00;0.83;0.83;0.67,Bacteria,Proteobacteria,not classified,not classified,not classified,not classified,not classified
1,contig_10,classified,38.0,14.0,1;131567;2,1.00;1.00;0.95,Bacteria,not classified,not classified,not classified,not classified,not classified,not classified
2,contig_100,classified,31.0,25.0,1;131567;2;1224,1.00;0.93;0.91;0.71,Bacteria,Proteobacteria,not classified,not classified,not classified,not classified,not classified
3,contig_1000,classified,48.0,21.0,1;131567;2,1.00;1.00;0.96,Bacteria,not classified,not classified,not classified,not classified,not classified,not classified
4,contig_1001,classified,37.0,33.0,1;131567;2,1.00;1.00;0.94,Bacteria,not classified,not classified,not classified,not classified,not classified,not classified


In [11]:
# Give the contig id column the same name ('label') as in the GH contig table so the
# two frames can be joined on it.
annotation_file_sp = annotation_file_sp.rename(columns={'# contig': 'label'})

In [12]:


# Join on 'label' (pandas' default inner join): only contigs present in both the
# GH contig table and the sp taxonomy table are kept.
new = pd.merge(
    left=GH_contigs,
    right=annotation_file_sp,
    on="label",
)

In [13]:
# Preview the merged sp table (contig labels joined with their taxonomy columns).
new.head()

Unnamed: 0,label,classification,number of ORFs on contig,number of ORFs classification is based on,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species
0,contig_1014,classified,47.0,27.0,1;131567;2;1224,1.00;1.00;1.00;0.57,Bacteria,Proteobacteria,not classified,not classified,not classified,not classified,not classified
1,contig_1089,classified,40.0,23.0,1;131567;2;1224;28211;356;45401,1.00;1.00;0.92;0.92;0.76;0.61;0.57,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Hyphomicrobiaceae,not classified,not classified
2,contig_1109,classified,26.0,15.0,1;131567;2759,1.00;1.00;0.58,Eukaryota,not classified,not classified,not classified,not classified,not classified,not classified
3,contig_1147,classified,36.0,24.0,1;131567;2,1.00;1.00;0.96,Bacteria,not classified,not classified,not classified,not classified,not classified,not classified
4,contig_1149,classified,52.0,34.0,1;131567;2;1783272;201174,1.00;1.00;0.99;0.81;0.77,Bacteria,Actinobacteria,not classified,not classified,not classified,not classified,not classified


In [14]:
# Write the merged sp table as comma-separated text (the to_csv default) without the
# index column. Note that the target file name carries no extension.
new.to_csv('/home/is6/sp_annotation', index=False)

In [15]:
# Shell check: show the first lines of the sp CSV file.
!head /home/is6/sp_annotation


label,classification,number of ORFs on contig,number of ORFs classification is based on,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species
contig_1014,classified,47.0,27.0,1;131567;2;1224,1.00;1.00;1.00;0.57,Bacteria,Proteobacteria,not classified,not classified,not classified,not classified,not classified
contig_1089,classified,40.0,23.0,1;131567;2;1224;28211;356;45401,1.00;1.00;0.92;0.92;0.76;0.61;0.57,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Hyphomicrobiaceae,not classified,not classified
contig_1109,classified,26.0,15.0,1;131567;2759,1.00;1.00;0.58,Eukaryota,not classified,not classified,not classified,not classified,not classified,not classified
contig_1147,classified,36.0,24.0,1;131567;2,1.00;1.00;0.96,Bacteria,not classified,not classified,not classified,not classified,not classified,not classified
contig_1149,classified,52.0,34.0,1;131567;2;1783272;201174,1.00;1.00;0.99;0.81;0.77,Bacteria,Actinobacteria,not classified,not classified,n

# repeating steps for chz sample

In [16]:
# Per-contig taxonomy table for the chz sample (tab-separated).
# read_csv(sep='\t') replaces read_table: it matches how the annotation table is loaded
# earlier in this notebook and avoids the deprecation warning read_table emits on
# pandas 0.24.
annotation_file_chz = pd.read_csv(
    "/home/is6/SolidBin/CAT_taxonomic_classification/chz/contigs_taxonomy_noscores.tsv",
    sep='\t',
)

  """Entry point for launching an IPython kernel.


In [18]:
# Single-column ('label') frame holding the chz contigs kept in `contig_features`.
# NOTE: this rebinds `GH_contigs`, which previously held the sp contigs.
GH_contigs = contig_features['chz'][['label']]

In [19]:
# Give the contig id column the same name ('label') as in the GH contig table so the
# two frames can be joined on it.
annotation_file_chz = annotation_file_chz.rename(columns={'# contig': 'label'})

In [20]:
# Join on 'label' (pandas' default inner join): only contigs present in both the
# GH contig table and the chz taxonomy table are kept.
new_chz = pd.merge(
    left=GH_contigs,
    right=annotation_file_chz,
    on="label",
)

In [21]:
# Write the merged chz table as comma-separated text (the to_csv default) without the
# index column. Note that the target file name carries no extension.
new_chz.to_csv('/home/is6/chz_annotation', index=False)

In [22]:
# Shell check: show the first lines of the chz CSV file.
!head /home/is6/chz_annotation


label,classification,number of ORFs on contig,number of ORFs classification is based on,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species
contig_1004,classified,46.0,31.0,1;131567;2;1224;28211,1.00;1.00;0.97;0.75;0.68,Bacteria,Proteobacteria,Alphaproteobacteria,not classified,not classified,not classified,not classified
contig_1027,classified,41.0,12.0,1;131567;2;1783257;74201,1.00;1.00;1.00;0.69;0.69,Bacteria,Verrucomicrobia,not classified,not classified,not classified,not classified,not classified
contig_1106,classified,92.0,71.0,1;131567;2;1224;28211,1.00;0.99;0.93;0.77;0.57,Bacteria,Proteobacteria,Alphaproteobacteria,not classified,not classified,not classified,not classified
contig_1128,classified,47.0,37.0,1;131567;2,1.00;1.00;0.96,Bacteria,not classified,not classified,not classified,not classified,not classified,not classified
contig_1167,classified,29.0,25.0,1;131567;2;1224;28211,1.00;1.00;1.00;0.64;0.64,Bacteria,Proteobacteria,Alphaproteobacteri