### Clustering using PseAAC features

##### <u> Part 1: Extracting protein sequence data </u>

In [15]:
import re

class CAZy_data:
    """In-memory view of a CAZy annotation table plus its companion FASTA file.

    Attributes:
        data: one list per line of ``filename1``, produced by splitting on '$'.
              Column 1 is queried as the EC field and the last column as the
              GH-family field (see ``EC_GH``).
        acc:  FASTA header lines (those starting with '>') of ``filename2``.
        seq:  every non-header line of ``filename2``.

    NOTE(review): ``acc`` and ``seq`` are paired by position, which only holds
    when each record stores its sequence on a single line - confirm for the
    input file.
    """

    def __init__(self,filename1,filename2):
        """Read the '$'-separated table (filename1) and the FASTA file (filename2)."""
        self.data,self.acc,self.seq=[],[],[]
        with open(filename1,'r',encoding='utf-8') as inpt:
            for each in inpt:
                self.data.append(each.rstrip().split('$'))
        with open(filename2,'r',encoding='utf-8') as inpt1:
            for each1 in inpt1:
                if each1.startswith('>'):
                    self.acc.append(each1.rstrip())
                else:
                    self.seq.append(each1.rstrip())

    def data_fetch(self,typ,position):
        """Return the table rows whose column ``position`` equals ``typ``.

        ``typ == 'all'`` returns the full table (the same list object as
        ``self.data``). Otherwise a row is kept only when the column holds
        exactly one space-separated entry and that entry equals ``typ``, so
        rows with several entries (e.g. multiple EC numbers) or with an empty
        field are ignored.
        """
        if typ=='all':
            return self.data
        selected=[]
        for row in self.data:
            entries=row[position].split(' ')
            if len(entries)==1 and entries[0]==typ:
                selected.append(row)
        return selected

    def EC_GH(self,ec_no,gh_fam):
        """Collect FASTA records for the proteins matching an EC number and a GH family.

        Args:
            ec_no:  value matched against column 1, or 'all'.
            gh_fam: value matched against the last column, or 'all'.

        Returns:
            (fasta, rm_prt, rm_prt_fasta) where
            fasta        - alternating header / sequence strings; the header is
                           '>accession$col0$col1$col3$col[-2]$col[-1]',
            rm_prt       - reduced rows dropped because column 0 mentions
                           'partial' or 'fragment',
            rm_prt_fasta - accessions for which ``seq_fetch`` returned nothing.

        Side effects: sets ``self.fasta`` and ``self.common_data``.
        """
        self.fasta=[]
        cazy_ec=self.data_fetch(ec_no,1)
        cazy_gh=self.data_fetch(gh_fam,-1)
        # Rows are lists of strings, so a set of tuples gives the same
        # equality-based membership as `row in cazy_gh` without the O(n^2) scan.
        gh_rows={tuple(row) for row in cazy_gh}
        self.common_data=[[i[0],i[1],i[3],i[4],i[-2],i[-1]] for i in cazy_ec if tuple(i) in gh_rows]
        rm_prt,rm_prt_fasta=[],[]
        for t in self.common_data:
            if not self.prtn_filter(t[0]):
                rm_prt.append(t)
                continue
            all_acc=t[3].split(' ')
            if all_acc[0]=='':
                continue
            for e_acc in all_acc:
                if not e_acc:
                    # Consecutive spaces yield empty tokens; an empty accession
                    # would match every FASTA header, so skip it.
                    continue
                e_seq=self.seq_fetch(e_acc)
                if e_seq:
                    self.fasta.append(f'>{e_acc}${t[0]}${t[1]}${t[2]}${t[-2]}${t[-1]}')
                    self.fasta.append(e_seq[0])
                else:
                    # No usable sequence for this accession.
                    rm_prt_fasta.append(e_acc)
        return self.fasta,rm_prt,rm_prt_fasta

    def prtn_filter(self,prt_name):
        """Return 0 when ``prt_name`` mentions 'partial' or 'fragment' (case-insensitive), else 1."""
        return 0 if re.search('partial|fragment',prt_name.lower()) else 1

    def seq_fetch(self,accession):
        """Return the sequences whose FASTA header contains ``accession`` as a whole token.

        The accession is matched literally (so '.' no longer matches any
        character), must not be preceded by a word character and must not be
        followed by a digit. This keeps 'X.1' from matching 'X.12' or 'AX.1'
        while still matching at the very end of a header and still letting a
        version-less accession match its versioned header.

        Headers flagged by ``prtn_filter`` count as hits for the diagnostic
        messages but their sequences are not returned. Prints a message when
        the accession matches several headers or none.
        """
        pattern=re.compile(r'(?<!\w)'+re.escape(accession)+r'(?!\d)')
        hits=[]
        matched=0
        for header,sequence in zip(self.acc,self.seq):
            if pattern.search(header):
                matched+=1
                if self.prtn_filter(header): # drop partial / fragment records
                    hits.append(sequence)
        if matched>1:
            print(f'Multiple hits for {accession}')
        elif matched==0:
            print(f'No hits for {accession}')
        return hits
     
# Load the '$'-separated CAZy table and its FASTA file once; the driver loop
# further down calls In_data.EC_GH(...) for every EC number.
# NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.
In_data=CAZy_data('D:/After_4_4_22/data/CAZy_23_6_22/cazy_char_10_6_22.txt','D:/After_4_4_22/data/CAZy_23_6_22/char_gh_23_6_22.txt')

##### <u> Part 2: Extracting features from protein sequences </u>

In [19]:
import numpy as np
from scipy import stats
import pandas as pd

# 1: Hydrophobicity, 2: Hydrophilicity, 3: mass, 4: pk1, 5:pk2, 6:pi, 20: 14 scale, 60: Tanford
class Standard_values:
    """Table of per-amino-acid property values loaded from a CSV file.

    The first line of the file is treated as a header and discarded. In every
    remaining row, column 0 is the amino-acid key and the other columns are
    numeric property values addressed by their column index.
    """

    def __init__(self,filename):
        """Read ``filename`` (comma-separated), skipping blank lines and the header row."""
        rows=[]
        with open(filename,'r') as inpt:
            for each in inpt:
                if each.strip(): # a blank line would break the column lookups later
                    rows.append(each.rstrip().split(','))
        # Slicing instead of `del rows[0]` so an empty file gives an empty table.
        self.data=rows[1:]

    def get_prop(self,prop):
        """Return the amino-acid index and the z-scored values of the requested columns.

        Args:
            prop: iterable of column indices to extract.

        Returns:
            (amino_acid, got_prop) where ``amino_acid`` maps 1..N to the
            amino-acid keys in file order and ``got_prop`` maps each requested
            column index to ``{amino_acid_key: z-score}``.
        """
        got_prop={each:self.properties(each) for each in prop}
        # Take the ordering from the table itself rather than from got_prop[1],
        # so `prop` does not have to contain column 1. dict.fromkeys keeps the
        # first occurrence of each key, which is the key order properties() yields.
        ordered=dict.fromkeys(row[0] for row in self.data)
        amino_acid={pos:aa for pos,aa in enumerate(ordered,1)}
        return amino_acid,got_prop

    def properties(self, val):
        """Return ``{amino_acid_key: z-score}`` for column ``val``.

        The z-score is computed with ``scipy.stats.zscore`` over the values of
        all amino acids in the table.
        """
        raw={row[0]:float(row[val]) for row in self.data}
        relative=stats.zscore(np.array(list(raw.values())))
        return dict(zip(raw.keys(),relative))

class Sequence:
    """FASTA-style records supplied either as a file path or as a list of lines."""

    def __init__(self,filename):
        """Keep the lines of ``filename`` when it is a str path, else keep the object as given."""
        if type(filename) is str:
            with open(filename,'r') as handle:
                self.data=[line.rstrip() for line in handle]
        else:
            self.data=filename

    def output(self):
        """Split the records into headers, upper-cased sequences and lengths.

        A record whose sequence contains any of U, Z, O, B, J or X (checked
        case-insensitively) is dropped and counted instead.

        Returns:
            (headers, sequences, lengths, dropped_count)
        """
        records=self.ml_sl()
        headers,sequences,lengths=[],[],[]
        dropped=0
        # ml_sl() yields strictly alternating header / sequence entries.
        for header,residues in zip(records[0::2],records[1::2]):
            upper=residues.upper()
            if re.search('[UZOBJX]',upper):
                dropped+=1
                continue
            headers.append(header)
            lengths.append(len(residues))
            sequences.append(upper)
        return headers,sequences,lengths,dropped

    def ml_sl(self):
        """Join multi-line sequences so every header is followed by one sequence string.

        Lines that come before the first header are ignored; a header without
        sequence lines is followed by an empty string.
        """
        flattened=[]
        chunks=None # None until the first header has been seen
        for line in self.data:
            if line.startswith('>'):
                if chunks is not None:
                    flattened.append(''.join(chunks))
                flattened.append(line)
                chunks=[]
            elif chunks is not None:
                chunks.append(line)
        if chunks is not None:
            flattened.append(''.join(chunks))
        return flattened

class Pseaac:
    """Pseudo amino acid composition (PseAAC) feature extractor.

    The methods read these notebook-level globals instead of taking them as
    arguments, so they must exist before ``collect`` or ``pse`` is called:
        keys   - {1..20: amino-acid letter}, used for the composition terms,
        values - {property id: {amino-acid letter: value}}, used by ``rel_cal``,
        acc    - record headers, and
        seq    - sequences aligned with ``acc`` (both read by ``collect``).
    """

    def __init__(self,filename):
        """Store ``filename``; none of the methods in this class read it."""
        self.filename=filename

    def collect(self,lamb,w,pro,nf):
        """Build the feature table for every sequence in the global ``seq``.

        Args:
            lamb: number of sequence-order (lag) terms appended after the 20
                  composition terms.
            w:    weight applied to the sequence-order terms.
            pro:  property ids (keys of the global ``values``) to average over.
            nf:   1 to divide residue counts by the sequence length, any other
                  value to use raw counts.

        Returns:
            pandas.DataFrame with column '#' (the header from ``acc``), one
            column per amino acid in ``keys`` and columns 'λ1'..'λ<lamb>'.
        """
        header=['#']+list(keys.values())+['\u03BB'+str(eac+1) for eac in range(lamb)]
        rows=[[name]+self.pse(residues,lamb,w,pro,nf) for name,residues in zip(acc,seq)]
        return pd.DataFrame(rows,columns=header)

    def pse(self,data,lamb,w,pro,nf):
        """Return the 20+lamb PseAAC components of one sequence as a list."""
        thet=self.theta(data,lamb,pro)
        deno=1+(w*sum(thet.values()))
        norm=len(data) if nf==1 else 1
        # Components 1..20: (optionally length-normalised) residue counts.
        natural=[(data.count(keys[u])/norm)/deno for u in range(1,21)]
        # Components 21..20+lamb: weighted sequence-order correlation factors.
        pseudo=[(w*thet[u])/deno for u in range(1,lamb+1)]
        return natural+pseudo

    def theta(self,data,lamb,pro):
        """Return {lag: mean correlation over all residue pairs ``lag`` apart} for lags 1..lamb.

        A lag that leaves no residue pair (``len(data) <= lag``) gets 0.0. The
        exact-length case used to raise ZeroDivisionError, and it can occur when
        the lag count is set to the length of the shortest sequence.
        """
        the={}
        for u in range(1,lamb+1):
            pairs=len(data)-u
            if pairs<=0:
                the[u]=0.0
            else:
                the[u]=(1/pairs)*self.rel_cal(data,u,pro)
        return the

    def rel_cal(self,data,v,pro):
        """Sum, over all residue pairs ``v`` apart, the squared property difference averaged over ``pro``."""
        tem=[]
        for u in range(len(data)-v):
            te=[((values[u1][data[u]])-(values[u1][data[u+v]]))**2 for u1 in pro]
            tem.append(sum(te)/len(pro))
        return sum(tem)

# `keys` maps 1..N to the amino-acid keys of the CSV and `values` maps each
# requested column index to {amino-acid key: z-scored value}. Both stay at
# module level because the Pseaac methods read them as globals.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
keys,values=Standard_values('D:/After_4_4_22/data/CAZy_23_6_22/7_98_hydrophobicity.csv').get_prop([1,2,3,4,5,6,20,60])
# pseaac_data['Length']=stats.zscore(min_len)

In [13]:
'''Clustering Algorithm'''
import os
import time
import matplotlib.pyplot as plt
from sklearn import cluster, mixture, manifold, decomposition, preprocessing,metrics
import random
from collections import Counter,defaultdict
import copy

class clustering:
    """Run several scikit-learn clustering algorithms on one feature table and save the results.

    Every public algorithm method fits the estimator, stores the predicted
    labels in ``self.labels``, writes a tab-separated label file plus one Excel
    summary per annotation in ``cat``, and returns the list built by
    ``analysis``.

    Attributes set in ``__init__``:
        x        - feature matrix (all columns but the first),
        y        - first column: '$'-separated record headers,
        true_lab - integer id of each record's reference label (field
                   ``cat[0]`` of the header); ids follow the sorted label names,
        n        - number of clusters requested from the estimators that take one,
        temp1    - 'label:count' pairs of the reference labels, comma-joined.
    """
    rs=77 # random_state shared by the estimators that accept one

    def __init__(self,folder,data,n,cat):
        """
        Args:
            folder: output directory (created on first save).
            data:   DataFrame whose first column holds the record headers and
                    whose remaining columns hold the features.
            n:      cluster count; any string (e.g. 'auto') means "use the
                    number of distinct reference labels".
            cat:    header field indices to summarise; ``cat[0]`` also selects
                    the reference label.
        """
        self.x=data.iloc[:,1:].values
        self.y=data.iloc[:,0]
        self.folder=folder
        self.cat=cat
        self.anno_label={0:'acc',1:'prtn',2:'ec',3:'org',4:'species',5:'ghf'}
        temp=[i.split('$')[cat[0]] for i in self.y]
        # Sorted so the label -> id mapping (and the saved 'expected' column) is
        # identical on every run; plain set order changes with string hashing.
        lab=sorted(set(temp))
        index_of={name:pos for pos,name in enumerate(lab)}
        self.true_lab=[index_of[j] for j in temp]
        self.n=len(lab) if isinstance(n,str) else n
        self.temp1=','.join([f'{k}:{v}' for k,v in dict(Counter(temp)).items()])

    def _run(self,name,make_estimator,transform=None):
        """Time one estimator's fit_predict on the (optionally transformed) features, then save."""
        start = time.time()
        self.names = name
        features = self.x if transform is None else transform(self.x)
        self.labels = make_estimator().fit_predict(features)
        self.t = round((time.time()-start),3)
        return self.label_save()

    def kmeans(self):
        """K-means with ``self.n`` clusters."""
        return self._run('km',lambda: cluster.KMeans(n_clusters=self.n,random_state=clustering.rs))

    def affinity(self):
        """Affinity propagation (chooses its own number of clusters)."""
        return self._run('apc',lambda: cluster.AffinityPropagation(random_state=clustering.rs))

    def meanshift(self):
        """Mean shift with library defaults."""
        return self._run('ms',lambda: cluster.MeanShift())

    def spectral(self):
        """Spectral clustering with ``self.n`` clusters and 'discretize' label assignment."""
        return self._run('spec',lambda: cluster.SpectralClustering(n_clusters=self.n,assign_labels="discretize",random_state=clustering.rs))

    def agglomerative(self):
        """Agglomerative clustering with ``self.n`` clusters."""
        return self._run('agglo',lambda: cluster.AgglomerativeClustering(n_clusters=self.n))

    def dbscan(self):
        """DBSCAN with library defaults on standard-scaled features."""
        return self._run('dbs',lambda: cluster.DBSCAN(),
                         transform=lambda feats: preprocessing.StandardScaler().fit_transform(feats))

    def optics(self):
        """OPTICS with library defaults."""
        return self._run('opt',lambda: cluster.OPTICS())

    def gaussian(self):
        """Gaussian mixture with ``self.n`` components."""
        return self._run('gm',lambda: mixture.GaussianMixture(n_components=self.n,random_state=clustering.rs))

    def birch(self):
        """BIRCH with library defaults.

        NOTE(review): unlike kmeans/spectral/agglomerative/gaussian, ``self.n``
        is not passed here - confirm that is intended.
        """
        return self._run('bir',lambda: cluster.Birch())

    def label_save(self):
        """Write predicted vs. expected labels to '<folder>/ML_<name>_<k>.txt' and return ``analysis()``."""
        dfout = pd.DataFrame({'Accession': self.y,  'predicted': self.labels, 'expected':self.true_lab})
        os.makedirs(self.folder,exist_ok=True)
        # os.path.join instead of a literal backslash: the old f-string only
        # produced a path inside the folder on Windows and used an invalid escape.
        dfout.to_csv(os.path.join(self.folder,f'ML_{self.names}_{len(set(self.labels))}.txt'),sep='\t', index=False)
        self.file()
        return self.analysis()

    def analysis(self):
        """Return the result row for the last run as a list of strings.

        Fields: algorithm name, annotation name of ``cat[0]``, lambda,
        Fowlkes-Mallows score (3 decimals), requested clusters, clusters found,
        reference label counts, number of samples.

        Reads the notebook-level global ``lambda_value``, which must be set
        before any algorithm method is called.
        """
        value=metrics.fowlkes_mallows_score(self.true_lab,self.labels)
        return [self.names,self.anno_label[self.cat[0]],str(lambda_value),str(round(value,3)),
                str(self.n),str(len(set(self.labels))),self.temp1,str(len(self.true_lab))]

    def file(self):
        """Group the record headers by predicted cluster and write the Excel summaries."""
        os.makedirs(os.path.join(self.folder,'table'),exist_ok=True)
        dd=defaultdict(list)
        # Pair positionally so this does not depend on the Series index labels.
        for label,header in zip(self.labels,self.y):
            dd[label].append(header)
        self.excel(dict(sorted(dd.items())))

    def excel(self,anno):
        """Write, for every annotation in ``cat``, a table of annotation value x cluster counts.

        Args:
            anno: {cluster label: [record headers in that cluster]}.

        Each table gets a 'Total' row and a 'Total' column and is saved as
        '<folder>/table/<name>_<k>_<annotation>.xlsx'.
        """
        found=len(set(self.labels))
        for i in self.cat:
            counts={label:dict(Counter(header.split('$')[i] for header in members))
                    for label,members in anno.items()}
            df=pd.DataFrame(counts).fillna(0).astype(int)
            df.loc['Total']=df.sum(axis=0)
            df.loc[:,'Total']=df.sum(axis=1)
            df.to_excel(os.path.join(self.folder,'table',f'{self.names}_{found}_{self.anno_label[i]}.xlsx'))

In [17]:
class ec_data:
    """EC-number column of a '$'-separated CAZy table."""

    def __init__(self,file):
        """Read the lines of ``file``.

        Opened as UTF-8, matching how CAZy_data reads the same table, so the
        decoding does not depend on the platform's default encoding.
        """
        with open(file,'r',encoding='utf-8') as inpt:
            self.data=[line.rstrip() for line in inpt]

    def cazy(self,dom,typ):
        """Return the EC entries selected by ``domain`` from column 1 of every line.

        Args:
            dom: largest number of space-separated EC entries a line may hold
                 and still count as "single".
            typ: 'm' to also include the entries of the remaining lines.
        """
        ec_lists=[line.split('$')[1].split(' ') for line in self.data]
        return self.domain(ec_lists,dom,typ)

    def domain(self,ec_n,n,include):
        """Flatten the EC lists with at most ``n`` entries; optionally add the longer ones.

        Prints how many entries / lists fall in each group. With
        ``include == 'm'`` the entries of the longer lists are appended after
        the short ones.
        """
        single,multi=[],[]
        for entry in ec_n:
            if len(entry)<=n:
                single.extend(entry)
            else:
                multi.append(entry)
        print('Number of single domain:',len(single))
        print(f'Number of multi domain (>{n}):',len(multi))
        if include=='m':
            for entry in multi:
                single.extend(entry)
            print('After including multi domains:',len(single))
        return single
    
class analysis:
    """Pick the '3.2.1.x' EC numbers out of a list of EC strings, ordered by x."""

    def __init__(self,data):
        self.data=data

    def non_kegg_count(self):
        """Return the distinct '3.2.1.x' entries of ``self.data`` sorted by x."""
        occurrences=Counter(self.data)
        return self.specific_ec(dict(occurrences))

    def kegg_count(self,label):
        """Like ``non_kegg_count`` but first sums ``int(label)`` per EC string.

        ``label`` is paired with ``self.data`` position by position.
        """
        totals={}
        for ec,amount in zip(self.data,label):
            totals[ec]=totals.get(ec,0)+int(amount)
        return self.specific_ec(totals)

    def specific_ec(self,dat):
        """Return the keys of ``dat`` containing '3.2.1.', sorted by their last numeric field.

        Keys whose last '.'-separated field is not an integer are printed and
        left out.
        """
        numbered=[]
        for ec in dat:
            if '3.2.1.' not in ec:
                continue
            try:
                serial=int(ec.split('.')[-1])
            except ValueError:
                print(ec)
                continue
            numbered.append((serial,ec))
        numbered.sort(key=lambda pair:pair[0]) # stable: ties keep insertion order
        return [ec for _,ec in numbered]

In [23]:
# ec_data.cazy(dom, typ): `dom` (1 here) is the largest number of EC entries a
# protein may carry and still be counted as "single"; typ='m' would additionally
# append the EC entries of the multi-EC proteins, any other value ('s' here)
# leaves them out.
a_cazy=ec_data('D:/After_4_4_22/data/CAZy_23_6_22/cazy_char_10_6_22.txt').cazy(1,'s')
# Distinct '3.2.1.x' EC numbers ordered by x; entries without a numeric last
# field are printed by specific_ec and skipped.
c_cazy=analysis(a_cazy).non_kegg_count()

t1=time.perf_counter()
total_data=[]                          # one '$'-joined result row per (EC number, algorithm)
total_rm_cazy,total_rm_genbank={},{}   # per EC number: records dropped by EC_GH
value_error,no_entry=[],[]             # EC numbers whose clustering failed / that had no sequences
for each in c_cazy:
    ec_number=each
    gh_family='all'
    cazy_acc_seq,rm_cazy,rm_genbank=In_data.EC_GH(ec_number,gh_family) # pass 'all' for either argument to disable that filter
    total_rm_cazy[each],total_rm_genbank[each]=rm_cazy,rm_genbank
    # `acc` and `seq` must keep these exact module-level names: Pseaac.collect
    # reads them as globals rather than receiving them as arguments.
    acc,seq,min_len,x_aa=Sequence(cazy_acc_seq).output()
    print(f'Sequence has been collected for {each}....')
    min_lamb=30
    # Cap lambda at the shortest sequence length; min() raises ValueError when
    # no sequence was collected for this EC number.
    try:
        if min(min_len)<min_lamb:
            lambda_value=min(min_len)
        else:
            lambda_value=min_lamb
    except ValueError:
        print(each,': doesnt have sequences')
        no_entry.append(each)
        continue
    # `lambda_value` is also read as a global by clustering.analysis.
    # Arguments to collect: lambda, weight w, property ids, nf (1 = divide counts by length).
    pseaac_data=Pseaac(f'PAAC_{ec_number}_{gh_family}_L{lambda_value}.txt').collect(lambda_value,0.05,[60,2,3],1)
    print(f'Feature has been extracted for {each}....')
    ec_=ec_number.replace('.','_')
    # 'auto' (any string) sets the number of clusters to the number of distinct reference labels; pass an int to choose it yourself.
    clust=clustering(f'{gh_family}_{ec_}',pseaac_data,'auto',[5]) # header field indices: 0:'acc',1:'prtn',2:'ec',3:'org',4:'species',5:'ghf'
    # NOTE(review): a ValueError from any of the nine algorithms is reported as
    # "too few samples" and discards the results of the algorithms that already
    # succeeded for this EC number.
    try:
        km=clust.kmeans()
        apc=clust.affinity()
        ms=clust.meanshift()
        spec=clust.spectral()
        agglo=clust.agglomerative()
        dbs=clust.dbscan()
        opt=clust.optics()
        gm=clust.gaussian()
        bir=clust.birch()
    except ValueError:
        print(f'{each} has {len(acc)} samples which is less than 5 min_samples ')
        value_error.append([each,len(acc)])
        continue
    print(f'Clustering has been done for {each}....')
    all_clust={'km':km,'apc':apc,'ms':ms,'spec':spec,'agglo':agglo,'dbs':dbs,'opt':opt,'gm':gm,'bir':bir}
    # Result row: EC number + the fields returned by clustering.analysis +
    # counts of dropped CAZy rows, dropped accessions and dropped sequences.
    for aa in all_clust.values():
        bb=[each]+aa+[str(len(rm_cazy)),str(len(rm_genbank))]+[str(x_aa)]
        total_data.append('$'.join(bb))
t2=time.perf_counter()
print('Mission completed in',round(t2-t1,3),'seconds')

Number of single domain: 7057
Number of multi domain (>1): 320
3.2.1.-
3.2.1.17_or_4.2.2.n1
No hits for NP_578206.1
No hits for CAC02970.1
No hits for ACD93218.2
No hits for CAB12098.1
No hits for NP_388186.1
No hits for NP_812609.1
No hits for NP_812609.1
No hits for AAA63759.1
No hits for NP_149331.1
No hits for ABC18196.1
No hits for CBL15129.1
No hits for NP_721927.1
No hits for NP_827158.1
No hits for NP_229636.1
No hits for NP_229450.1
No hits for NP_636139.1
No hits for A35282
No hits for CAO78410.1
No hits for BAB72257.1
No hits for AAA16183.1
No hits for NP_031472.1
No hits for AAA37230.1
No hits for NP_033799.1
No hits for AAA40725.1
No hits for AAA40731.1
No hits for NP_113690.1
No hits for CAB11471.1
No hits for NP_248621.1
No hits for NP_342630.1
Sequence has been collected for 3.2.1.1....
Feature has been extracted for 3.2.1.1....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.1....
No hits for NP_191958.2
No hits for NP_921898.1
Sequence has been collected for 3.2.1.2....
Feature has been extracted for 3.2.1.2....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.2....
No hits for NP_248620.1
No hits for NP_342472.1
No hits for BAB65828.1
No hits for NP_376719.1
No hits for NP_393821.1
No hits for NP_623401.1
No hits for NP_421085.1
No hits for AAB20818.1
No hits for XP_327956.1
No hits for XP_368148.1
No hits for EAA49438.1
No hits for NP_012167.1
No hits for NP_812614.1
Sequence has been collected for 3.2.1.3....
Feature has been extracted for 3.2.1.3....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.3....
No hits for NP_126623.1
No hits for NP_143072.1
No hits for AAU23613.1
No hits for NP_389695.1
No hits for NP_622045.1
No hits for CAB01405.1
No hits for ABZ70413.1
No hits for AAD48494.2
No hits for NP_241469.1
No hits for WP_018063499.1
No hits for AAC02964.1
No hits for BAA12826.1
No hits for AAZ56745.1
No hits for AAZ54939.1
No hits for AHA42547.1
No hits for NP_638867.1
No hits for NP_298108.1
No hits for XP_324942.1
No hits for XP_002475436.1
No hits for AAL33630.1
No hits for AAL33639.1
No hits for CAA16243.1
No hits for CAE55238.1
No hits for NP_214576.1
No hits for AAZ55112.1
No hits for BAB64564.1
No hits for BAB64563.1
No hits for XP_366456.1
No hits for XP_366456.2
No hits for NP_213966.1
No hits for NP_347552.1
No hits for YP_003844202.1
No hits for AAZ55662.1
No hits for AAA27397.1
No hits for AAZ56209.1
No hits for EFA05721.1
No hits for XP_001810693.1
No hits for NP_578583.1
No hits for CAC24331.1
No hits for NP_343873.1
No hits 

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.4....
No hits for NP_437620.1
No hits for NP_437595.1
Sequence has been collected for 3.2.1.6....
Feature has been extracted for 3.2.1.6....




Clustering has been done for 3.2.1.6....
Sequence has been collected for 3.2.1.7....
Feature has been extracted for 3.2.1.7....




Clustering has been done for 3.2.1.7....
No hits for BAA31551.1
No hits for NP_149279.1
No hits for NP_149217.1
No hits for AAC98123.1
No hits for ABI49937.1
No hits for AAC98140.1
No hits for CAA82319.1
No hits for NP_242986.1
No hits for ZP_08160171.1
No hits for ZP_08159615.1
No hits for ACR61562.1
No hits for BAA19777.1
No hits for ADQ57411.1
No hits for YP_003851606.1
No hits for AAZ56824.1
No hits for AAZ56956.1
No hits for CAD48748.1
No hits for NP_227886.1
No hits for NP_227877.1
No hits for AAD32593.1
No hits for NP_644553.1
No hits for AAP31839.1
No hits for EAA78230.1
No hits for XP_002470423.1
No hits for XP_365543.1
No hits for BAA89465.1
No hits for NP_241765.1
No hits for CAD65888.1
No hits for ZP_08160424.1
No hits for ZP_08158180.1
No hits for ZP_08157788.1
No hits for NP_624448.1
No hits for NP_626540.1
No hits for BAA19778.1
No hits for AAZ55251.1
No hits for ABA39289.1
No hits for ABM55503.1
No hits for EAA73188.1
No hits for XP_383800.1
No hits for XP_368051.1
No h

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.8....
No hits for NP_011803.1
No hits for AAI16454.1
No hits for NP_001032.1
Sequence has been collected for 3.2.1.10....
Feature has been extracted for 3.2.1.10....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.10....
No hits for NP_811999.1
Sequence has been collected for 3.2.1.11....
Feature has been extracted for 3.2.1.11....




Clustering has been done for 3.2.1.11....
No hits for NP_279794.1
No hits for NP_578962.1
No hits for NP_578963.1
No hits for NP_578962.1
No hits for NP_833450.1
No hits for AAU21943.1
No hits for NP_902605.1
No hits for AGD94964.1
No hits for NP_814153.1
No hits for NP_241782.1
No hits for NP_268107.1
No hits for CBW16119.1
No hits for ABV39247.1
No hits for ABV40327.1
No hits for ABV41826.1
No hits for ABV42574.1
No hits for NP_630126.1
No hits for NP_629155.1
No hits for NP_629515.1
No hits for NP_625711.1
No hits for AAZ54618.1
No hits for NP_232428.1
No hits for AAA83586.1
No hits for EAL00460.1
No hits for ABC59330.1
No hits for AGX26690.1
No hits for AAH36339.1
No hits for AAH47336.1
No hits for CAH70802.1
No hits for CAH70803.1
No hits for CAH70804.1
No hits for CAI19263.1
No hits for CAI19265.1
No hits for CAI19266.1
No hits for NP_068569.1
No hits for BAB25878.1
No hits for BAB90566.1
No hits for BAB91759.1
No hits for ABB97081.1
No hits for AAV58834.1
No hits for BAA31200.1


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.14....
No hits for NP_637621.1
No hits for NP_638805.1
No hits for CAM91243.1
No hits for CAM33166.1
No hits for CAQ03437.1
No hits for ACP18831.1
No hits for ADU33280.1
No hits for ADU33338.1
No hits for ADU33339.1
No hits for ADU33359.1
No hits for ADU33363.1
No hits for NP_012687.1
Sequence has been collected for 3.2.1.15....
Feature has been extracted for 3.2.1.15....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.15....
No hits for AAC00558.1
No hits for ABK34500.1
No hits for XP_046047.1
No hits for NP_815238.1
No hits for NP_420976.1
No hits for NP_845781.1
No hits for NP_845154.1
No hits for NP_347193.1
No hits for NP_562230.1
No hits for NP_815667.1
No hits for NP_814147.1
No hits for BAB45607.1
Sequence has been collected for 3.2.1.17....
Feature has been extracted for 3.2.1.17....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.17....
No hits for BAC56895.1
No hits for NP_809368.1
No hits for NP_561641.1
No hits for NP_904664.1
No hits for NP_971085.1
No hits for NP_776547.1
No hits for NP_006647.2
No hits for XP_035091.1
No hits for NP_005374.1
No hits for CAI17742.1
No hits for CAI18221.1
No hits for CAI41850.1
No hits for CAM26147.1
No hits for CAQ08322.1
No hits for CAQ09153.1
No hits for CAQ09305.1
No hits for CAQ09512.1
No hits for AAH63465.1
No hits for NP_035023.2
No hits for NP_058826.1
Sequence has been collected for 3.2.1.18....
Feature has been extracted for 3.2.1.18....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.18....
No hits for NP_229631.1
No hits for AAC73506.1
No hits for NP_414937.1
No hits for XP_392790.1
No hits for XP_393379.2
No hits for XP_394730.1
No hits for AAF69018.1
No hits for NP_011808.1
No hits for NP_009858.1
No hits for CAC24058.1
No hits for NP_344361.1
No hits for BAB67639.1
No hits for NP_378530.1
No hits for NP_393778.1
No hits for NP_809252.1
No hits for NP_812211.1
No hits for ZP_01966167.1
No hits for NP_621719.1
No hits for AAA81491.1
No hits for AAO91743.1
No hits for NP_501419.1
No hits for AAB71267.2
No hits for AAB71267.3
No hits for NP_508105.1
No hits for CAA38907.1
No hits for NP_000143.1
No hits for NP_004659.1
No hits for NP_004659.1
No hits for CAM15004.1
No hits for CAM15005.1
No hits for CAM15006.1
No hits for CAM15007.1
No hits for NP_032090.2
No hits for NP_809596.1
No hits for NP_577861.2
Sequence has been collected for 3.2.1.20....
Feature has been extracted for 3.2.1.20....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.20....
No hits for NP_578171.1
No hits for NP_142340.1
No hits for ABP70047.1
No hits for YP_066184.1
No hits for NP_242789.1
No hits for ABR73190.1
No hits for NP_386997.1
No hits for NP_625353.1
No hits for NP_631601.1
No hits for AAZ54975.1
No hits for ADI56259.1
No hits for NP_936184.1
No hits for AEE33889.1
No hits for NP_176375.1
No hits for NP_187303.1
No hits for AAG12895.1
No hits for NP_198505.1
No hits for AAA83309.1
No hits for AAC68766.1
No hits for NP_497558.1
No hits for ABI34907.1
No hits for ABI34907.2
No hits for ACD65509.1
No hits for EAA26947.1
No hits for XP_322216.1
No hits for BAB86071.1
No hits for BAC06894.1
No hits for BAF07003.1
No hits for CAE05491.1
No hits for CAE03398.1
No hits for CAE01908.1
No hits for CAE01909.1
No hits for CAE01911.1
No hits for AAA84906.2
No hits for AAT85322.1
No hits for AAL89551.1
No hits for NP_812226.1
No hits for YP_002352162.1
No hits for ZP_01543735.1
No hits for NP_227841.1
No hits for XP_3

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.21....
No hits for NP_243094.1
No hits for NP_813068.1
No hits for CAB55878.1
No hits for CAM19136.1
No hits for CAM19137.1
No hits for NP_038491.1
No hits for BAF21463.1
No hits for BAC55816.1
No hits for BAF26853.1
No hits for NP_922125.1
No hits for CAC24100.1
No hits for NP_344437.1
No hits for BAB67666.1
No hits for NP_378557.1
No hits for AAD30994.1
No hits for ABF72189.1
No hits for NP_346329.1
No hits for NP_624603.1
No hits for NP_228997.1
No hits for CAB65477.1
No hits for ZP_06242255.1
No hits for NP_992831.1
No hits for NP_578173.1
No hits for NP_810784.1
No hits for NP_812206.1
No hits for NP_811533.1
No hits for NP_812575.1
Sequence has been collected for 3.2.1.22....
Feature has been extracted for 3.2.1.22....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.22....
No hits for BAC56904.1
No hits for NP_813062.1
No hits for NP_813067.1
No hits for NP_809374.1
No hits for NP_809906.1
No hits for NP_813578.1
No hits for ABE27151.1
No hits for AAC76111.1
No hits for NP_417547.2
No hits for CAD65569.1
No hits for NP_786691.1
No hits for NP_268137.2
No hits for NP_358159.1
No hits for NP_345155.1
No hits for NP_228998.1
No hits for NP_813071.1
No hits for NP_344609.1
No hits for NP_638243.1
No hits for NP_642100.1
No hits for NP_194344.1
No hits for NP_568978.1
No hits for NP_190852.1
No hits for NP_000395.1
No hits for BAC31151.1
No hits for BAD20774.1
No hits for ADO34790.1
No hits for ADO34790.2
No hits for YP_594332.1
No hits for NP_229000.1
No hits for NP_228122.1
No hits for MBF8418755.1
No hits for EFB02686.1
No hits for ZP_01923925.1
Sequence has been collected for 3.2.1.23....
Feature has been extracted for 3.2.1.23....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.23....
No hits for CAC24028.1
No hits for NP_344318.1
No hits for CAB07105.1
No hits for NP_625245.1
No hits for NP_229647.1
No hits for AAB52345.2
No hits for AAM81094.1
No hits for CCD62468.1
No hits for NP_491568.1
No hits for NP_740847.1
No hits for NP_740848.1
No hits for AAA81731.2
No hits for AAF52958.1
No hits for NP_006113.1
No hits for NP_006706.1
No hits for NP_034894.1
No hits for NP_640349.1
No hits for NP_812439.1
No hits for NP_812874.1
No hits for NP_811860.1
No hits for NP_809945.1
No hits for NP_811024.1
Sequence has been collected for 3.2.1.24....
Feature has been extracted for 3.2.1.24....




Clustering has been done for 3.2.1.24....
No hits for NP_578085.1
No hits for NP_142473.1
No hits for AAL42528.1
No hits for AAK87314.1
No hits for NP_354529.1
No hits for NP_532212.1
No hits for BAC56899.1
No hits for NP_809371.1
No hits for AAZ54953.1
No hits for NP_229424.1
No hits for NP_638239.1
No hits for NP_643384.1
No hits for NP_081564.1
No hits for NP_696497.1
No hits for ZP_06241352.1
No hits for WP_116885687.1
Sequence has been collected for 3.2.1.25....
Feature has been extracted for 3.2.1.25....




Clustering has been done for 3.2.1.25....
No hits for CAB15830.1
No hits for NP_391683.1
No hits for NP_347065.1
No hits for NP_601843.1
No hits for NP_346228.1
No hits for NP_346161.1
No hits for NP_229215.1
No hits for NP_176453.1
No hits for NP_187995.1
No hits for XP_323605.1
No hits for CAD41518.1
No hits for CAE04784.2
No hits for CAE04785.1
No hits for BAB72022.1
No hits for AAA69504.1
No hits for AAV88999.1
No hits for NP_485561.1
No hits for NP_484862.1
Sequence has been collected for 3.2.1.26....
Feature has been extracted for 3.2.1.26....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.26....
No hits for NP_111834.1
No hits for AAW44482.1
No hits for NP_009111.1
Sequence has been collected for 3.2.1.28....
Feature has been extracted for 3.2.1.28....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.28....
No hits for NP_809909.1
No hits for NP_813092.1
No hits for NP_561063.1
No hits for ZP_02032394.1
No hits for NP_687713.1
No hits for Q8E0N2
No hits for NP_228868.1
No hits for NP_000172.1
No hits for NP_058711.1
Sequence has been collected for 3.2.1.31....
Feature has been extracted for 3.2.1.31....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.31....
Sequence has been collected for 3.2.1.32....
Feature has been extracted for 3.2.1.32....




Clustering has been done for 3.2.1.32....
No hits for NP_000019.1
No hits for NP_000633.1
No hits for NP_000634.1
No hits for NP_000635.1
No hits for NP_000636.1
No hits for NP_000637.1
Sequence has been collected for 3.2.1.33....
Feature has been extracted for 3.2.1.33....




Clustering has been done for 3.2.1.33....
No hits for AAC16482.1
No hits for NP_033267.1
No hits for CAP19319.1
No hits for CAP19320.1
No hits for CAP19321.1
No hits for CAP19322.1
No hits for NP_032343.2
No hits for AAH60047.1
No hits for NP_034619.1
No hits for NP_742037.1
No hits for NP_561107.1
Sequence has been collected for 3.2.1.35....
Feature has been extracted for 3.2.1.35....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.35....
Sequence has been collected for 3.2.1.36....
Feature has been extracted for 3.2.1.36....




3.2.1.36 has 1 samples which is less than 5 min_samples 
No hits for ZP_03013483.1
No hits for ZP_08159508.1
No hits for BAB61064.1
No hits for NP_227892.1
No hits for NP_638240.1
No hits for NP_643385.1
No hits for NP_644530.1
No hits for XP_329671.1
No hits for 5VQJ_A
No hits for ABP67986.1
No hits for NP_421160.1
No hits for AAC98129.1
No hits for NP_241934.1
No hits for NP_644357.1
No hits for CAB13642.1
No hits for NP_389640.1
No hits for AAT98625.1
No hits for NP_244550.1
No hits for AAZ55651.1
No hits for NP_644557.1
No hits for CAC29083.1
No hits for AAC98122.1
No hits for NP_242980.1
Sequence has been collected for 3.2.1.37....
Feature has been extracted for 3.2.1.37....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.37....
No hits for ZP_07060000.1
No hits for ZP_07060001.1
Sequence has been collected for 3.2.1.38....
Feature has been extracted for 3.2.1.38....




Clustering has been done for 3.2.1.38....
No hits for NP_577805.1
No hits for ZP_02070069.1
No hits for AAB39377.1
No hits for AAB39378.1
No hits for EU589324
No hits for NP_227840.1
No hits for AIX48714.1
No hits for ABR28478.1
No hits for CAA33275.1
No hits for NP_241102.1
No hits for AAZ56163.1
No hits for WP_029429093.1
No hits for ZP_02070068.1
No hits for ZP_06243608.1
Sequence has been collected for 3.2.1.39....
Feature has been extracted for 3.2.1.39....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.39....
No hits for NP_809926.1
No hits for NP_809932.1
No hits for YP_005019950.1
No hits for CAD65560.1
No hits for CAD65558.1
No hits for ZP_07366943.1
No hits for ZP_07367044.1
No hits for NP_822003.1
Sequence has been collected for 3.2.1.40....
Feature has been extracted for 3.2.1.40....




Clustering has been done for 3.2.1.40....
No hits for CAB14971.1
No hits for NP_390871.1
No hits for NP_810576.1
No hits for NP_326475.1
No hits for WP_016313728.1
No hits for CBL15393.1
No hits for CBL15610.1
No hits for NP_688225.1
No hits for NP_229641.1
No hits for CAD39560.1
Sequence has been collected for 3.2.1.41....
Feature has been extracted for 3.2.1.41....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.41....
No hits for AFR92751.1
No hits for CAI95090.1
No hits for NP_000148.1
No hits for CAM17042.1
No hits for CAM17043.1
No hits for NP_766280.1
Sequence has been collected for 3.2.1.45....
Feature has been extracted for 3.2.1.45....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.45....
No hits for CAI11729.1
No hits for NP_000144.1
No hits for NP_032105.1
Sequence has been collected for 3.2.1.46....
Feature has been extracted for 3.2.1.46....




Clustering has been done for 3.2.1.46....
No hits for NP_638705.1
No hits for NP_643797.1
Sequence has been collected for 3.2.1.48....
Feature has been extracted for 3.2.1.48....




Clustering has been done for 3.2.1.48....
No hits for NP_032695.2
Sequence has been collected for 3.2.1.49....
Feature has been extracted for 3.2.1.49....




Clustering has been done for 3.2.1.49....
No hits for NP_561782.1
No hits for NP_000254.1
Sequence has been collected for 3.2.1.50....
Feature has been extracted for 3.2.1.50....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.50....
No hits for CAC24067.1
No hits for NP_344369.1
No hits for NP_344370.1
No hits for NP_810538.1
No hits for NP_811882.1
No hits for NP_813047.1
No hits for NP_228118.1
No hits for NP_638236.1
No hits for NP_788497.1
No hits for NP_788498.1
No hits for CAH74004.1
No hits for CAH74004.2
No hits for NP_000138.1
No hits for CAB53746.1
No hits for CAD92494.1
No hits for CAD92495.1
No hits for NP_036694.1
Sequence has been collected for 3.2.1.51....
Feature has been extracted for 3.2.1.51....




Clustering has been done for 3.2.1.51....
No hits for BAA35922.1
No hits for CAA17329.1
No hits for CAE55256.1
No hits for NP_214751.1
No hits for NP_626989.1
No hits for NP_629382.1
No hits for NP_228618.1
No hits for NP_230341.1
No hits for BAC56902.1
No hits for NP_809372.1
No hits for NP_809373.1
No hits for NP_809369.1
No hits for NP_809419.1
No hits for NP_267650.1
No hits for NP_904396.1
No hits for NP_357651.1
No hits for NP_357651.1
No hits for NP_344606.1
No hits for NP_344606.1
No hits for NP_627016.1
No hits for NP_638238.1
No hits for NP_191086.1
No hits for NP_567017.1
No hits for AAA96105.3
No hits for CAO72175.1
No hits for AAL35732.2
No hits for CAA22078.2
No hits for CAA19506.2
No hits for CAI06053.1
No hits for CAI06053.2
No hits for AAG22248.1
No hits for AAN11597.1
No hits for NP_728974.1
No hits for NP_728976.1
No hits for AAF58500.2
No hits for AAM68691.1
No hits for AAM68693.1
No hits for NP_610790.1
No hits for NP_725180.1
No hits for NP_000511.1
No hits for NP

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.52....
Sequence has been collected for 3.2.1.53....
Feature has been extracted for 3.2.1.53....




3.2.1.53 has 1 samples which is less than 5 min_samples 
No hits for NP_229632.1
Sequence has been collected for 3.2.1.54....
Feature has been extracted for 3.2.1.54....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.54....
No hits for ZP_03458529.1
No hits for ZP_03013484.1
No hits for NP_812586.1
No hits for NP_812566.1
No hits for NP_809934.1
No hits for NP_811764.1
No hits for NP_812573.1
No hits for NP_812006.1
No hits for WP_011917910.1
No hits for WP_011917910.1
No hits for NP_644482.1
No hits for XP_370346.1
No hits for ZP_00503782.1
No hits for CAB14832.1
No hits for NP_390750.1
No hits for CAB14811.1
No hits for NP_390729.1
No hits for NP_809281.1
No hits for NP_809261.1
No hits for NP_695732.1
No hits for EDY06090.1
No hits for ZP_08160912.1
No hits for NP_228093.1
No hits for NP_641621.1
No hits for BAF22602.1
No hits for CAA52785.1
No hits for XP_330425.1
No hits for NP_630049.1
No hits for BAB84113.1
No hits for EAK85571.1
Sequence has been collected for 3.2.1.55....
Feature has been extracted for 3.2.1.55....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.55....
Sequence has been collected for 3.2.1.57....
Feature has been extracted for 3.2.1.57....




3.2.1.57 has 2 samples which is less than 5 min_samples 
No hits for ACP74152.1
Sequence has been collected for 3.2.1.58....
Feature has been extracted for 3.2.1.58....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.58....
No hits for CAG47121.1
No hits for CAB11202.1
No hits for NP_594914.1
No hits for CAA22810.1
No hits for NP_595364.1
No hits for BAP10900.1
Sequence has been collected for 3.2.1.59....
Feature has been extracted for 3.2.1.59....




Clustering has been done for 3.2.1.59....
Sequence has been collected for 3.2.1.60....
Feature has been extracted for 3.2.1.60....




Clustering has been done for 3.2.1.60....
Sequence has been collected for 3.2.1.61....
Feature has been extracted for 3.2.1.61....




3.2.1.61 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.62....
Feature has been extracted for 3.2.1.62....




3.2.1.62 has 1 samples which is less than 5 min_samples 
No hits for NP_346093.1
No hits for NP_642102.1
Sequence has been collected for 3.2.1.63....
Feature has been extracted for 3.2.1.63....




Clustering has been done for 3.2.1.63....
Sequence has been collected for 3.2.1.64....
Feature has been extracted for 3.2.1.64....




3.2.1.64 has 2 samples which is less than 5 min_samples 
No hits for NP_810673.1
Sequence has been collected for 3.2.1.65....
Feature has been extracted for 3.2.1.65....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.65....
No hits for NP_813066.1
No hits for NP_813034.1
No hits for NP_809931.1
No hits for NP_228247.1
Sequence has been collected for 3.2.1.67....
Feature has been extracted for 3.2.1.67....




Clustering has been done for 3.2.1.67....
No hits for NP_601306.1
No hits for NP_344806.1
No hits for AAZ55924.1
No hits for NP_192641.1
Sequence has been collected for 3.2.1.68....
Feature has been extracted for 3.2.1.68....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.68....
No hits for AAF26276.1
No hits for BAA90671.1
Sequence has been collected for 3.2.1.70....
Feature has been extracted for 3.2.1.70....




Clustering has been done for 3.2.1.70....
Sequence has been collected for 3.2.1.71....
Feature has been extracted for 3.2.1.71....
3.2.1.71 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.72....
Feature has been extracted for 3.2.1.72....




3.2.1.72 has 1 samples which is less than 5 min_samples 
No hits for WP_084555785.1
No hits for ZP_08159266.1
No hits for ZP_08157835.1
No hits for NP_442377.1
No hits for XP_368567.1
No hits for XP_362900.1
No hits for EAZ34091.1
Sequence has been collected for 3.2.1.73....
Feature has been extracted for 3.2.1.73....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.73....
No hits for NP_641782.1
No hits for NP_642120.1
No hits for NP_644175.1
Sequence has been collected for 3.2.1.74....
Feature has been extracted for 3.2.1.74....




Clustering has been done for 3.2.1.74....
No hits for NP_812224.1
No hits for XP_323748.1
Sequence has been collected for 3.2.1.75....
Feature has been extracted for 3.2.1.75....




Clustering has been done for 3.2.1.75....
No hits for NP_000194.1
No hits for XP_042678.2
No hits for NP_032351.1
Sequence has been collected for 3.2.1.76....
Feature has been extracted for 3.2.1.76....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.76....
No hits for ADK22147.1
No hits for AAZ54938.1
No hits for NP_229032.1
No hits for NP_229550.1
No hits for YP_001245126.1
No hits for BAG69482.1
No hits for NP_637144.1
No hits for NP_171733.1
No hits for EEF05441.1
No hits for AAG00315.1
No hits for AAU22320.1
No hits for CAB12407.1
No hits for NP_388469.1
No hits for YP_003844078.1
Sequence has been collected for 3.2.1.78....
Feature has been extracted for 3.2.1.78....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.78....
Sequence has been collected for 3.2.1.80....
Feature has been extracted for 3.2.1.80....




Clustering has been done for 3.2.1.80....
No hits for ABO93616.1
No hits for NP_627674.1
No hits for NP_627690.1
No hits for A32261
Sequence has been collected for 3.2.1.81....
Feature has been extracted for 3.2.1.81....




Clustering has been done for 3.2.1.81....
Sequence has been collected for 3.2.1.82....
Feature has been extracted for 3.2.1.82....




Clustering has been done for 3.2.1.82....
No hits for WP_106404066.1
Sequence has been collected for 3.2.1.83....
Feature has been extracted for 3.2.1.83....




Clustering has been done for 3.2.1.83....
Sequence has been collected for 3.2.1.84....
Feature has been extracted for 3.2.1.84....




Clustering has been done for 3.2.1.84....
No hits for BAB45117.1
No hits for NP_372713.1
Sequence has been collected for 3.2.1.85....
Feature has been extracted for 3.2.1.85....




Clustering has been done for 3.2.1.85....
No hits for NP_391805.1
No hits for NP_721380.1
No hits for NP_721491.1
No hits for NP_721937.1
No hits for NP_345092.1
No hits for NP_269656.1
No hits for AAA23550.1
No hits for BAA15523.1
No hits for NP_229086.1
Sequence has been collected for 3.2.1.86....
Feature has been extracted for 3.2.1.86....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.86....
No hits for NP_823362.1
Sequence has been collected for 3.2.1.88....
Feature has been extracted for 3.2.1.88....
3.2.1.88 has 4 samples which is less than 5 min_samples 
No hits for NP_391292.1
No hits for NP_813579.1
No hits for ACO06241.1
No hits for NP_695463.1
No hits for AEH26456.1
No hits for NP_229006.1
Sequence has been collected for 3.2.1.89....
Feature has been extracted for 3.2.1.89....




Clustering has been done for 3.2.1.89....
No hits for AAZ54658.1
No hits for XP_360146.1
No hits for CAA56918.1
Sequence has been collected for 3.2.1.91....
Feature has been extracted for 3.2.1.91....




Clustering has been done for 3.2.1.91....
Sequence has been collected for 3.2.1.92....




Feature has been extracted for 3.2.1.92....
3.2.1.92 has 5 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.93....
Feature has been extracted for 3.2.1.93....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.93....
Sequence has been collected for 3.2.1.94....
Feature has been extracted for 3.2.1.94....
3.2.1.94 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.95....
Feature has been extracted for 3.2.1.95....




3.2.1.95 has 1 samples which is less than 5 min_samples 
No hits for NP_809957.1
No hits for NP_812898.1
No hits for NP_816485.1
No hits for NP_269818.1
No hits for NP_696499.1
No hits for NP_241651.1
No hits for NP_345016.1
No hits for AEE74996.1
No hits for NP_187715.1
No hits for NP_196165.2
No hits for AAC46644.3
No hits for AAN84828.1
No hits for AAU05578.1
No hits for NP_498267.2
No hits for NP_073596.1
Sequence has been collected for 3.2.1.96....
Feature has been extracted for 3.2.1.96....




Clustering has been done for 3.2.1.96....
No hits for NP_695661.1
No hits for NP_561609.1
No hits for WP_004611020.1
No hits for WP_004612822.1
Sequence has been collected for 3.2.1.97....
Feature has been extracted for 3.2.1.97....




Clustering has been done for 3.2.1.97....
No hits for NP_241279.1
Sequence has been collected for 3.2.1.98....
Feature has been extracted for 3.2.1.98....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.98....
No hits for AAU22854.1
No hits for CAB15969.1
No hits for NP_391812.1
No hits for CAB14841.1
No hits for NP_390759.1
No hits for NP_809273.1
No hits for NP_809280.1
No hits for BAD89094.1
Sequence has been collected for 3.2.1.99....
Feature has been extracted for 3.2.1.99....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.99....
Sequence has been collected for 3.2.1.100....
Feature has been extracted for 3.2.1.100....




Clustering has been done for 3.2.1.100....
No hits for NP_812693.1
No hits for NP_811544.1
No hits for NP_811536.1
No hits for NP_812703.1
No hits for XP_328833.1
No hits for XP_323071.1
Sequence has been collected for 3.2.1.101....
Feature has been extracted for 3.2.1.101....




Clustering has been done for 3.2.1.101....
No hits for NP_561245.1
No hits for NP_346573.1
Sequence has been collected for 3.2.1.102....
Feature has been extracted for 3.2.1.102....




Clustering has been done for 3.2.1.102....
Sequence has been collected for 3.2.1.103....
Feature has been extracted for 3.2.1.103....




3.2.1.103 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.104....
Feature has been extracted for 3.2.1.104....




3.2.1.104 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.105....
Feature has been extracted for 3.2.1.105....




Clustering has been done for 3.2.1.105....
No hits for NP_176916.1
No hits for CAA92954.1
No hits for NP_006293.1
No hits for BAF07139.1
No hits for NP_113937.1
Sequence has been collected for 3.2.1.106....
Feature has been extracted for 3.2.1.106....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.106....
No hits for NP_079368.1
Sequence has been collected for 3.2.1.107....
Feature has been extracted for 3.2.1.107....




Clustering has been done for 3.2.1.107....
No hits for NP_002290.1
Sequence has been collected for 3.2.1.108....
Feature has been extracted for 3.2.1.108....




Clustering has been done for 3.2.1.108....
Sequence has been collected for 3.2.1.109....
Feature has been extracted for 3.2.1.109....




3.2.1.109 has 2 samples which is less than 5 min_samples 
No hits for NP_811105.1
No hits for NP_346563.1
No hits for NP_180377.1
Sequence has been collected for 3.2.1.111....
Feature has been extracted for 3.2.1.111....




Clustering has been done for 3.2.1.111....
No hits for ABE88173.1
No hits for XP_003629280.1
No hits for ABZ73160.1
No hits for AAM14008.1
No hits for AAG12868.1
No hits for AAG12762.1
No hits for AAG29692.1
No hits for CBW48349.1
No hits for NP_506006.1
No hits for NP_501577.1
No hits for ABB36773.2
No hits for CAB50704.1
No hits for CAI19713.1
No hits for CAI19714.1
No hits for CAB37989.1
No hits for CAB75695.1
No hits for CAB75695.2
No hits for CAI20315.1
No hits for CAC10451.1
No hits for CAH71079.1
No hits for CAH71080.1
No hits for CAI22315.1
No hits for CAI22316.1
No hits for CAI22317.1
No hits for CAI22318.1
No hits for CAH72871.2
No hits for CAH72887.1
No hits for CAI12781.1
No hits for NP_009161.1
No hits for NP_057303.1
No hits for CAI26211.1
No hits for CAI26213.1
No hits for CAI26214.1
No hits for CAM19090.1
No hits for CAM19091.1
No hits for CAM22047.1
No hits for XP_193956.3
No hits for EDK02328.1
No hits for XP_368250.1
No hits for NP_012665.1
No hits for NP_012074.1
No

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.113....
No hits for CAB00104.1
No hits for NP_505995.1
No hits for AAF54376.1
No hits for NP_002363.1
No hits for NP_032575.1
No hits for NP_766491.1
Sequence has been collected for 3.2.1.114....
Feature has been extracted for 3.2.1.114....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.114....
Sequence has been collected for 3.2.1.116....
Feature has been extracted for 3.2.1.116....




3.2.1.116 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.117....
Feature has been extracted for 3.2.1.117....




Clustering has been done for 3.2.1.117....
Sequence has been collected for 3.2.1.118....
Feature has been extracted for 3.2.1.118....




Clustering has been done for 3.2.1.118....
Sequence has been collected for 3.2.1.119....
3.2.1.119 : doesnt have sequences
Sequence has been collected for 3.2.1.120....
Feature has been extracted for 3.2.1.120....




3.2.1.120 has 2 samples which is less than 5 min_samples 
No hits for NP_347172.1
No hits for NP_350016.1
Sequence has been collected for 3.2.1.122....
Feature has been extracted for 3.2.1.122....




Clustering has been done for 3.2.1.122....
Sequence has been collected for 3.2.1.123....
Feature has been extracted for 3.2.1.123....




Clustering has been done for 3.2.1.123....
No hits for NP_809926.1
No hits for NP_207375.1
Sequence has been collected for 3.2.1.124....
Feature has been extracted for 3.2.1.124....




Clustering has been done for 3.2.1.124....
Sequence has been collected for 3.2.1.125....
Feature has been extracted for 3.2.1.125....




3.2.1.125 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.127....
Feature has been extracted for 3.2.1.127....




3.2.1.127 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.128....
Feature has been extracted for 3.2.1.128....




3.2.1.128 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.129....
Feature has been extracted for 3.2.1.129....




Clustering has been done for 3.2.1.129....
No hits for CAI17346.1
No hits for CAI17347.1
No hits for NP_542963.1
No hits for XP_346734.1
Sequence has been collected for 3.2.1.130....
Feature has been extracted for 3.2.1.130....




Clustering has been done for 3.2.1.130....
No hits for AAC98128.2
No hits for ZP_08159559.1
Sequence has been collected for 3.2.1.131....
Feature has been extracted for 3.2.1.131....




Clustering has been done for 3.2.1.131....
No hits for NP_832437.1
No hits for NP_624986.1
No hits for NP_823026.1
Sequence has been collected for 3.2.1.132....
Feature has been extracted for 3.2.1.132....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.132....
No hits for NP_110736.1
No hits for ZP_00047084.1
No hits for NP_578599.1
Sequence has been collected for 3.2.1.133....
Feature has been extracted for 3.2.1.133....




Clustering has been done for 3.2.1.133....
No hits for NP_812615.1
Sequence has been collected for 3.2.1.135....
Feature has been extracted for 3.2.1.135....




Clustering has been done for 3.2.1.135....
Sequence has been collected for 3.2.1.136....
Feature has been extracted for 3.2.1.136....
3.2.1.136 has 3 samples which is less than 5 min_samples 
No hits for NP_228244.1
No hits for NP_228561.1
Sequence has been collected for 3.2.1.139....
Feature has been extracted for 3.2.1.139....




Clustering has been done for 3.2.1.139....
Sequence has been collected for 3.2.1.140....
Feature has been extracted for 3.2.1.140....




Clustering has been done for 3.2.1.140....
No hits for CAC23737.1
No hits for NP_343482.1
No hits for NP_601327.2
No hits for NP_294187.1
Sequence has been collected for 3.2.1.141....
Feature has been extracted for 3.2.1.141....




Clustering has been done for 3.2.1.141....
No hits for NP_809178.1
No hits for NP_823285.1
Sequence has been collected for 3.2.1.145....
Feature has been extracted for 3.2.1.145....




Clustering has been done for 3.2.1.145....
No hits for CAB08388.1
Sequence has been collected for 3.2.1.146....
Feature has been extracted for 3.2.1.146....




Clustering has been done for 3.2.1.146....
No hits for NP_197972.1
No hits for AAG12767.1
No hits for NP_175558.1
No hits for AEZ01595.1
Sequence has been collected for 3.2.1.147....
Feature has been extracted for 3.2.1.147....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.147....
No hits for AAS83105.1
No hits for ADV40931.1
No hits for CAK97604.1
Sequence has been collected for 3.2.1.149....
Feature has been extracted for 3.2.1.149....




Clustering has been done for 3.2.1.149....
Sequence has been collected for 3.2.1.150....
Feature has been extracted for 3.2.1.150....
3.2.1.150 has 3 samples which is less than 5 min_samples 
No hits for NP_642837.1
No hits for XP_386027.1
No hits for XP_361895.1
No hits for NP_823032.1
No hits for NP_823750.1
No hits for NP_630626.1
No hits for AAZ55647.1
No hits for ZP_00056977.1
No hits for NP_637119.1
No hits for NP_642098.1
Sequence has been collected for 3.2.1.151....
Feature has been extracted for 3.2.1.151....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.151....
No hits for AAK76608.1
No hits for AAM14288.1
No hits for NP_172375.1
No hits for NP_563833.1
No hits for BAD89079.1
Sequence has been collected for 3.2.1.152....
Feature has been extracted for 3.2.1.152....




Clustering has been done for 3.2.1.152....
No hits for AAY81958.1
Sequence has been collected for 3.2.1.153....
Feature has been extracted for 3.2.1.153....




Clustering has been done for 3.2.1.153....
Sequence has been collected for 3.2.1.154....
Feature has been extracted for 3.2.1.154....
3.2.1.154 has 3 samples which is less than 5 min_samples 
No hits for NP_242971.1
No hits for NP_644548.1
Sequence has been collected for 3.2.1.156....
Feature has been extracted for 3.2.1.156....




Clustering has been done for 3.2.1.156....
Sequence has been collected for 3.2.1.157....
Feature has been extracted for 3.2.1.157....




Clustering has been done for 3.2.1.157....
Sequence has been collected for 3.2.1.158....
Feature has been extracted for 3.2.1.158....
3.2.1.158 has 3 samples which is less than 5 min_samples 
No hits for BAQ55620.2
No hits for NP_627684.1
Sequence has been collected for 3.2.1.159....
Feature has been extracted for 3.2.1.159....




Clustering has been done for 3.2.1.159....
Sequence has been collected for 3.2.1.161....
Feature has been extracted for 3.2.1.161....




3.2.1.161 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.162....
Feature has been extracted for 3.2.1.162....




3.2.1.162 has 2 samples which is less than 5 min_samples 
No hits for BAA76779.1
No hits for XP_052620.6
Sequence has been collected for 3.2.1.163....
Feature has been extracted for 3.2.1.163....




Clustering has been done for 3.2.1.163....
No hits for NP_826382.1
No hits for XP_330352.1
Sequence has been collected for 3.2.1.164....
Feature has been extracted for 3.2.1.164....




Clustering has been done for 3.2.1.164....
No hits for AAX62629.1
No hits for NP_822398.1
No hits for NP_142480.1
Sequence has been collected for 3.2.1.165....
Feature has been extracted for 3.2.1.165....




Clustering has been done for 3.2.1.165....
No hits for CAK05012.1
No hits for NP_006656.1
Sequence has been collected for 3.2.1.166....
Feature has been extracted for 3.2.1.166....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.166....
Sequence has been collected for 3.2.1.167....
Feature has been extracted for 3.2.1.167....
3.2.1.167 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.168....




Feature has been extracted for 3.2.1.168....




3.2.1.168 has 2 samples which is less than 5 min_samples 
No hits for NP_813306.1
No hits for NP_269657.1
No hits for BAA31654.2
Sequence has been collected for 3.2.1.169....
Feature has been extracted for 3.2.1.169....




Clustering has been done for 3.2.1.169....
Sequence has been collected for 3.2.1.170....
Feature has been extracted for 3.2.1.170....




Clustering has been done for 3.2.1.170....
Sequence has been collected for 3.2.1.171....
Feature has been extracted for 3.2.1.171....




Clustering has been done for 3.2.1.171....
No hits for CAB14990.1
No hits for NP_813087.1
Sequence has been collected for 3.2.1.172....
Feature has been extracted for 3.2.1.172....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.172....
No hits for NP_813057.1
No hits for NP_813064.1
Sequence has been collected for 3.2.1.173....
Feature has been extracted for 3.2.1.173....
3.2.1.173 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.174....
Feature has been extracted for 3.2.1.174....
3.2.1.174 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.175....
Feature has been extracted for 3.2.1.175....




Clustering has been done for 3.2.1.175....
No hits for XP_327626.1
No hits for XP_367082.1
No hits for AAC38571.2
No hits for AAZ55992.1
Sequence has been collected for 3.2.1.176....
Feature has been extracted for 3.2.1.176....




Clustering has been done for 3.2.1.176....
No hits for CAC24044.1
No hits for NP_344333.1
No hits for NP_642101.1
Sequence has been collected for 3.2.1.177....
Feature has been extracted for 3.2.1.177....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.177....
Sequence has been collected for 3.2.1.178....
Feature has been extracted for 3.2.1.178....




Clustering has been done for 3.2.1.178....
Sequence has been collected for 3.2.1.179....
Feature has been extracted for 3.2.1.179....




3.2.1.179 has 1 samples which is less than 5 min_samples 
No hits for NP_736323.1
No hits for NP_357886.1
No hits for NP_268879.1
Sequence has been collected for 3.2.1.180....
Feature has been extracted for 3.2.1.180....
3.2.1.180 has 3 samples which is less than 5 min_samples 
No hits for EED49498.1
Sequence has been collected for 3.2.1.181....
Feature has been extracted for 3.2.1.181....
3.2.1.181 has 4 samples which is less than 5 min_samples 
No hits for NP_809909.1
Sequence has been collected for 3.2.1.185....
Feature has been extracted for 3.2.1.185....




Clustering has been done for 3.2.1.185....
Sequence has been collected for 3.2.1.186....
Feature has been extracted for 3.2.1.186....




3.2.1.186 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.187....
Feature has been extracted for 3.2.1.187....




3.2.1.187 has 2 samples which is less than 5 min_samples 
No hits for S50756
Sequence has been collected for 3.2.1.188....
Feature has been extracted for 3.2.1.188....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.188....
Sequence has been collected for 3.2.1.197....
Feature has been extracted for 3.2.1.197....
3.2.1.197 has 4 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.199....
Feature has been extracted for 3.2.1.199....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.199....
No hits for BAA02908.1
Sequence has been collected for 3.2.1.200....
Feature has been extracted for 3.2.1.200....




Clustering has been done for 3.2.1.200....
Sequence has been collected for 3.2.1.201....
Feature has been extracted for 3.2.1.201....




Clustering has been done for 3.2.1.201....
Sequence has been collected for 3.2.1.204....
Feature has been extracted for 3.2.1.204....
3.2.1.204 has 4 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.205....
Feature has been extracted for 3.2.1.205....




3.2.1.205 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.206....
Feature has been extracted for 3.2.1.206....




3.2.1.206 has 1 samples which is less than 5 min_samples 
No hits for NP_269660.1
No hits for NP_810791.1
No hits for NP_812042.1
No hits for NP_812905.1
No hits for NP_812684.1
No hits for NP_812902.1
No hits for NP_810682.1
No hits for NP_812769.1
No hits for NP_638242.1
Sequence has been collected for 3.2.1.207....
Feature has been extracted for 3.2.1.207....




Clustering has been done for 3.2.1.207....
Sequence has been collected for 3.2.1.211....
Feature has been extracted for 3.2.1.211....




3.2.1.211 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.212....
Feature has been extracted for 3.2.1.212....




Clustering has been done for 3.2.1.212....
Sequence has been collected for 3.2.1.213....
Feature has been extracted for 3.2.1.213....




3.2.1.213 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.214....
Feature has been extracted for 3.2.1.214....




3.2.1.214 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.215....
Feature has been extracted for 3.2.1.215....




3.2.1.215 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.216....
Feature has been extracted for 3.2.1.216....




3.2.1.216 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.217....
Feature has been extracted for 3.2.1.217....
Clustering has been done for 3.2.1.217....
Mission completed in 735.599 seconds




In [24]:
from tqdm import tqdm

# Column names for the '$'-delimited results table; written once as the header row.
titl=['EC_number','Method','label_type','lambda','FMI','ex_groups','pred_groups','distribution','total','CAZy_partial','Fasta_partial','X_aa']

# Export the header followed by one line per entry of total_data.
# Each entry is concatenated with '\n', so entries must already be strings
# (presumably '$'-joined records built by the clustering cell -- TODO confirm).
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way; the encoding is pinned to UTF-8 to match how the
# notebook reads its input files, instead of relying on the platform default.
with open('ec_number_cluster_ghf.txt','w',encoding='utf-8') as outpt:
    outpt.write('$'.join(titl)+'\n')
    for line in tqdm(total_data):
        outpt.write(line+'\n')

100%|█████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1073423.77it/s]
