### Clustering using PseAAC features

##### <u> Part 1: Extracting protein sequence data </u>

In [4]:
import re

class CAZy_data:
    def __init__(self,filename1,filename2):
        self.data,self.acc,self.seq=[],[],[]
        with open(filename1,'r',encoding='utf-8') as inpt:
            for each in inpt:
                self.data.append(each.rstrip().split('$'))
        with open(filename2,'r',encoding='utf-8') as inpt1:
            for each1 in inpt1:
                if each1.startswith('>'):
                    self.acc.append(each1.rstrip())
                else:
                    self.seq.append(each1.rstrip())
            
                  
    def data_fetch(self,typ,position):
        typ_data=[]
        if typ=='all':
            typ_data=self.data
        else:
            for each in self.data:
                mult=each[position].split(' ')
                if len(mult)==1:#### In case typ = EC, Multi EC number and protein with no EC number are ignore.
                    if mult[0]==typ:
                        typ_data.append(each)
        return typ_data
    
    def EC_GH(self,ec_no,gh_fam):
        self.fasta=[]
        cazy_ec=self.data_fetch(ec_no,1)
        cazy_gh=self.data_fetch(gh_fam,-1)
        self.common_data=[[i[0],i[1],i[3],i[4],i[-2],i[-1]] for i in cazy_ec if i in cazy_gh]
        rm_prt, rm_prt_fasta=[],[]
        for each in range(len(self.common_data)):
            t=self.common_data[each]
            if self.prtn_filter(t[0]):
                all_acc=t[3].split(' ')
                if all_acc[0]!='':
                    for e_acc in all_acc:
                        e_seq=self.seq_fetch(e_acc)
                        try:
                            create_error=0/len(e_seq) # to remove accession number which doesnt have hits
                            self.fasta.append(f'>{e_acc}${t[0]}${t[1]}${t[2]}${t[-2]}${t[-1]}')
                            self.fasta.append(e_seq[0])
                        except ZeroDivisionError:
                            rm_prt_fasta.append(e_acc)
            else:
                rm_prt.append(t)
#         print('Total number of sequences:',len(self.fasta)/2)
#         print('Number of removed partial or fragment proteins (CAZy):',len(rm_prt))
#         print('Number of removed partial or fragment proteins (Fasta):',len(rm_prt_fasta))
        return self.fasta,rm_prt,rm_prt_fasta
    def prtn_filter(self,prt_name):
        hit=1
        if re.search('partial|fragment',prt_name.lower()):
            hit-=1
        return hit
            
    def seq_fetch(self,accession):
        hits=[]
        temp=0
        for each in range(len(self.acc)):
            if re.search(f'{accession}\D',self.acc[each]):
                temp+=1
                
                if self.prtn_filter(self.acc[each]): # remove partial| fragment accession numbers from GenBank description
                    hits.append(self.seq[each])
        if temp>1:
            print(f'Multiple hits for {accession}')
        elif temp==0:
            print(f'No hits for {accession}')
        return hits
     
In_data=CAZy_data('D:/After_4_4_22/data/CAZy_23_6_22/cazy_char_10_6_22.txt','D:/After_4_4_22/data/CAZy_23_6_22/char_gh_23_6_22.txt')

##### <u> Part 2: Extracting feature from protein sequences </u>

In [5]:
import numpy as np
from scipy import stats
import pandas as pd

# 1: Hydrophobicity, 2: Hydrophilicity, 3: mass, 4: pk1, 5:pk2, 6:pi, 20: 14 scale, 60: Tanford
class Standard_values:
    def __init__(self,filename):
        self.data=[]
        with open(filename,'r') as inpt:
            for each in inpt:
                self.data.append(each.rstrip().split(','))
        del self.data[0]
        
    def get_prop(self,prop):
        got_prop,amino_acid={},{}
        for each in prop:
            got_prop[each]=self.properties(each)
        d=1
        for each in got_prop[1].keys():
            amino_acid[d]=each
            d+=1
        return amino_acid,got_prop
    
    def properties(self, val):
        temp,norm={},{}
        for each in self.data:
            temp[each[0]]=float(each[val])
        relative=stats.zscore(np.array(list(temp.values())))
        for a,b in zip(temp.keys(),relative):
            norm[a]=b
        return norm

class Sequence:
    def __init__(self,filename):
        self.data=[]
        if type(filename)==str:
            with open(filename,'r') as inpt:
                for each in inpt:
                    self.data.append(each.rstrip())
        else:
            self.data=filename
                       
    def output(self):
        a,s,l=[],[],[]
        unusual=0
        for ele in self.ml_sl():
            if ele.startswith('>'):
                a.append(ele)
            else:
                if re.search('[UZOBJX]',ele.upper()):
#                     print(a[-1])
                    del a[-1]
                    unusual+=1
                    continue
                l.append(len(ele))
                s.append(ele.upper())
#         print('The length of the smallest sequence:',min(l))
#         print('Sequence with "X" present:',unusual)
        return a,s,l,unusual
             
    def ml_sl(self):
        acc_seq=[]
        for k in range(len(self.data)):
            if self.data[k].startswith('>'):
                acc_seq.append(self.data[k])
                join_=0
                for l in range(k+1,len(self.data)):
                    if self.data[l].startswith('>') == False:
                        join_+=1
                    else:
                        break
                acc_seq.append(''.join(self.data[k+1:k+1+join_]))
        return acc_seq

class Pseaac:
    def __init__(self,filename):
        self.filename=filename
    def collect(self,lamb,w,pro,nf):
        val=[]
        val.append(['#']+[ea for ea in keys.values()]+['\u03BB'+str(eac+1) for eac in range(lamb)])
        for e_seq in range(len(seq)):# single sequence taken for test
            q=self.pse(seq[e_seq],lamb,w,pro,nf)
            tem=acc[e_seq].split(' ')[0][1:]
            val.append([acc[e_seq]]+q)
        df=pd.DataFrame(val[1:],columns=val[0])
#         print('PseAAC feature have been extracted!!!')
        return df
        
    def pse(self,data,lamb,w,pro,nf):
        thet=self.theta(data,lamb,pro)
        deno=1+(w*sum(thet.values()))
        p=[]
        if nf==1:
            norm=(len(data))
        else:
            norm=1
        for u in range(1,21+lamb):
            if u>=1 and u<=20:
    #             print(u,'natural')
                num=data.count(keys[u])/norm # frequency
                p.append(num/deno)
            elif u>=21 and u<=20+lamb:
    #             print(u,'pseudo')
                num=w*thet[u-20]
                p.append(num/deno)
        return p
    
    def theta(self,data,lamb,pro):
        the={}
        for u in range(1,lamb+1):
            the[u]=(1/(len(data)-u))*self.rel_cal(data,u,pro)
        return the
    
    def rel_cal(self,data,v,pro):
        tem=[]
        for u in range(len(data)-v):
            te=[]
            for u1 in pro:
                x=((values[u1][data[u]])-(values[u1][data[u+v]]))**2
                te.append(x)
    #             print(u1,u,u+v,data[u],data[u+v],x)
            tem.append(sum(te)/len(pro))
        return sum(tem)

keys,values=Standard_values('D:/After_4_4_22/data/CAZy_23_6_22/7_98_hydrophobicity.csv').get_prop([1,2,3,4,5,6,20,60])
# pseaac_data['Length']=stats.zscore(min_len)

In [6]:
'''Clustering Algorithm'''
import os
import time
import matplotlib.pyplot as plt
from sklearn import cluster, mixture, manifold, decomposition, preprocessing,metrics
import random
from collections import Counter,defaultdict
import copy

class clustering:
    rs=77
#     rn=random.randint(1,99)
    def __init__(self,folder,data,n,cat):
        self.x=data.iloc[:,1:].values
        self.y=data.iloc[:,0]
        self.folder=folder
        self.cat=cat
        self.anno_label={0:'acc',1:'prtn',2:'ec',3:'org',4:'species',5:'ghf'}
        temp=[i.split('$')[cat[0]] for i in self.y]
        lab=list(set(temp))
        self.true_lab=[lab.index(j) for j in temp]
        try:
            n.isalpha()
            self.n=len(lab)
        except AttributeError:
            self.n=n
        self.temp1=','.join([f'{k}:{v}' for k,v in dict(Counter(temp)).items()])
        
    def kmeans(self):
        start = time.time()
        self.names = 'km'
        kmeans = cluster.KMeans(n_clusters=self.n,random_state=clustering.rs) # Number of clusters
        self.labels = kmeans.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def affinity(self):
        start = time.time()
        self.names = 'apc'
        apc = cluster.AffinityPropagation(random_state=clustering.rs)
        self.labels = apc.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def meanshift(self):
        start = time.time()
        self.names = 'ms'
        ms = cluster.MeanShift()
        self.labels = ms.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def spectral(self):
        start = time.time()
        self.names = 'spec'
        spectral = cluster.SpectralClustering(n_clusters=self.n,assign_labels="discretize",random_state=clustering.rs) # Number of clusters
        self.labels = spectral.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def agglomerative(self):
        start = time.time()
        self.names = 'agglo'
        agglo = cluster.AgglomerativeClustering(n_clusters=self.n) # Number of clusters
        self.labels = agglo.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def dbscan(self):
        start = time.time()
        self.names = 'dbs'
        new_x=preprocessing.StandardScaler().fit_transform(self.x)
        dbs = cluster.DBSCAN()
        self.labels = dbs.fit_predict(new_x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def optics(self):
        start = time.time()
        self.names = 'opt'
        opt = cluster.OPTICS()
        self.labels = opt.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def gaussian(self):
        start = time.time()
        self.names = 'gm'
        gm = mixture.GaussianMixture(n_components=self.n,random_state=clustering.rs) # Number of Clusters
        self.labels = gm.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()
        
    def birch(self):
        start = time.time()
        self.names = 'bir'
        brc = cluster.Birch() # Number of clusters
        self.labels = brc.fit_predict(self.x)
        end = time.time()
        self.t = round((end-start),3)
        return self.label_save()

    def label_save(self):
        dfout = pd.DataFrame({'Accession': self.y,  'predicted': self.labels, 'expected':self.true_lab})
        try:
            os.mkdir(self.folder)
        except FileExistsError:
            pass
        dfout.to_csv(f'{self.folder}\ML_{self.names}_{len(set(self.labels))}.txt',sep='\t', index=False)
        self.file()
        return self.analysis()
    
    def analysis(self):
        value=metrics.fowlkes_mallows_score(self.true_lab,self.labels)
        tot_val=[self.names,self.anno_label[self.cat[0]],str(lambda_value),str(round(value,3)),str(self.n),str(len(set(self.labels))),self.temp1,str(len(self.true_lab))]
        return tot_val
    
    def file(self):
        try:
            os.mkdir(f'{self.folder}/table')
        except FileExistsError:
            pass
        dd=defaultdict(list)
        for i in range(len(self.labels)):
            dd[self.labels[i]].append(self.y[i])
        self.excel(dict(sorted(dd.items())))
        
    def excel(self,anno):
        all_anno={}
        for i in self.cat:
            temp={}
            for j,k in anno.items():
                te=[]
                for l in k:
                    te.append(l.split('$')[i])
                temp[j]=dict(Counter(te))
            df=pd.DataFrame(temp).fillna(0).astype(int)
            df.loc['Total']=df.sum(axis=0)
            df.loc[:,'Total']=df.sum(axis=1)
            df.to_excel(f'{self.folder}/table/{self.names}_{len(set(self.labels))}_{self.anno_label[i]}.xlsx')

In [7]:
class ec_data:
    def __init__(self,file):
        self.data=[]
        with open(file,'r') as inpt:
            for i in inpt:
                self.data.append(i.rstrip())

    def cazy(self,dom,typ,pos):
        temp=[]
        for i in self.data:
            temp.append(i.split('$')[pos].split(' '))
        return self.domain(temp,dom,typ)
    
    def domain(self,ec_n,n,include):
        single,multi=[],[]
        for i in ec_n:
            if len(i)<=n:
                single.extend(i)
            else:
                multi.append(i)
        print('Number of single domain:',len(single))
        print(f'Number of multi domain (>{n}):',len(multi))
        if include=='m':
            for j in multi:
                for k in j:
                    single.append(k)
            print('After including multi domains:',len(single))
        return single
    
class analysis:
    def __init__(self,data):
        self.data=data
        
    def non_ec(self):
        return list(set(self.data))
    
    def non_kegg_count(self):
        temp=Counter(self.data)
        return self.specific_ec(dict(temp))
    
    def kegg_count(self,label):
        temp=defaultdict(int)
        for j,k in zip(self.data,label):
            temp[j]+=int(k)
        return self.specific_ec(dict(temp))
    
    def specific_ec(self,dat):
        caazy=[]
        for u,v in dat.items():
            if '3.2.1.' in u:
                try:
                    caazy.append([u,int(u.split('.')[-1])])
                except ValueError:
                    print(u)
        cc_ec=[x[0] for x in sorted(caazy, key=lambda x:x[1])]
        return cc_ec

In [8]:
# In a_cazy.cazy()
# '1' stands for single domain, and 'm' stands for to include mutli domain in single domain sequences
# second '1' stands for EC number and -1 stands for GH family
data_type='GH'
a_cazy=ec_data('D:/After_4_4_22/data/CAZy_23_6_22/cazy_char_10_6_22.txt')
if data_type=='EC':
    c_cazy=analysis(a_cazy.cazy(1,'s',1)).non_kegg_count() #for classifing GH families in EC number
elif data_type=='GH':
    c_cazy=analysis(a_cazy.cazy(1,'s',-1)).non_ec()

Number of single domain: 7377
Number of multi domain (>1): 0


In [13]:
t1=time.perf_counter()
total_data=[]
total_rm_cazy,total_rm_genbank={},{}
value_error,no_entry=[],[]
for each in c_cazy:
    ec_number='all'
    gh_family=each
    cazy_acc_seq,rm_cazy,rm_genbank=In_data.EC_GH(ec_number,gh_family) # write all to fetch all the EC number or all the GH family
    total_rm_cazy[each],total_rm_genbank[each]=rm_cazy,rm_genbank
    acc,seq,min_len,x_aa=Sequence(cazy_acc_seq).output()
    print(f'Sequence has been collected for {each}....')
    min_lamb=30
    try:
        if min(min_len)<min_lamb:
            lambda_value=min(min_len)
        else:
            lambda_value=min_lamb
    except ValueError:
        print(each,': doesnt have sequences')
        no_entry.append(each)
        continue
    pseaac_data=Pseaac(f'PAAC_{ec_number}_{gh_family}_L{lambda_value}.txt').collect(lambda_value,0.05,[60,2,3],1)
    print(f'Feature has been extracted for {each}....')
    ec_=ec_number.replace('.','_')
    # In the below statement 'auto' means it takes automatic clusters based on number labels given, you can also choose any number.
    clust=clustering(f'{gh_family}_{ec_}',pseaac_data,'auto',[2]) # 0:'acc',1:'prtn',2:'ec',3:'org',4:'species',5:'ghf'
    try:
        km=clust.kmeans()
        apc=clust.affinity()
        ms=clust.meanshift()
        spec=clust.spectral()
        agglo=clust.agglomerative()
        dbs=clust.dbscan()
        opt=clust.optics()
        gm=clust.gaussian()
        bir=clust.birch()
    except ValueError:
        print(f'{each} has {len(acc)} samples which is less than 5 min_samples ')
        value_error.append([each,len(acc)])
        continue
    print(f'Clustering has been done for {each}....')
    all_clust={'km':km,'apc':apc,'ms':ms,'spec':spec,'agglo':agglo,'dbs':dbs,'opt':opt,'gm':gm,'bir':bir}
    for aa in all_clust.values():
        bb=[each]+aa+[str(len(rm_cazy)),str(len(rm_genbank))]+[str(x_aa)]
        total_data.append('$'.join(bb))
t2=time.perf_counter()
print('Mission completed in',round(t2-t1,3),'seconds')

Sequence has been collected for GH141....
Feature has been extracted for GH141....
GH141 has 2 samples which is less than 5 min_samples 
No hits for NP_561782.1




No hits for NP_000254.1
Sequence has been collected for GH89....
Feature has been extracted for GH89....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH89....
Sequence has been collected for GH104....
Feature has been extracted for GH104....




Clustering has been done for GH104....
Sequence has been collected for GH154....
Feature has been extracted for GH154....




GH154 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH164....
Feature has been extracted for GH164....




GH164 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH127....
Feature has been extracted for GH127....
Clustering has been done for GH127....
Sequence has been collected for GH156....
Feature has been extracted for GH156....




GH156 has 1 samples which is less than 5 min_samples 
No hits for NP_811870.1
Sequence has been collected for GH115....
Feature has been extracted for GH115....
Clustering has been done for GH115....
No hits for NP_812901.1




No hits for NP_811112.1
No hits for NP_812439.1
No hits for NP_812873.1
No hits for NP_812695.1
No hits for NP_811542.1
No hits for NP_813003.1
No hits for NP_810791.1
No hits for NP_812042.1
No hits for NP_812905.1
No hits for NP_812874.1
No hits for NP_812684.1
No hits for NP_812902.1
No hits for NP_811860.1
No hits for NP_810682.1
No hits for NP_809945.1
No hits for NP_813004.1
No hits for NP_812769.1
No hits for NP_812984.1
No hits for NP_812876.1
No hits for NP_811024.1
No hits for NP_812442.1
No hits for NP_815878.1
No hits for NP_346562.1
No hits for NP_638242.1
Sequence has been collected for GH92....
Feature has been extracted for GH92....
Clustering has been done for GH92....
Sequence has been collected for GH90....




Feature has been extracted for GH90....




Clustering has been done for GH90....
Sequence has been collected for GH107....
Feature has been extracted for GH107....




Clustering has been done for GH107....
Sequence has been collected for GH111....
Feature has been extracted for GH111....




GH111 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH91....
Feature has been extracted for GH91....




Clustering has been done for GH91....
No hits for AAL42528.1
No hits for AAK87314.1
No hits for NP_354529.1
No hits for NP_532212.1
No hits for AAX62629.1
No hits for BAC56904.1
No hits for BAC56899.1
No hits for NP_809905.1
No hits for NP_809371.1
No hits for NP_809909.1
No hits for NP_813062.1
No hits for NP_809896.1
No hits for NP_813067.1
No hits for NP_813092.1
No hits for NP_809374.1
No hits for NP_809906.1
No hits for NP_813578.1
No hits for ABE27151.1
No hits for NP_561063.1
No hits for AAC76111.1
No hits for NP_417547.2
No hits for CAD65569.1
No hits for NP_786691.1
No hits for NP_268137.2
No hits for ZP_02032394.1
No hits for NP_687713.1
No hits for Q8E0N2
No hits for NP_358159.1
No hits for NP_345155.1
No hits for NP_822398.1
No hits for AAZ54953.1
No hits for NP_228998.1
No hits for NP_228868.1
No hits for NP_229424.1
No hits for NP_638239.1
No hits for NP_643384.1
No hits for AAK76608.1
No hits for AAM14288.1
No hits for NP_172375.1
No hits for NP_563833.1
No hits for NP_0

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH2....
Sequence has been collected for GH172....
Feature has been extracted for GH172....




GH172 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH138....
Feature has been extracted for GH138....




GH138 has 1 samples which is less than 5 min_samples 
No hits for NP_813066.1
No hits for NP_813034.1
No hits for NP_813057.1
No hits for NP_809931.1
No hits for NP_813064.1
No hits for NP_228247.1
No hits for NP_637621.1
No hits for NP_638805.1
No hits for CAM91243.1
No hits for CAM33166.1
No hits for CAQ03437.1
No hits for ACP18831.1
No hits for ADU33280.1
No hits for ADU33338.1
No hits for ADU33339.1
No hits for ADU33359.1
No hits for ADU33363.1
No hits for NP_012687.1
Sequence has been collected for GH28....
Feature has been extracted for GH28....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH28....
No hits for NP_142480.1
No hits for NP_813071.1
No hits for NP_357653.1
No hits for NP_344609.1
No hits for NP_638243.1
No hits for NP_642100.1
No hits for NP_194344.1
No hits for NP_568978.1
No hits for NP_190852.1
No hits for NP_000395.1
No hits for BAC31151.1
No hits for BAD20774.1
No hits for ADO34790.1
No hits for ADO34790.2
Sequence has been collected for GH35....
Feature has been extracted for GH35....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH35....
No hits for NP_811999.1
Sequence has been collected for GH66....
Feature has been extracted for GH66....




Clustering has been done for GH66....
Sequence has been collected for GH82....
Feature has been extracted for GH82....




Clustering has been done for GH82....
No hits for AIM52834.1
No hits for ACY92456.1
No hits for AAU08014.1
No hits for AAU08003.1
No hits for ADB43097.1
No hits for ADB43097.2
Sequence has been collected for GH70....
Feature has been extracted for GH70....




Clustering has been done for GH70....
Sequence has been collected for GH131....
Feature has been extracted for GH131....
GH131 has 3 samples which is less than 5 min_samples 
No hits for NP_347172.1
No hits for NP_350016.1
No hits for AAA23550.1
No hits for BAA15523.1
No hits for NP_243094.1
No hits for NP_229631.1
No hits for NP_228244.1
No hits for NP_228561.1
No hits for NP_229086.1
Sequence has been collected for GH4....
Feature has been extracted for GH4....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH4....
No hits for NP_241102.1
No hits for AAZ56163.1
Sequence has been collected for GH81....
Feature has been extracted for GH81....




Clustering has been done for GH81....
Sequence has been collected for GH109....
Feature has been extracted for GH109....
Clustering has been done for GH109....
Sequence has been collected for GH144....




Feature has been extracted for GH144....
GH144 has 3 samples which is less than 5 min_samples 
Sequence has been collected for GH136....
Feature has been extracted for GH136....
Clustering has been done for GH136....




No hits for NP_000019.1
No hits for NP_000633.1
No hits for NP_000634.1
No hits for NP_000635.1
No hits for NP_000636.1
No hits for NP_000637.1
Sequence has been collected for GH133....
Feature has been extracted for GH133....




Clustering has been done for GH133....
No hits for CAG47121.1
No hits for CAB11202.1
No hits for NP_594914.1
No hits for CAA22810.1
No hits for NP_595364.1
Sequence has been collected for GH71....
Feature has been extracted for GH71....




Clustering has been done for GH71....
No hits for NP_579668.1
No hits for NP_578207.1
No hits for NP_578206.1
No hits for NP_343484.1
No hits for CAC23737.1
No hits for NP_343482.1
No hits for CAC23738.1
No hits for NP_343483.1
No hits for NP_110736.1
No hits for CAC02970.1
No hits for CAC02971.1
No hits for NP_213496.1
No hits for I39805
No hits for ACD93218.2
No hits for CAB12098.1
No hits for NP_388186.1
No hits for CAB14971.1
No hits for NP_390871.1
No hits for NP_812615.1
No hits for NP_812609.1
No hits for NP_812609.1
No hits for NP_810576.1
No hits for NP_695726.1
No hits for ACO05017.1
No hits for AAA63759.1
No hits for NP_149331.1
No hits for NP_601306.1
No hits for NP_601318.1
No hits for NP_601327.2
No hits for YP_604043.1
No hits for NP_294187.1
No hits for NP_294657.1
No hits for NP_295571.1
No hits for NP_295759.1
No hits for AAC73506.1
No hits for NP_414937.1
No hits for NP_415825.2
No hits for BAB91217.1
No hits for ABC18196.1
No hits for NP_241279.1
No hits for ZP_0004

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH13....
No hits for ZP_05748149.1
No hits for NP_763007.1
Sequence has been collected for GH112....
Feature has been extracted for GH112....
Clustering has been done for GH112....




No hits for ADW03597.1
No hits for ADW07102.1
Sequence has been collected for GH55....
Feature has been extracted for GH55....




Clustering has been done for GH55....
Sequence has been collected for GH161....
Feature has been extracted for GH161....
GH161 has 3 samples which is less than 5 min_samples 
No hits for NP_809909.1
Sequence has been collected for GH137....
Feature has been extracted for GH137....




GH137 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH83....
Feature has been extracted for GH83....




Clustering has been done for GH83....
No hits for NP_823032.1
No hits for NP_823750.1
No hits for NP_630626.1
No hits for AAZ55647.1
No hits for ZP_00056977.1
No hits for NP_228117.1
No hits for NP_637119.1
No hits for NP_642098.1
Sequence has been collected for GH74....
Feature has been extracted for GH74....




Clustering has been done for GH74....
No hits for NP_624986.1
Sequence has been collected for GH46....
Feature has been extracted for GH46....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH46....
No hits for EAA71743.1
Sequence has been collected for GH93....
Feature has been extracted for GH93....




Clustering has been done for GH93....
Sequence has been collected for GH149....
Feature has been extracted for GH149....




Clustering has been done for GH149....
No hits for NP_577805.1
No hits for NP_811736.1
No hits for ZP_02070069.1
No hits for AAB39377.1
No hits for AAB39378.1
No hits for EU589324
No hits for ABO93616.1
No hits for WP_106404066.1
No hits for WP_082768819.1
No hits for NP_437620.1
No hits for NP_437595.1
No hits for NP_627674.1
No hits for NP_227840.1
No hits for NP_193149.1
No hits for EED49498.1
No hits for AIX48714.1
No hits for XP_002301319.1
No hits for NP_010874.1
No hits for ABR28478.1
Sequence has been collected for GH16....
Feature has been extracted for GH16....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH16....
No hits for NP_814694.1
No hits for NP_266584.1
No hits for NP_079368.1
Sequence has been collected for GH65....
Feature has been extracted for GH65....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH65....
No hits for NP_813306.1
No hits for NP_561107.1
No hits for NP_562150.1
No hits for NP_269657.1
No hits for BAA31654.2
Sequence has been collected for GH84....
Feature has been extracted for GH84....




Clustering has been done for GH84....
No hits for NP_630049.1
No hits for BAB84113.1
No hits for EAK85571.1
Sequence has been collected for GH62....
Feature has been extracted for GH62....
Clustering has been done for GH62....




Sequence has been collected for GH118....
Feature has been extracted for GH118....
GH118 has 3 samples which is less than 5 min_samples 
No hits for ABP67986.1
No hits for NP_421160.1
No hits for AAC98129.1
No hits for NP_241934.1
No hits for NP_644357.1
No hits for NP_000194.1
No hits for XP_042678.2
No hits for NP_032351.1
Sequence has been collected for GH39....
Feature has been extracted for GH39....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH39....
Sequence has been collected for GH159....
Feature has been extracted for GH159....




GH159 has 2 samples which is less than 5 min_samples 
No hits for AAC34747.1
No hits for NP_471174.1
No hits for AHC20114.1
No hits for NP_229644.1
No hits for NP_639416.1
No hits for XP_332117.1
Sequence has been collected for GH94....
Feature has been extracted for GH94....




Clustering has been done for GH94....
No hits for AAU22320.1
No hits for CAB12407.1
No hits for NP_388469.1
No hits for YP_003844078.1
Sequence has been collected for GH26....
Feature has been extracted for GH26....




Clustering has been done for GH26....
No hits for NP_809926.1
No hits for NP_809932.1
No hits for YP_005019950.1
No hits for CAD65560.1
No hits for CAD65558.1
No hits for ZP_07366943.1
No hits for ZP_07367044.1
No hits for NP_822003.1
Sequence has been collected for GH78....
Feature has been extracted for GH78....




Clustering has been done for GH78....
No hits for BAB64564.1
No hits for BAB64563.1
No hits for XP_327626.1
No hits for XP_367082.1
No hits for XP_366456.1
No hits for XP_366456.2
No hits for CAF32516.1
No hits for CAF32517.1
No hits for CAF32518.1
Sequence has been collected for GH7....
Feature has been extracted for GH7....




Clustering has been done for GH7....
Sequence has been collected for GH108....
Feature has been extracted for GH108....




GH108 has 2 samples which is less than 5 min_samples 
Sequence has been collected for GH126....
Feature has been extracted for GH126....




GH126 has 1 samples which is less than 5 min_samples 
No hits for WP_084555785.1
No hits for AAC00558.1
Sequence has been collected for GH0....
Feature has been extracted for GH0....




Clustering has been done for GH0....
No hits for CAB14990.1
No hits for NP_809924.1
No hits for NP_813019.1
No hits for NP_813087.1
Sequence has been collected for GH105....
Feature has been extracted for GH105....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH105....
Sequence has been collected for GH165....
Feature has been extracted for GH165....




GH165 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH110....
Feature has been extracted for GH110....




Clustering has been done for GH110....
No hits for CAC24100.1
No hits for NP_344437.1
No hits for BAB67666.1
No hits for NP_378557.1
No hits for AAD30994.1
No hits for ABF72189.1
No hits for NP_346329.1
No hits for NP_624603.1
No hits for NP_228997.1
No hits for CAB65477.1
No hits for ZP_06242255.1
No hits for NP_992831.1
No hits for BAB64768.1
No hits for BAB92522.1
Sequence has been collected for GH36....
Feature has been extracted for GH36....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH36....
No hits for NP_809923.1
No hits for NP_346093.1
No hits for NP_642102.1
Sequence has been collected for GH95....
Feature has been extracted for GH95....




Clustering has been done for GH95....
Sequence has been collected for GH124....
Feature has been extracted for GH124....




GH124 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH173....
Feature has been extracted for GH173....




GH173 has 2 samples which is less than 5 min_samples 
No hits for CAC24067.1
No hits for NP_344369.1
No hits for NP_344370.1
No hits for NP_810538.1
No hits for NP_811105.1
No hits for NP_811882.1
No hits for NP_813047.1
No hits for NP_346563.1
No hits for NP_228118.1
No hits for NP_638236.1
No hits for NP_180377.1
No hits for NP_788497.1
No hits for NP_788498.1
No hits for CAH74004.1
No hits for CAH74004.2
No hits for NP_000138.1
No hits for CAB53746.1
No hits for CAD92494.1
No hits for CAD92495.1
No hits for NP_036694.1
Sequence has been collected for GH29....
Feature has been extracted for GH29....




Clustering has been done for GH29....
No hits for NP_191958.2
No hits for NP_921898.1
Sequence has been collected for GH14....
Feature has been extracted for GH14....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH14....
No hits for NP_815238.1
No hits for NP_418809.1
No hits for AAC74277.1
No hits for NP_415711.1
No hits for NP_208363.1
No hits for NP_207439.1
Sequence has been collected for GH23....
Feature has been extracted for GH23....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH23....
No hits for NP_126623.1
No hits for NP_143072.1
No hits for EFL62381.1
No hits for AAU23613.1
No hits for NP_389695.1
No hits for NP_696497.1
No hits for NP_622045.1
No hits for ADK22147.1
No hits for CAB01405.1
No hits for ABZ70413.1
No hits for AAD48494.2
No hits for NP_347462.1
No hits for NP_241469.1
No hits for WP_018063499.1
No hits for CAB08388.1
No hits for AAC02964.1
No hits for BAA12826.1
No hits for ZP_08159266.1
No hits for ZP_08157835.1
No hits for WP_009984467.1
No hits for AAZ54938.1
No hits for AAZ56745.1
No hits for AAZ54939.1
No hits for NP_229032.1
No hits for NP_229549.1
No hits for NP_229550.1
No hits for YP_001245126.1
No hits for AHA42547.1
No hits for BAG69482.1
No hits for ZP_06241352.1
No hits for WP_116885687.1
No hits for NP_637144.1
No hits for NP_638867.1
No hits for NP_298108.1
No hits for NP_171733.1
No hits for AFR92751.1
No hits for XP_324942.1
No hits for EEF05441.1
No hits for XP_002475436.1
No hits for AAL33630.

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH5....
Sequence has been collected for GH72....
Feature has been extracted for GH72....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH72....
No hits for NP_578085.1
No hits for NP_577802.1
No hits for NP_578937.1
No hits for NP_578171.1
No hits for NP_142340.1
No hits for NP_142473.1
No hits for CAC24042.1
No hits for NP_344331.1
No hits for YP_256448.1
No hits for NP_111204.1
No hits for NP_391805.1
No hits for ABP70047.1
No hits for YP_066184.1
No hits for NP_242789.1
No hits for ABR73190.1
No hits for NP_386997.1
No hits for BAB45117.1
No hits for NP_372713.1
No hits for NP_721380.1
No hits for NP_721491.1
No hits for NP_721937.1
No hits for NP_345092.1
No hits for NP_269656.1
No hits for NP_625353.1
No hits for NP_631601.1
No hits for AAZ54975.1
No hits for ADI56259.1
No hits for NP_936184.1
No hits for NP_197972.1
No hits for AEE33889.1
No hits for NP_176375.1
No hits for AAG21562.1
No hits for NP_187303.1
No hits for AAK25950.1
No hits for AAK64096.1
No hits for AAG12895.1
No hits for AAG12767.1
No hits for NP_175558.1
No hits for NP_198505.1
No hits for AEZ01595.1
No hits for S50

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH1....
No hits for EAK92130.1
No hits for XP_711361.1
Sequence has been collected for GH132....
Feature has been extracted for GH132....




GH132 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH166....
Feature has been extracted for GH166....




GH166 has 1 samples which is less than 5 min_samples 
No hits for WP_029429093.1
Sequence has been collected for GH157....
Feature has been extracted for GH157....




GH157 has 1 samples which is less than 5 min_samples 
No hits for ABZ73160.1
No hits for AAM14008.1
No hits for AAG12868.1
No hits for AAG12762.1
No hits for AAG29692.1
No hits for CBW48349.1
No hits for NP_506006.1
No hits for NP_501577.1
No hits for ABB36773.2
No hits for CAB50704.1
No hits for CAI19713.1
No hits for CAI19714.1
No hits for CAB37989.1
No hits for CAB75695.1
No hits for CAB75695.2
No hits for CAI20315.1
No hits for CAC10451.1
No hits for CAH71079.1
No hits for CAH71080.1
No hits for CAI22315.1
No hits for CAI22316.1
No hits for CAI22317.1
No hits for CAI22318.1
No hits for CAH72871.2
No hits for CAH72887.1
No hits for CAI12781.1
No hits for NP_009161.1
No hits for NP_057303.1
No hits for CAI26211.1
No hits for CAI26213.1
No hits for CAI26214.1
No hits for CAM19090.1
No hits for CAM19091.1
No hits for CAM22047.1
No hits for XP_193956.3
No hits for EDK02328.1
No hits for XP_368250.1
No hits for NP_012665.1
No hits for NP_012074.1
No hits for CAB86344.1
No hits for CAC369

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH47....
No hits for A32261
Sequence has been collected for GH86....
Feature has been extracted for GH86....




Clustering has been done for GH86....
No hits for BAQ55620.2
No hits for NP_627684.1
Sequence has been collected for GH117....
Feature has been extracted for GH117....




Clustering has been done for GH117....
Sequence has been collected for GH148....
Feature has been extracted for GH148....
GH148 has 3 samples which is less than 5 min_samples 
No hits for AAC16482.1
No hits for NP_009296.1
No hits for NP_695016.1
No hits for NP_695018.1
No hits for NP_036401.1
No hits for NP_033267.1
No hits for CAP19319.1
No hits for CAP19320.1
No hits for CAP19321.1
No hits for CAP19322.1
No hits for NP_032343.2
No hits for AAH60047.1
No hits for NP_034619.1
No hits for NP_742037.1
Sequence has been collected for GH56....
Feature has been extracted for GH56....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH56....
No hits for NP_420976.1
Sequence has been collected for GH24....
Feature has been extracted for GH24....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH24....
No hits for NP_485561.1
No hits for NP_484862.1
Sequence has been collected for GH100....
Feature has been extracted for GH100....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH100....
Sequence has been collected for GH140....
Feature has been extracted for GH140....




GH140 has 1 samples which is less than 5 min_samples 
No hits for CAC24028.1
No hits for NP_344318.1
No hits for NP_812685.1
No hits for CAB07105.1
No hits for NP_269660.1
No hits for NP_625245.1
No hits for NP_229647.1
No hits for AAB52345.2
No hits for AAM81094.1
No hits for CCD62468.1
No hits for NP_491568.1
No hits for NP_740847.1
No hits for NP_740848.1
No hits for AAA81731.2
No hits for CAB00104.1
No hits for NP_505995.1
No hits for AAF54376.1
No hits for AAF52958.1
No hits for NP_002363.1
No hits for NP_000519.1
No hits for NP_006113.1
No hits for NP_006706.1
No hits for BAA76779.1
No hits for XP_052620.6
No hits for ABE88173.1
No hits for XP_003629280.1
No hits for NP_034894.1
No hits for NP_032575.1
No hits for NP_766491.1
No hits for NP_640349.1
Sequence has been collected for GH38....
Feature has been extracted for GH38....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH38....
No hits for NP_342799.1
No hits for NP_344351.1
No hits for CAI10981.1
No hits for CAI10982.1
No hits for CAI10983.1
No hits for CAM17042.1
No hits for CAM17043.1
No hits for NP_766280.1
Sequence has been collected for GH116....
Feature has been extracted for GH116....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH116....
Sequence has been collected for GH128....




Feature has been extracted for GH128....
Clustering has been done for GH128....
Sequence has been collected for GH151....
Feature has been extracted for GH151....
GH151 has 3 samples which is less than 5 min_samples 
No hits for NP_631319.1
No hits for CAB57191.1
No hits for NP_733504.1
No hits for AAG21643.1
No hits for NP_187856.1
No hits for NP_566426.1
No hits for AAB64320.1
No hits for AAB64318.1
No hits for AAB58239.1
No hits for CAD41539.1
Sequence has been collected for GH19....
Feature has been extracted for GH19....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH19....
Sequence has been collected for GH169....
Feature has been extracted for GH169....




GH169 has 1 samples which is less than 5 min_samples 
No hits for ZP_00503782.1
No hits for CAB14832.1
No hits for NP_390750.1
No hits for CAB14811.1
No hits for NP_390729.1
No hits for NP_809281.1
No hits for NP_809261.1
No hits for NP_695732.1
No hits for EDY06090.1
No hits for ZP_08160912.1
No hits for NP_228093.1
No hits for NP_641621.1
No hits for BAF22602.1
Sequence has been collected for GH51....
Feature has been extracted for GH51....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH51....
No hits for XP_324478.1
Sequence has been collected for GH45....
Feature has been extracted for GH45....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH45....
Sequence has been collected for GH121....
Feature has been extracted for GH121....




GH121 has 2 samples which is less than 5 min_samples 
No hits for CAI11729.1
No hits for NP_000144.1
No hits for NP_032105.1
Sequence has been collected for GH59....
Feature has been extracted for GH59....




Clustering has been done for GH59....
Sequence has been collected for GH142....
Feature has been extracted for GH142....




GH142 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH152....
Feature has been extracted for GH152....
GH152 has 2 samples which is less than 5 min_samples 




No hits for NP_248620.1
No hits for NP_342472.1
No hits for BAB65828.1
No hits for NP_376719.1
No hits for NP_393821.1
No hits for NP_111834.1
No hits for BAA90671.1
No hits for NP_623401.1
No hits for NP_421085.1
No hits for AAB20818.1
No hits for XP_327956.1
No hits for XP_368148.1
No hits for EAA49438.1
No hits for NP_012167.1
Sequence has been collected for GH15....
Feature has been extracted for GH15....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH15....
No hits for ACX91047.1
No hits for CAC24044.1
No hits for NP_344333.1
No hits for CAC24058.1
No hits for NP_344361.1
No hits for BAB67639.1
No hits for NP_378530.1
No hits for NP_393778.1
No hits for NP_809252.1
No hits for NP_812211.1
No hits for ZP_01966167.1
No hits for NP_621719.1
No hits for NP_642101.1
No hits for AAA81491.1
No hits for AAO91743.1
No hits for NP_501419.1
No hits for AAB71267.2
No hits for AAB71267.3
No hits for NP_508105.1
No hits for EAK97887.1
No hits for CAA38907.1
No hits for NP_000143.1
No hits for XP_043268.1
No hits for NP_004659.1
No hits for NP_004659.1
No hits for AAI16454.1
No hits for NP_001032.1
No hits for CAM15004.1
No hits for CAM15005.1
No hits for CAM15006.1
No hits for CAM15007.1
No hits for NP_032090.2
No hits for XP_215144.3
No hits for NP_009788.1
Sequence has been collected for GH31....
Feature has been extracted for GH31....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH31....
Sequence has been collected for GH135....
Feature has been extracted for GH135....




GH135 has 2 samples which is less than 5 min_samples 
No hits for NP_771254.1
No hits for NP_743683.1
No hits for CAA33275.1
No hits for EAZ34091.1
Sequence has been collected for GH17....
Feature has been extracted for GH17....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH17....
No hits for NP_561245.1
No hits for NP_346573.1
Sequence has been collected for GH98....
Feature has been extracted for GH98....
Clustering has been done for GH98....




No hits for NP_695661.1
No hits for NP_561609.1
No hits for WP_004611020.1
No hits for WP_004612822.1
Sequence has been collected for GH101....
Feature has been extracted for GH101....




Clustering has been done for GH101....
Sequence has been collected for GH119....
Feature has been extracted for GH119....




GH119 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH139....
Feature has been extracted for GH139....




GH139 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH134....
Feature has been extracted for GH134....




Clustering has been done for GH134....
No hits for NP_176916.1
No hits for CAA92954.1
No hits for NP_006293.1
No hits for BAF07139.1
No hits for NP_113937.1
Sequence has been collected for GH63....
Feature has been extracted for GH63....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH63....
Sequence has been collected for GH146....
Feature has been extracted for GH146....




GH146 has 1 samples which is less than 5 min_samples 
No hits for NP_577861.2
Sequence has been collected for GH122....
Feature has been extracted for GH122....




GH122 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH171....
Feature has been extracted for GH171....
GH171 has 5 samples which is less than 5 min_samples 
Sequence has been collected for GH160....




Feature has been extracted for GH160....
GH160 has 1 samples which is less than 5 min_samples 




No hits for CAB15830.1
No hits for NP_391683.1
No hits for NP_810672.1
No hits for NP_811994.1
No hits for NP_810673.1
No hits for NP_810678.1
No hits for NP_347065.1
No hits for NP_601843.1
No hits for NP_346228.1
No hits for NP_346161.1
No hits for NP_229215.1
No hits for NP_176453.1
No hits for NP_187995.1
No hits for AAY81958.1
No hits for XP_323605.1
No hits for CAD41518.1
No hits for CAE04784.2
No hits for CAE04785.1
No hits for CBM41476.1
Sequence has been collected for GH32....
Feature has been extracted for GH32....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH32....
No hits for NP_344344.1
No hits for ZP_03013483.1
No hits for NP_812479.1
No hits for NP_812226.1
No hits for ZP_02070067.1
No hits for YP_002352162.1
No hits for BAA35922.1
No hits for NP_471175.1
No hits for CAA17329.1
No hits for CAE55256.1
No hits for NP_214751.1
No hits for ZP_01543735.1
No hits for AAA86753.1
No hits for ZP_08159508.1
No hits for ZP_08160757.1
No hits for NP_626989.1
No hits for NP_629382.1
No hits for BAB61064.1
No hits for ACZ66247.2
No hits for ACZ66247.1
No hits for NP_227841.1
No hits for NP_227892.1
No hits for NP_228618.1
No hits for NP_230341.1
No hits for NP_638240.1
No hits for NP_641782.1
No hits for NP_642120.1
No hits for NP_643385.1
No hits for NP_644175.1
No hits for NP_644530.1
No hits for XP_330872.1
No hits for XP_324309.1
No hits for XP_329671.1
No hits for XP_360965.1
No hits for XP_364573.1
No hits for AAL69548.2
No hits for CAP58431.1
No hits for ABR57325.1
No hits for EAK85129.1
Sequence has been collec

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH3....
Sequence has been collected for GH106....
Feature has been extracted for GH106....
Clustering has been done for GH106....




No hits for NP_391292.1
No hits for NP_813579.1
No hits for ACO06241.1
No hits for NP_695463.1
No hits for AEH26456.1
No hits for NP_229006.1
Sequence has been collected for GH53....
Feature has been extracted for GH53....




Clustering has been done for GH53....
Sequence has been collected for GH120....
Feature has been extracted for GH120....
GH120 has 3 samples which is less than 5 min_samples 
Sequence has been collected for GH153....
Feature has been extracted for GH153....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH153....
No hits for ABK34500.1
No hits for XP_046047.1
Sequence has been collected for GH22....
Feature has been extracted for GH22....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH22....
No hits for BAA31551.1
No hits for NP_149279.1
No hits for NP_149217.1
No hits for AAC98123.1
No hits for ABI49937.1
No hits for AAC98140.1
No hits for CAA82319.1
No hits for NP_242986.1
No hits for ZP_08160171.1
No hits for ZP_08159615.1
No hits for ACR61562.1
No hits for BAA19777.1
No hits for ADQ57411.1
No hits for YP_003851606.1
No hits for AAZ56824.1
No hits for AAZ56956.1
No hits for CAD48748.1
No hits for NP_227886.1
No hits for NP_227877.1
No hits for AAD32593.1
No hits for NP_644548.1
No hits for NP_644553.1
No hits for AAP31839.1
No hits for EAA78230.1
No hits for XP_002470423.1
No hits for XP_365543.1
No hits for BAA89465.1
Sequence has been collected for GH10....
Feature has been extracted for GH10....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH10....
No hits for NP_279794.1
No hits for NP_578962.1
No hits for NP_578963.1
No hits for NP_578962.1
No hits for NP_833450.1
No hits for AAU21943.1
No hits for NP_809957.1
No hits for NP_812898.1
No hits for NP_902605.1
No hits for AGD94964.1
No hits for NP_813917.1
No hits for NP_814153.1
No hits for NP_816485.1
No hits for NP_241782.1
No hits for NP_268107.1
No hits for BAA02908.1
No hits for CBW16119.1
No hits for ABV39247.1
No hits for ABV40327.1
No hits for ABV41826.1
No hits for ABV42574.1
No hits for NP_269818.1
No hits for NP_630126.1
No hits for NP_629155.1
No hits for NP_629515.1
No hits for NP_625711.1
No hits for AAZ54618.1
No hits for NP_232428.1
No hits for AAA83586.1
No hits for EAL00460.1
No hits for ABC59330.1
No hits for AGX26690.1
No hits for ABI32402.1
No hits for CAH70206.1
No hits for CAH70207.1
No hits for AAH36339.1
No hits for AAH47336.1
No hits for CAH70802.1
No hits for CAH70803.1
No hits for CAH70804.1
No hits for CAI19263.1


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH18....
No hits for CAA52785.1
No hits for XP_330425.1
Sequence has been collected for GH54....
Feature has been extracted for GH54....




Clustering has been done for GH54....
No hits for NP_559144.1
No hits for NP_213497.1
No hits for ABR29563.1
No hits for NP_212300.1
No hits for NP_601497.1
No hits for NP_721905.1
No hits for NP_440120.1
Sequence has been collected for GH77....
Feature has been extracted for GH77....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH77....
No hits for BAP10900.1
Sequence has been collected for GH87....
Feature has been extracted for GH87....
Clustering has been done for GH87....




No hits for NP_213966.1
No hits for NP_832437.1
No hits for NP_242971.1
Sequence has been collected for GH8....
Feature has been extracted for GH8....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH8....
No hits for YP_594332.1
No hits for NP_229000.1
No hits for NP_228122.1
No hits for MBF8418755.1
Sequence has been collected for GH42....
Feature has been extracted for GH42....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH42....
Sequence has been collected for GH64....
Feature has been extracted for GH64....




Clustering has been done for GH64....
No hits for NP_578583.1
No hits for CAC24331.1
No hits for NP_343873.1
No hits for NP_342800.1
No hits for NP_229324.1
No hits for NP_229325.1
No hits for XP_386027.1
No hits for XP_391213.1
No hits for XP_388068.1
No hits for XP_383205.1
No hits for XP_368567.1
No hits for XP_361895.1
No hits for XP_362900.1
Sequence has been collected for GH12....
Feature has been extracted for GH12....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH12....
Sequence has been collected for GH49....
Feature has been extracted for GH49....




Clustering has been done for GH49....
No hits for NP_696499.1
No hits for NP_241651.1
No hits for NP_345016.1
No hits for AEE74996.1
No hits for NP_187715.1
No hits for NP_196165.2
No hits for AAC46644.3
No hits for AAN84828.1
No hits for AAU05578.1
No hits for NP_498267.2
No hits for NP_073596.1
Sequence has been collected for GH85....
Feature has been extracted for GH85....




Clustering has been done for GH85....
Sequence has been collected for GH150....
Feature has been extracted for GH150....




GH150 has 2 samples which is less than 5 min_samples 
No hits for AAU22854.1
No hits for CAB15969.1
No hits for NP_391812.1
No hits for CAB14841.1
No hits for NP_390759.1
No hits for CAB13642.1
No hits for NP_389640.1
No hits for ZP_03458529.1
No hits for ZP_03013482.1
No hits for ZP_03013484.1
No hits for ZP_03013476.1
No hits for NP_809273.1
No hits for NP_809280.1
No hits for NP_812586.1
No hits for NP_812566.1
No hits for NP_809178.1
No hits for NP_809934.1
No hits for NP_811764.1
No hits for NP_809282.1
No hits for NP_812573.1
No hits for NP_812006.1
No hits for WP_011917910.1
No hits for WP_011917910.1
No hits for AAT98625.1
No hits for NP_244550.1
No hits for NP_822218.1
No hits for NP_823285.1
No hits for AAZ55651.1
No hits for NP_644482.1
No hits for NP_644557.1
No hits for BAD89094.1
No hits for XP_370346.1
No hits for ACP50519.1
Sequence has been collected for GH43....
Feature has been extracted for GH43....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH43....
No hits for AAW44482.1
No hits for NP_009111.1
Sequence has been collected for GH37....
Feature has been extracted for GH37....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH37....
No hits for NP_248621.1
No hits for NP_578001.1
No hits for NP_578599.1
No hits for NP_578173.1
No hits for NP_143265.1
No hits for NP_342630.1
Sequence has been collected for GH57....
Feature has been extracted for GH57....




Clustering has been done for GH57....
No hits for NP_809596.1
No hits for NP_810784.1
No hits for NP_812206.1
No hits for NP_811533.1
No hits for NP_812575.1
No hits for NP_812572.1
No hits for NP_812614.1
Sequence has been collected for GH97....
Feature has been extracted for GH97....




Clustering has been done for GH97....
No hits for CAK05012.1
No hits for NP_006656.1
Sequence has been collected for GH79....
Feature has been extracted for GH79....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH79....
No hits for BAB72022.1
No hits for YP_003867730.1
No hits for CAO78323.1
No hits for NP_964768.1
No hits for NP_791279.1
No hits for NP_792124.1
No hits for NP_808690.1
No hits for AAA69504.1
No hits for AAV88999.1
Sequence has been collected for GH68....
Feature has been extracted for GH68....




Clustering has been done for GH68....
No hits for NP_347551.1
Sequence has been collected for GH44....
Feature has been extracted for GH44....




Clustering has been done for GH44....
Sequence has been collected for GH125....




Feature has been extracted for GH125....
Clustering has been done for GH125....
No hits for NP_813569.1
No hits for NP_736323.1
No hits for NP_357886.1
No hits for NP_268879.1
Sequence has been collected for GH88....
Feature has been extracted for GH88....




Clustering has been done for GH88....
Sequence has been collected for GH34....
Feature has been extracted for GH34....




Clustering has been done for GH34....
No hits for NP_562147.1
No hits for NP_814543.1
No hits for CAD64901.1
No hits for NP_786050.1
No hits for NP_266428.1
No hits for NP_268064.1
No hits for NP_267521.1
No hits for BAB45607.1
No hits for CAA09078.1
No hits for NP_358461.1
No hits for NP_345446.1
No hits for NP_228442.1
Sequence has been collected for GH73....
Feature has been extracted for GH73....




Clustering has been done for GH73....
Sequence has been collected for GH123....
Feature has been extracted for GH123....
GH123 has 3 samples which is less than 5 min_samples 
Sequence has been collected for GH163....
Feature has been extracted for GH163....




GH163 has 1 samples which is less than 5 min_samples 
No hits for NP_812224.1
No hits for NP_149282.1
No hits for WP_162463230.1
No hits for ZP_07060000.1
No hits for ZP_07060001.1
No hits for NP_826382.1
No hits for CAI95090.1
No hits for NP_000148.1
No hits for XP_323748.1
No hits for XP_330352.1
Sequence has been collected for GH30....
Feature has been extracted for GH30....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH30....
Sequence has been collected for GH162....
Feature has been extracted for GH162....




GH162 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH58....
Feature has been extracted for GH58....




Clustering has been done for GH58....
No hits for NP_823026.1
Sequence has been collected for GH75....
Feature has been extracted for GH75....




Clustering has been done for GH75....
No hits for AAC98128.2
No hits for ZP_08159559.1
No hits for NP_227871.1
Sequence has been collected for GH67....
Feature has been extracted for GH67....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH67....
Sequence has been collected for GH113....
Feature has been extracted for GH113....




Clustering has been done for GH113....
Sequence has been collected for GH129....
Feature has been extracted for GH129....
GH129 has 2 samples which is less than 5 min_samples 
Sequence has been collected for GH102....
Feature has been extracted for GH102....




Clustering has been done for GH102....
No hits for ZP_02070068.1
No hits for ZP_06243608.1
Sequence has been collected for GH158....
Feature has been extracted for GH158....




GH158 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH170....
Feature has been extracted for GH170....




GH170 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH147....
Feature has been extracted for GH147....
GH147 has 3 samples which is less than 5 min_samples 
No hits for AAC38571.2
No hits for AAZ55992.1
Sequence has been collected for GH48....
Feature has been extracted for GH48....




Clustering has been done for GH48....
No hits for CAC29083.1
No hits for AAC98122.1
No hits for NP_242980.1
Sequence has been collected for GH52....
Feature has been extracted for GH52....




Clustering has been done for GH52....
No hits for CAA16243.1
No hits for CAE55238.1
No hits for NP_214576.1
No hits for AAZ54658.1
No hits for AAZ55112.1
No hits for XP_360146.1
Sequence has been collected for GH6....
Feature has been extracted for GH6....




Clustering has been done for GH6....
Sequence has been collected for GH96....
Feature has been extracted for GH96....
GH96 has 3 samples which is less than 5 min_samples 
No hits for CAA56918.1
No hits for NP_347552.1
No hits for YP_003844202.1
No hits for NP_442377.1
No hits for AAZ55662.1
No hits for AAA27397.1
No hits for AAZ56209.1
No hits for NP_230264.1
No hits for NP_642837.1
No hits for BAA96209.1
No hits for BAC00553.1
No hits for EFA05721.1
No hits for XP_001810693.1
Sequence has been collected for GH9....
Feature has been extracted for GH9....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH9....
Sequence has been collected for GH114....
Feature has been extracted for GH114....




GH114 has 2 samples which is less than 5 min_samples 
No hits for NP_845781.1
No hits for NP_845154.1
No hits for NP_347193.1
No hits for NP_562230.1
No hits for NP_815667.1
No hits for NP_814147.1
Sequence has been collected for GH25....
Feature has been extracted for GH25....




Clustering has been done for GH25....
No hits for NP_627690.1
No hits for EFB02686.1
No hits for ZP_01923925.1
Sequence has been collected for GH50....
Feature has been extracted for GH50....




Clustering has been done for GH50....
Sequence has been collected for GH168....
Feature has been extracted for GH168....




GH168 has 2 samples which is less than 5 min_samples 
Sequence has been collected for GH143....
Feature has been extracted for GH143....




GH143 has 1 samples which is less than 5 min_samples 
Sequence has been collected for GH167....
Feature has been extracted for GH167....




GH167 has 2 samples which is less than 5 min_samples 
No hits for BAC56902.1
No hits for NP_809372.1
No hits for NP_809373.1
No hits for NP_809369.1
No hits for NP_809419.1
No hits for NP_813917.1
No hits for NP_267650.1
No hits for NP_904396.1
No hits for NP_357651.1
No hits for NP_357651.1
No hits for NP_344606.1
No hits for NP_344606.1
No hits for NP_627016.1
No hits for NP_638238.1
No hits for NP_191086.1
No hits for NP_567017.1
No hits for AAA96105.3
No hits for CAO72175.1
No hits for AAL35732.2
No hits for CAA22078.2
No hits for CAA19506.2
No hits for CAI06053.1
No hits for CAI06053.2
No hits for AAG22248.1
No hits for AAN11597.1
No hits for NP_728974.1
No hits for NP_728976.1
No hits for AAF58500.2
No hits for AAM68691.1
No hits for AAM68693.1
No hits for NP_610790.1
No hits for NP_725180.1
No hits for NP_000511.1
No hits for NP_000512.1
No hits for NP_034551.1
Sequence has been collected for GH20....
Feature has been extracted for GH20....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH20....
Sequence has been collected for GH103....
Feature has been extracted for GH103....




Clustering has been done for GH103....
No hits for NP_813068.1
No hits for NP_823362.1
No hits for CAB55878.1
No hits for CAB41237.1
No hits for CAX14869.1
No hits for CAX14870.1
No hits for NP_032695.2
No hits for CAM19136.1
No hits for CAM19137.1
No hits for NP_038491.1
No hits for BAF21463.1
No hits for BAC55816.1
No hits for BAF26853.1
No hits for NP_922125.1
Sequence has been collected for GH27....
Feature has been extracted for GH27....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH27....
No hits for NP_812693.1
No hits for NP_811544.1
No hits for NP_811536.1
No hits for NP_812703.1
No hits for XP_328833.1
No hits for XP_323071.1
Sequence has been collected for GH76....
Feature has been extracted for GH76....
Clustering has been done for GH76....




No hits for NP_241765.1
No hits for CAD65888.1
No hits for ZP_08160424.1
No hits for ZP_08158180.1
No hits for ZP_08157788.1
No hits for NP_624448.1
No hits for NP_626540.1
No hits for BAA19778.1
No hits for AAZ55251.1
No hits for ABA39289.1
No hits for ABM55503.1
No hits for EAA73188.1
No hits for XP_383800.1
No hits for XP_368051.1
No hits for AAD37441.1
No hits for A44594
No hits for 5VQJ_A
Sequence has been collected for GH11....
Feature has been extracted for GH11....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH11....
No hits for NP_809946.1
No hits for NP_470199.1
Sequence has been collected for GH130....
Feature has been extracted for GH130....




Clustering has been done for GH130....
No hits for BAC56895.1
No hits for NP_809926.1
No hits for NP_809368.1
No hits for NP_561641.1
No hits for NP_207375.1
No hits for NP_904664.1
No hits for NP_346126.1
No hits for NP_346126.1
No hits for NP_971085.1
No hits for Q0CMX0
No hits for NP_776547.1
No hits for NP_006647.2
No hits for XP_035091.1
No hits for NP_005374.1
No hits for CAI17742.1
No hits for CAI18221.1
No hits for CAI41850.1
No hits for CAM26147.1
No hits for CAQ08322.1
No hits for CAQ09153.1
No hits for CAQ09305.1
No hits for CAQ09512.1
No hits for AAH63465.1
No hits for NP_035023.2
No hits for NP_058826.1
Sequence has been collected for GH33....
Feature has been extracted for GH33....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for GH33....
No hits for NP_812773.1
No hits for CAI17346.1
No hits for CAI17347.1
No hits for NP_542963.1
No hits for XP_346734.1
Sequence has been collected for GH99....
Feature has been extracted for GH99....
Clustering has been done for GH99....




Sequence has been collected for GH80....
Feature has been extracted for GH80....
GH80 has 4 samples which is less than 5 min_samples 
Mission completed in 786.774 seconds


In [14]:
from tqdm import tqdm
titl=['EC_number','Method','label_type','lambda','FMI','ex_groups','pred_groups','distribution','total','CAZy_partial','Fasta_partial','X_aa']
outpt=open('ghf_number_cluster_ec.txt','w')
outpt.write('$'.join(titl)+'\n')
for line in tqdm(total_data):
    outpt.write(line+'\n')
outpt.close()   

100%|██████████████████████████████████████████████████████████████████████████| 1044/1044 [00:00<00:00, 511011.01it/s]
