### Clustering using PseAAC features

##### <u> Part 1: Extracting protein sequence data </u>

In [1]:
import re

class CAZy_data:
    """In-memory view of a CAZy annotation table and its companion FASTA file.

    ``filename1`` is a text table whose rows are split on ``$``.
    ``filename2`` is a FASTA-like file: every line starting with ``>`` is
    stored in ``self.acc`` and every other line in ``self.seq``.  Headers and
    sequences are paired by list position, so the file is assumed to carry
    exactly one sequence line per header (TODO confirm for new inputs).
    """

    def __init__(self,filename1,filename2):
        self.data,self.acc,self.seq=[],[],[]
        with open(filename1,'r',encoding='utf-8') as inpt:
            for each in inpt:
                self.data.append(each.rstrip().split('$'))
        with open(filename2,'r',encoding='utf-8') as inpt1:
            for each1 in inpt1:
                if each1.startswith('>'):
                    self.acc.append(each1.rstrip())
                else:
                    self.seq.append(each1.rstrip())

    def data_fetch(self,typ,position):
        """Return the table rows whose column ``position`` equals ``typ``.

        ``typ == 'all'`` returns ``self.data`` itself (not a copy).  Otherwise
        the column is split on single spaces and only rows holding exactly one
        value are compared, so rows with several values (e.g. multiple EC
        numbers) or with an empty column are never returned.
        """
        if typ=='all':
            return self.data
        typ_data=[]
        for each in self.data:
            mult=each[position].split(' ')
            if len(mult)==1 and mult[0]==typ:
                typ_data.append(each)
        return typ_data

    def EC_GH(self,ec_no,gh_fam):
        """Collect FASTA records for rows matching both ``ec_no`` and ``gh_fam``.

        ``ec_no`` is matched against column 1 and ``gh_fam`` against the last
        column (either may be ``'all'``).  Rows whose first column mentions
        "partial" or "fragment" are set aside.  For the remaining rows every
        space-separated accession in column 3 is looked up with
        :meth:`seq_fetch`.

        Returns:
            ``(fasta, rm_prt, rm_prt_fasta)`` where ``fasta`` alternates
            ``>accession$col0$col1$col2$col-2$col-1`` headers with the first
            matching sequence, ``rm_prt`` lists the rejected (reduced) rows
            and ``rm_prt_fasta`` lists accessions without a usable sequence.
            ``fasta`` is also stored as ``self.fasta`` and the reduced rows as
            ``self.common_data``.
        """
        self.fasta=[]
        cazy_ec=self.data_fetch(ec_no,1)
        cazy_gh=self.data_fetch(gh_fam,-1)
        # Hash the GH rows once instead of a linear list scan per EC row.
        gh_rows={tuple(row) for row in cazy_gh}
        self.common_data=[[i[0],i[1],i[2],i[3],i[-2],i[-1]] for i in cazy_ec if tuple(i) in gh_rows]
        rm_prt, rm_prt_fasta=[],[]
        for t in self.common_data:
            if not self.prtn_filter(t[0]):
                rm_prt.append(t)
                continue
            all_acc=t[3].split(' ')
            if all_acc[0]=='':
                continue
            for e_acc in all_acc:
                if not e_acc:
                    # A doubled space yields an empty token, which would
                    # otherwise match every header in the FASTA file.
                    continue
                e_seq=self.seq_fetch(e_acc)
                if e_seq:
                    self.fasta.append(f'>{e_acc}${t[0]}${t[1]}${t[2]}${t[-2]}${t[-1]}')
                    self.fasta.append(e_seq[0])
                else:
                    # No usable sequence for this accession.
                    rm_prt_fasta.append(e_acc)
        return self.fasta,rm_prt,rm_prt_fasta

    def prtn_filter(self,prt_name):
        """Return 0 if ``prt_name`` mentions "partial" or "fragment" (any case), else 1."""
        return 0 if re.search('partial|fragment',prt_name.lower()) else 1

    def seq_fetch(self,accession):
        """Return the sequences whose header contains ``accession``.

        The accession is matched literally and must not be followed by a
        digit, so ``ABC1`` does not match ``ABC12``.  Headers flagged by
        :meth:`prtn_filter` count as hits for the diagnostics but their
        sequences are not returned.  A message is printed when the accession
        matches no header or more than one header.
        """
        pattern=re.compile(rf'{re.escape(accession)}(?!\d)')
        hits=[]
        matched=0
        for idx,header in enumerate(self.acc):
            if not pattern.search(header):
                continue
            matched+=1
            if self.prtn_filter(header): # skip partial/fragment GenBank descriptions
                hits.append(self.seq[idx])
        if matched>1:
            print(f'Multiple hits for {accession}')
        elif matched==0:
            print(f'No hits for {accession}')
        return hits
     
# Load the CAZy annotation table (first path) and its sequence file (second path)
# once; the driver loop further down calls In_data.EC_GH(...) for every EC number.
In_data=CAZy_data('D:/before_4_4_22/cazy_data/char_gh/char_cazy_12_8_21.txt','D:/before_4_4_22/cazy_data/char_gh/char_gh_seq_nr.txt')

##### <u> Part 2: Extracting features from protein sequences </u>

In [2]:
import numpy as np
from scipy import stats
import pandas as pd

# 1: Hydrophobicity, 2: Hydrophilicity, 3: mass, 4: pk1, 5:pk2, 6:pi, 20: 14 scale, 60: Tanford
class Standard_values:
    """Table of per-residue physicochemical values read from a CSV file.

    The first line of the file is treated as a header and dropped.  In every
    remaining row column 0 is the residue symbol and the other columns are
    numeric property values addressed by column index.
    """

    def __init__(self,filename):
        self.data=[]
        with open(filename,'r') as inpt:
            for each in inpt:
                self.data.append(each.rstrip().split(','))
        del self.data[0]

    def get_prop(self,prop):
        """Return the residue numbering and the normalised property tables.

        Args:
            prop: iterable of column indices to extract.

        Returns:
            ``(amino_acid, got_prop)`` where ``amino_acid`` maps 1-based
            positions to residue symbols in file order and ``got_prop`` maps
            each requested column index to the dict built by
            :meth:`properties`.
        """
        got_prop={each:self.properties(each) for each in prop}
        # Number the residues straight from the table instead of from
        # got_prop[1], so the call also works when column 1 is not requested.
        residues=dict.fromkeys(row[0] for row in self.data)
        amino_acid={position:residue for position,residue in enumerate(residues,start=1)}
        return amino_acid,got_prop

    def properties(self, val):
        """Return ``{residue: z-score}`` for the property in column ``val``."""
        raw={each[0]:float(each[val]) for each in self.data}
        relative=stats.zscore(np.array(list(raw.values())))
        return dict(zip(raw.keys(),relative))

class Sequence:
    """FASTA-style records given either as a file path or as a list of lines."""

    def __init__(self,filename):
        """Read ``filename`` line by line if it is a string, else use it as the line list."""
        self.data=[]
        if isinstance(filename,str):
            with open(filename,'r') as inpt:
                for each in inpt:
                    self.data.append(each.rstrip())
        else:
            self.data=filename

    def output(self):
        """Split the records into headers, upper-cased sequences and lengths.

        Records whose sequence contains any of ``U Z O B J X`` (after
        upper-casing) are dropped together with their header.

        Returns:
            ``(headers, sequences, lengths, unusual)`` where ``unusual`` is
            the number of dropped records.
        """
        a,s,l=[],[],[]
        unusual=0  # was a bare name in the original, which raised NameError
        for ele in self.ml_sl():
            if ele.startswith('>'):
                a.append(ele)
                continue
            residues=ele.upper()
            if re.search('[UZOBJX]',residues):
                # Discard the header that was appended for this record.
                del a[-1]
                unusual+=1
                continue
            l.append(len(ele))
            s.append(residues)
        return a,s,l,unusual

    def ml_sl(self):
        """Join multi-line sequences so every header is followed by one string.

        Lines that appear before the first header are ignored.  A header with
        no sequence lines is followed by an empty string.
        """
        acc_seq=[]
        parts=None  # sequence lines of the record being read; None before the first header
        for line in self.data:
            if line.startswith('>'):
                if parts is not None:
                    acc_seq.append(''.join(parts))
                acc_seq.append(line)
                parts=[]
            elif parts is not None:
                parts.append(line)
        if parts is not None:
            acc_seq.append(''.join(parts))
        return acc_seq

class Pseaac:
    """Pseudo amino acid composition (PseAAC) feature builder.

    NOTE: the methods read four module-level globals rather than instance
    state: ``keys`` (1-based position -> residue symbol), ``values``
    (property id -> {residue: normalised value}), and the parallel lists
    ``acc`` / ``seq`` holding the headers and sequences to process.
    """

    def __init__(self,filename):
        # Stored for the caller's reference only; no method reads it.
        self.filename=filename

    def collect(self,lamb,w,pro,nf):
        """Build the feature table for every sequence in the global ``seq``.

        Args:
            lamb: number of sequence-order (lambda) terms appended per sequence.
            w: weight applied to the sequence-order terms.
            pro: property ids (keys of the global ``values``) to average over.
            nf: ``1`` to divide residue counts by the sequence length; any
                other value keeps raw counts.

        Returns:
            A DataFrame whose first column ``#`` holds the header from ``acc``,
            followed by one column per residue in ``keys`` and ``lamb``
            lambda columns.
        """
        header=['#']+list(keys.values())+['\u03BB'+str(order+1) for order in range(lamb)]
        rows=[[acc[idx]]+self.pse(sequence,lamb,w,pro,nf) for idx,sequence in enumerate(seq)]
        return pd.DataFrame(rows,columns=header)

    def pse(self,data,lamb,w,pro,nf):
        """Return the 20 composition terms followed by ``lamb`` weighted theta terms.

        Every term is divided by ``1 + w * sum(theta)``.
        """
        thet=self.theta(data,lamb,pro)
        deno=1+(w*sum(thet.values()))
        norm=len(data) if nf==1 else 1
        # Positions 1..20: residue composition (frequency when nf == 1).
        p=[(data.count(keys[u])/norm)/deno for u in range(1,21)]
        # Positions 21..20+lamb: weighted sequence-order terms.
        p.extend((w*thet[u])/deno for u in range(1,lamb+1))
        return p

    def theta(self,data,lamb,pro):
        """Return ``{u: rel_cal(data, u, pro) / (len(data) - u)}`` for ``u`` in 1..lamb.

        Raises ZeroDivisionError when ``lamb`` reaches ``len(data)``, so
        callers must keep ``lamb`` below the shortest sequence length.
        """
        return {u:(1/(len(data)-u))*self.rel_cal(data,u,pro) for u in range(1,lamb+1)}

    def rel_cal(self,data,v,pro):
        """Sum, over residue pairs ``v`` positions apart, the mean squared property difference."""
        tables=[values[prop] for prop in pro]  # hoisted: one lookup per property, not per pair
        return sum(
            sum((table[first]-table[second])**2 for table in tables)/len(pro)
            for first,second in zip(data,data[v:])
        )

# Module-level lookup tables read by Pseaac: `keys` maps 1-based positions to the
# residue symbols of the CSV, `values` maps each requested property column index
# to a {residue: z-scored value} dict.
keys,values=Standard_values('7_98_hydrophobicity.csv').get_prop([1,2,3,4,5,6,20,60])
# pseaac_data['Length']=stats.zscore(min_len)

In [3]:
'''Clustering Algorithm'''
import os
import time
import matplotlib.pyplot as plt
from sklearn import cluster, mixture, manifold, decomposition, preprocessing,metrics
import random
from collections import Counter,defaultdict
import copy

class clustering:
    """Run several scikit-learn clusterers on one feature table and save the results.

    The first column of ``data`` holds ``$``-separated annotation strings
    (fields named in ``anno_label``); the remaining columns are the features.
    Every public clustering method fits one algorithm, writes the label file
    and the per-annotation tables below ``folder``, and returns the summary
    row built by :meth:`analysis`.
    """

    rs=77  # random_state shared by the estimators that accept one

    def __init__(self,folder,data,n,cat):
        """
        Args:
            folder: output directory (created on first save).
            data: DataFrame, column 0 = annotation string, columns 1.. = features.
            n: number of clusters; any string (e.g. ``'auto'``) means "use the
                number of distinct reference labels".
            cat: annotation field indices to tabulate; ``cat[0]`` also selects
                the reference labelling used for scoring.
        """
        self.x=data.iloc[:,1:].values
        self.y=data.iloc[:,0]
        self.folder=folder
        self.cat=cat
        self.anno_label={0:'acc',1:'prtn',2:'ec',3:'org',4:'species',5:'ghf'}
        temp=[i.split('$')[cat[0]] for i in self.y]
        # Sorted so the integer ids written to the 'expected' column are the
        # same on every run (plain set order varies with string hashing).
        lab_index={name:pos for pos,name in enumerate(sorted(set(temp)))}
        self.true_lab=[lab_index[j] for j in temp]
        self.n=len(lab_index) if isinstance(n,str) else n
        self.temp1=','.join(f'{k}:{v}' for k,v in Counter(temp).items())

    def _fit_and_save(self,name,make_estimator,transform=None):
        """Time one fit_predict run, record its labels and persist the results.

        ``make_estimator`` is called inside the timed region, as is the
        optional ``transform`` applied to the feature matrix beforehand.
        """
        start = time.time()
        self.names = name
        features = self.x if transform is None else transform(self.x)
        self.labels = make_estimator().fit_predict(features)
        self.t = round((time.time()-start),3)
        return self.label_save()

    def kmeans(self):
        """K-means with ``self.n`` clusters."""
        return self._fit_and_save('km',lambda: cluster.KMeans(n_clusters=self.n,random_state=clustering.rs))

    def affinity(self):
        """Affinity propagation (number of clusters chosen by the algorithm)."""
        return self._fit_and_save('apc',lambda: cluster.AffinityPropagation(random_state=clustering.rs))

    def meanshift(self):
        """Mean shift with default settings."""
        return self._fit_and_save('ms',cluster.MeanShift)

    def spectral(self):
        """Spectral clustering with ``self.n`` clusters and discretized label assignment."""
        return self._fit_and_save('spec',lambda: cluster.SpectralClustering(n_clusters=self.n,assign_labels="discretize",random_state=clustering.rs))

    def agglomerative(self):
        """Agglomerative clustering with ``self.n`` clusters."""
        return self._fit_and_save('agglo',lambda: cluster.AgglomerativeClustering(n_clusters=self.n))

    def dbscan(self):
        """DBSCAN with default settings on standard-scaled features."""
        return self._fit_and_save('dbs',cluster.DBSCAN,preprocessing.StandardScaler().fit_transform)

    def optics(self):
        """OPTICS with default settings."""
        return self._fit_and_save('opt',cluster.OPTICS)

    def gaussian(self):
        """Gaussian mixture with ``self.n`` components."""
        return self._fit_and_save('gm',lambda: mixture.GaussianMixture(n_components=self.n,random_state=clustering.rs))

    def birch(self):
        """Birch with default settings.

        NOTE(review): unlike the other fixed-size methods this does not pass
        ``n_clusters=self.n``; kept as is to avoid changing results - confirm
        whether that is intended.
        """
        return self._fit_and_save('bir',cluster.Birch)

    def label_save(self):
        """Write the accession/predicted/expected table and return :meth:`analysis`."""
        dfout = pd.DataFrame({'Accession': self.y,  'predicted': self.labels, 'expected':self.true_lab})
        os.makedirs(self.folder,exist_ok=True)
        # os.path.join replaces the hard-coded backslash, which was an invalid
        # escape sequence and only acted as a separator on Windows.
        out_path=os.path.join(self.folder,f'ML_{self.names}_{len(set(self.labels))}.txt')
        dfout.to_csv(out_path,sep='\t', index=False)
        self.file()
        return self.analysis()

    def analysis(self):
        """Return the summary row for the last run as a list of strings.

        Fields: method name, reference annotation name, lambda, Fowlkes-Mallows
        score (3 decimals), requested cluster count, obtained cluster count,
        reference label counts, sample count.  Reads the module-level global
        ``lambda_value`` set by the driver loop.
        """
        value=metrics.fowlkes_mallows_score(self.true_lab,self.labels)
        tot_val=[self.names,self.anno_label[self.cat[0]],str(lambda_value),str(round(value,3)),str(self.n),str(len(set(self.labels))),self.temp1,str(len(self.true_lab))]
        return tot_val

    def file(self):
        """Group the annotation strings by predicted cluster and tabulate them."""
        os.makedirs(os.path.join(self.folder,'table'),exist_ok=True)
        dd=defaultdict(list)
        for label,annotation in zip(self.labels,self.y):
            dd[label].append(annotation)
        self.excel(dict(sorted(dd.items())))

    def excel(self,anno):
        """Write one cluster-by-annotation count table (with totals) per field in ``self.cat``."""
        for i in self.cat:
            temp={}
            for j,k in anno.items():
                temp[j]=dict(Counter(l.split('$')[i] for l in k))
            df=pd.DataFrame(temp).fillna(0).astype(int)
            df.loc['Total']=df.sum(axis=0)
            df.loc[:,'Total']=df.sum(axis=1)
            df.to_excel(os.path.join(self.folder,'table',f'{self.names}_{len(set(self.labels))}_{self.anno_label[i]}.xlsx'))

In [4]:
class ec_data:
    """EC-number column of a ``$``-separated CAZy table."""

    def __init__(self,file):
        """Read ``file`` into ``self.data``, one right-stripped line per entry.

        The file is decoded as UTF-8, matching how ``CAZy_data`` opens the
        same table, so the result does not depend on the platform default.
        """
        self.data=[]
        with open(file,'r',encoding='utf-8') as inpt:
            for i in inpt:
                self.data.append(i.rstrip())

    def cazy(self,dom,typ):
        """Split column 1 of every row on spaces and pass the lists to :meth:`domain`."""
        ec_lists=[i.split('$')[1].split(' ') for i in self.data]
        return self.domain(ec_lists,dom,typ)

    def domain(self,ec_n,n,include):
        """Flatten the EC lists, optionally leaving out entries with more than ``n`` values.

        Args:
            ec_n: list of EC-number lists, one per table row.
            n: largest list length still counted as "single domain".
            include: ``'m'`` appends the values of the longer lists after the
                single-domain ones; any other value drops them.

        Returns:
            Flat list of EC numbers.  Counts are printed as a side effect.
        """
        single,multi=[],[]
        for i in ec_n:
            if len(i)<=n:
                single.extend(i)
            else:
                multi.append(i)
        print('Number of single domain:',len(single))
        print(f'Number of multi domain (>{n}):',len(multi))
        if include=='m':
            for j in multi:
                single.extend(j)
            print('After including multi domains:',len(single))
        return single
    
class analysis:
    """Pick the ``3.2.1.x`` EC numbers out of a list and order them by serial number."""

    def __init__(self,data):
        self.data=data

    def non_kegg_count(self):
        """Count each entry of ``self.data`` once per occurrence, then filter and sort."""
        occurrences=dict(Counter(self.data))
        return self.specific_ec(occurrences)

    def kegg_count(self,label):
        """Sum the integer ``label`` values per entry of ``self.data``, then filter and sort."""
        totals=defaultdict(int)
        for ec,amount in zip(self.data,label):
            totals[ec]+=int(amount)
        return self.specific_ec(dict(totals))

    def specific_ec(self,dat):
        """Return the keys of ``dat`` containing ``3.2.1.`` sorted by their last numeric field.

        Keys whose last dot-separated field is not an integer are printed and
        left out.
        """
        ranked=[]
        for ec in dat:
            if '3.2.1.' not in ec:
                continue
            try:
                serial=int(ec.split('.')[-1])
            except ValueError:
                print(ec)
            else:
                ranked.append((serial,ec))
        # Stable sort on the serial number only, so ties keep insertion order.
        ranked.sort(key=lambda pair:pair[0])
        return [ec for _,ec in ranked]

In [13]:
# cazy(1,'s'): the 1 is the largest number of EC values an entry may carry to count as
# single domain; 's' keeps only those entries, 'm' also appends the EC values of the
# multi-domain entries.
a_cazy=ec_data('D:/before_4_4_22/cazy_data/char_gh/char_cazy_12_8_21.txt').cazy(1,'s')
c_cazy=analysis(a_cazy).non_kegg_count()

t1=time.perf_counter()
total_data=[]
total_rm_cazy,total_rm_genbank={},{}
value_error,no_entry=[],[]
min_lamb=30  # upper bound for the number of lambda (sequence-order) terms
for each in c_cazy:
    ec_number=each
    gh_family='all'
    cazy_acc_seq,rm_cazy,rm_genbank=In_data.EC_GH(ec_number,gh_family) # write all to fetch all the EC number or all the GH family
    total_rm_cazy[each],total_rm_genbank[each]=rm_cazy,rm_genbank
    # acc and seq are module-level names read by Pseaac.collect().
    acc,seq,min_len,x_aa=Sequence(cazy_acc_seq).output()
    print(f'Sequence has been collected for {each}....')
    if not min_len:
        print(each,': doesnt have sequences')
        no_entry.append(each)
        continue
    # lambda must stay strictly below the shortest sequence length: Pseaac.theta divides
    # by (len(sequence) - u) for every u up to lambda, so lambda == length divides by zero.
    # lambda_value is also read as a global by clustering.analysis().
    lambda_value=max(0,min(min_lamb,min(min_len)-1))
    pseaac_data=Pseaac(f'PAAC_{ec_number}_{gh_family}_L{lambda_value}.txt').collect(lambda_value,0.05,[60,2,3],1)
    print(f'Feature has been extracted for {each}....')
    ec_=ec_number.replace('.','_')
    # 'auto' sets the number of clusters to the number of distinct reference labels;
    # pass an integer instead to fix it.
    clust=clustering(f'{gh_family}_{ec_}',pseaac_data,'auto',[4]) # 0:'acc',1:'prtn',2:'ec',3:'org',4:'species',5:'ghf'
    try:
        km=clust.kmeans()
        apc=clust.affinity()
        ms=clust.meanshift()
        spec=clust.spectral()
        agglo=clust.agglomerative()
        dbs=clust.dbscan()
        opt=clust.optics()
        gm=clust.gaussian()
        bir=clust.birch()
    except ValueError as err:
        # Report the real cause: the failure is not always "fewer than 5 samples".
        print(f'{each} has {len(acc)} samples; clustering failed: {err}')
        value_error.append([each,len(acc)])
        continue
    print(f'Clustering has been done for {each}....')
    all_clust={'km':km,'apc':apc,'ms':ms,'spec':spec,'agglo':agglo,'dbs':dbs,'opt':opt,'gm':gm,'bir':bir}
    # One '$'-joined summary line per algorithm for this EC number.
    removed_counts=[str(len(rm_cazy)),str(len(rm_genbank)),str(x_aa)]
    for aa in all_clust.values():
        total_data.append('$'.join([each]+aa+removed_counts))
t2=time.perf_counter()
print('Mission completed in',round(t2-t1,3),'seconds')

Number of single domain: 7063
Number of multi domain (>1): 411
3.2.1.-
3.2.1.17_or_4.2.2.n1
Sequence has been collected for 3.2.1.1....
Feature has been extracted for 3.2.1.1....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.1....
Sequence has been collected for 3.2.1.2....
Feature has been extracted for 3.2.1.2....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.2....
Sequence has been collected for 3.2.1.3....
Feature has been extracted for 3.2.1.3....




Clustering has been done for 3.2.1.3....
No hits for JX0131
Sequence has been collected for 3.2.1.4....
Feature has been extracted for 3.2.1.4....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.4....
Sequence has been collected for 3.2.1.6....
Feature has been extracted for 3.2.1.6....




Clustering has been done for 3.2.1.6....
Sequence has been collected for 3.2.1.7....
Feature has been extracted for 3.2.1.7....
Clustering has been done for 3.2.1.7....




Sequence has been collected for 3.2.1.8....
Feature has been extracted for 3.2.1.8....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.8....
Sequence has been collected for 3.2.1.10....
Feature has been extracted for 3.2.1.10....
Clustering has been done for 3.2.1.10....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Sequence has been collected for 3.2.1.11....
Feature has been extracted for 3.2.1.11....
Clustering has been done for 3.2.1.11....




Sequence has been collected for 3.2.1.14....
Feature has been extracted for 3.2.1.14....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.14....
Sequence has been collected for 3.2.1.15....
Feature has been extracted for 3.2.1.15....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.15....
Sequence has been collected for 3.2.1.17....
Feature has been extracted for 3.2.1.17....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.17....
Sequence has been collected for 3.2.1.18....
Feature has been extracted for 3.2.1.18....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.18....
Sequence has been collected for 3.2.1.20....
Feature has been extracted for 3.2.1.20....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.20....
Sequence has been collected for 3.2.1.21....
Feature has been extracted for 3.2.1.21....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.21....
Sequence has been collected for 3.2.1.22....
Feature has been extracted for 3.2.1.22....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.22....
Sequence has been collected for 3.2.1.23....
Feature has been extracted for 3.2.1.23....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.23....
Sequence has been collected for 3.2.1.24....
Feature has been extracted for 3.2.1.24....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.24....
Sequence has been collected for 3.2.1.25....
Feature has been extracted for 3.2.1.25....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.25....
Sequence has been collected for 3.2.1.26....
Feature has been extracted for 3.2.1.26....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.26....
Sequence has been collected for 3.2.1.28....
Feature has been extracted for 3.2.1.28....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.28....
Sequence has been collected for 3.2.1.31....
Feature has been extracted for 3.2.1.31....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.31....
Sequence has been collected for 3.2.1.32....
Feature has been extracted for 3.2.1.32....




Clustering has been done for 3.2.1.32....
Sequence has been collected for 3.2.1.35....
Feature has been extracted for 3.2.1.35....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.35....
Sequence has been collected for 3.2.1.36....
Feature has been extracted for 3.2.1.36....
3.2.1.36 has 3 samples which is less than 5 min_samples 
No hits for 5VQJ_A
Sequence has been collected for 3.2.1.37....
Feature has been extracted for 3.2.1.37....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.37....
Sequence has been collected for 3.2.1.38....
Feature has been extracted for 3.2.1.38....
3.2.1.38 has 4 samples which is less than 5 min_samples 
No hits for EU589324
Sequence has been collected for 3.2.1.39....
Feature has been extracted for 3.2.1.39....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.39....
Sequence has been collected for 3.2.1.40....
Feature has been extracted for 3.2.1.40....




Clustering has been done for 3.2.1.40....
Sequence has been collected for 3.2.1.41....
Feature has been extracted for 3.2.1.41....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.41....
Sequence has been collected for 3.2.1.45....
Feature has been extracted for 3.2.1.45....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.45....
Sequence has been collected for 3.2.1.46....
Feature has been extracted for 3.2.1.46....




Clustering has been done for 3.2.1.46....
Sequence has been collected for 3.2.1.48....
Feature has been extracted for 3.2.1.48....
3.2.1.48 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.49....
Feature has been extracted for 3.2.1.49....




Clustering has been done for 3.2.1.49....
Sequence has been collected for 3.2.1.50....
Feature has been extracted for 3.2.1.50....
Clustering has been done for 3.2.1.50....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Sequence has been collected for 3.2.1.51....
Feature has been extracted for 3.2.1.51....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.51....
Sequence has been collected for 3.2.1.52....
Feature has been extracted for 3.2.1.52....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.52....
Sequence has been collected for 3.2.1.53....
Feature has been extracted for 3.2.1.53....




3.2.1.53 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.54....
Feature has been extracted for 3.2.1.54....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.54....
Sequence has been collected for 3.2.1.55....
Feature has been extracted for 3.2.1.55....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.55....
Sequence has been collected for 3.2.1.57....
Feature has been extracted for 3.2.1.57....




3.2.1.57 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.58....
Feature has been extracted for 3.2.1.58....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.58....
Sequence has been collected for 3.2.1.59....
Feature has been extracted for 3.2.1.59....
Clustering has been done for 3.2.1.59....




Sequence has been collected for 3.2.1.60....
Feature has been extracted for 3.2.1.60....




Clustering has been done for 3.2.1.60....
Sequence has been collected for 3.2.1.61....
Feature has been extracted for 3.2.1.61....




3.2.1.61 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.63....
Feature has been extracted for 3.2.1.63....




Clustering has been done for 3.2.1.63....
Sequence has been collected for 3.2.1.64....
Feature has been extracted for 3.2.1.64....




3.2.1.64 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.65....
Feature has been extracted for 3.2.1.65....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.65....
Sequence has been collected for 3.2.1.67....
Feature has been extracted for 3.2.1.67....




Clustering has been done for 3.2.1.67....
Sequence has been collected for 3.2.1.68....
Feature has been extracted for 3.2.1.68....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.68....
Sequence has been collected for 3.2.1.70....
Feature has been extracted for 3.2.1.70....
Clustering has been done for 3.2.1.70....
Sequence has been collected for 3.2.1.71....




Feature has been extracted for 3.2.1.71....
3.2.1.71 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.73....
Feature has been extracted for 3.2.1.73....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.73....
Sequence has been collected for 3.2.1.74....
Feature has been extracted for 3.2.1.74....




Clustering has been done for 3.2.1.74....
Sequence has been collected for 3.2.1.75....
Feature has been extracted for 3.2.1.75....




Clustering has been done for 3.2.1.75....
Sequence has been collected for 3.2.1.76....
Feature has been extracted for 3.2.1.76....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.76....
Multiple hits for A37219
Sequence has been collected for 3.2.1.78....
Feature has been extracted for 3.2.1.78....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.78....
Sequence has been collected for 3.2.1.80....
Feature has been extracted for 3.2.1.80....




Clustering has been done for 3.2.1.80....
Sequence has been collected for 3.2.1.81....
Feature has been extracted for 3.2.1.81....




Clustering has been done for 3.2.1.81....
Sequence has been collected for 3.2.1.82....
Feature has been extracted for 3.2.1.82....




Clustering has been done for 3.2.1.82....
Sequence has been collected for 3.2.1.83....
Feature has been extracted for 3.2.1.83....




Clustering has been done for 3.2.1.83....
Sequence has been collected for 3.2.1.84....
Feature has been extracted for 3.2.1.84....




Clustering has been done for 3.2.1.84....
Sequence has been collected for 3.2.1.85....
Feature has been extracted for 3.2.1.85....




Clustering has been done for 3.2.1.85....
Sequence has been collected for 3.2.1.86....
Feature has been extracted for 3.2.1.86....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.86....
Sequence has been collected for 3.2.1.88....
Feature has been extracted for 3.2.1.88....




Clustering has been done for 3.2.1.88....
Sequence has been collected for 3.2.1.89....
Feature has been extracted for 3.2.1.89....




Clustering has been done for 3.2.1.89....
Sequence has been collected for 3.2.1.91....
Feature has been extracted for 3.2.1.91....




Clustering has been done for 3.2.1.91....
Sequence has been collected for 3.2.1.92....
Feature has been extracted for 3.2.1.92....




3.2.1.92 has 6 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.93....
Feature has been extracted for 3.2.1.93....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.93....
Sequence has been collected for 3.2.1.94....
Feature has been extracted for 3.2.1.94....
3.2.1.94 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.95....
Feature has been extracted for 3.2.1.95....




3.2.1.95 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.96....
Feature has been extracted for 3.2.1.96....




Clustering has been done for 3.2.1.96....
Sequence has been collected for 3.2.1.97....
Feature has been extracted for 3.2.1.97....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.97....
Sequence has been collected for 3.2.1.98....
Feature has been extracted for 3.2.1.98....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.98....
Sequence has been collected for 3.2.1.99....
Feature has been extracted for 3.2.1.99....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.99....
Sequence has been collected for 3.2.1.100....
Feature has been extracted for 3.2.1.100....




Clustering has been done for 3.2.1.100....
Sequence has been collected for 3.2.1.101....
Feature has been extracted for 3.2.1.101....
Clustering has been done for 3.2.1.101....




Sequence has been collected for 3.2.1.102....
Feature has been extracted for 3.2.1.102....
3.2.1.102 has 4 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.103....
Feature has been extracted for 3.2.1.103....




3.2.1.103 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.104....
Feature has been extracted for 3.2.1.104....




3.2.1.104 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.105....
Feature has been extracted for 3.2.1.105....




Clustering has been done for 3.2.1.105....
Sequence has been collected for 3.2.1.106....
Feature has been extracted for 3.2.1.106....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.106....
Sequence has been collected for 3.2.1.107....
Feature has been extracted for 3.2.1.107....




Clustering has been done for 3.2.1.107....
Sequence has been collected for 3.2.1.108....
Feature has been extracted for 3.2.1.108....
Clustering has been done for 3.2.1.108....
Sequence has been collected for 3.2.1.109....




Feature has been extracted for 3.2.1.109....
3.2.1.109 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.111....
Feature has been extracted for 3.2.1.111....




Clustering has been done for 3.2.1.111....
Sequence has been collected for 3.2.1.113....
Feature has been extracted for 3.2.1.113....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.113....
Sequence has been collected for 3.2.1.114....
Feature has been extracted for 3.2.1.114....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.114....
Sequence has been collected for 3.2.1.116....
Feature has been extracted for 3.2.1.116....




3.2.1.116 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.117....
Feature has been extracted for 3.2.1.117....




Clustering has been done for 3.2.1.117....
Sequence has been collected for 3.2.1.118....
Feature has been extracted for 3.2.1.118....




Clustering has been done for 3.2.1.118....
Sequence has been collected for 3.2.1.119....
3.2.1.119 : doesnt have sequences
Sequence has been collected for 3.2.1.120....
Feature has been extracted for 3.2.1.120....




3.2.1.120 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.122....
Feature has been extracted for 3.2.1.122....




Clustering has been done for 3.2.1.122....
Sequence has been collected for 3.2.1.123....
Feature has been extracted for 3.2.1.123....




Clustering has been done for 3.2.1.123....
Sequence has been collected for 3.2.1.125....
Feature has been extracted for 3.2.1.125....




3.2.1.125 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.129....
Feature has been extracted for 3.2.1.129....




Clustering has been done for 3.2.1.129....
Sequence has been collected for 3.2.1.130....
Feature has been extracted for 3.2.1.130....




Clustering has been done for 3.2.1.130....
Sequence has been collected for 3.2.1.131....
Feature has been extracted for 3.2.1.131....




Clustering has been done for 3.2.1.131....
Sequence has been collected for 3.2.1.132....
Feature has been extracted for 3.2.1.132....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.132....
Sequence has been collected for 3.2.1.133....
Feature has been extracted for 3.2.1.133....




Clustering has been done for 3.2.1.133....
Sequence has been collected for 3.2.1.135....
Feature has been extracted for 3.2.1.135....




Clustering has been done for 3.2.1.135....
Sequence has been collected for 3.2.1.136....
Feature has been extracted for 3.2.1.136....




3.2.1.136 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.139....
Feature has been extracted for 3.2.1.139....
Clustering has been done for 3.2.1.139....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Sequence has been collected for 3.2.1.140....
Feature has been extracted for 3.2.1.140....




Clustering has been done for 3.2.1.140....
Sequence has been collected for 3.2.1.141....
Feature has been extracted for 3.2.1.141....
Clustering has been done for 3.2.1.141....




Sequence has been collected for 3.2.1.145....
Feature has been extracted for 3.2.1.145....
Clustering has been done for 3.2.1.145....




Sequence has been collected for 3.2.1.146....
Feature has been extracted for 3.2.1.146....




Clustering has been done for 3.2.1.146....
Sequence has been collected for 3.2.1.147....
Feature has been extracted for 3.2.1.147....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.147....
Sequence has been collected for 3.2.1.149....
Feature has been extracted for 3.2.1.149....




Clustering has been done for 3.2.1.149....
Sequence has been collected for 3.2.1.150....
Feature has been extracted for 3.2.1.150....
3.2.1.150 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.151....
Feature has been extracted for 3.2.1.151....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.151....
Sequence has been collected for 3.2.1.152....
Feature has been extracted for 3.2.1.152....




Clustering has been done for 3.2.1.152....
Sequence has been collected for 3.2.1.153....
Feature has been extracted for 3.2.1.153....




Clustering has been done for 3.2.1.153....
Sequence has been collected for 3.2.1.154....
Feature has been extracted for 3.2.1.154....
3.2.1.154 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.155....
Feature has been extracted for 3.2.1.155....




3.2.1.155 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.156....
Feature has been extracted for 3.2.1.156....




Clustering has been done for 3.2.1.156....
Sequence has been collected for 3.2.1.157....
Feature has been extracted for 3.2.1.157....




Clustering has been done for 3.2.1.157....
Sequence has been collected for 3.2.1.158....
Feature has been extracted for 3.2.1.158....
3.2.1.158 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.159....
Feature has been extracted for 3.2.1.159....




Clustering has been done for 3.2.1.159....
Sequence has been collected for 3.2.1.161....
Feature has been extracted for 3.2.1.161....




3.2.1.161 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.162....
Feature has been extracted for 3.2.1.162....




3.2.1.162 has 2 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.164....
Feature has been extracted for 3.2.1.164....
Clustering has been done for 3.2.1.164....




Sequence has been collected for 3.2.1.165....
Feature has been extracted for 3.2.1.165....




Clustering has been done for 3.2.1.165....
Sequence has been collected for 3.2.1.166....
Feature has been extracted for 3.2.1.166....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.166....
Sequence has been collected for 3.2.1.167....
Feature has been extracted for 3.2.1.167....




3.2.1.167 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.169....
Feature has been extracted for 3.2.1.169....
Clustering has been done for 3.2.1.169....




Sequence has been collected for 3.2.1.170....
Feature has been extracted for 3.2.1.170....




Clustering has been done for 3.2.1.170....
Sequence has been collected for 3.2.1.171....
Feature has been extracted for 3.2.1.171....




Clustering has been done for 3.2.1.171....
Sequence has been collected for 3.2.1.172....
Feature has been extracted for 3.2.1.172....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.172....
Sequence has been collected for 3.2.1.173....
Feature has been extracted for 3.2.1.173....




3.2.1.173 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.174....
Feature has been extracted for 3.2.1.174....
3.2.1.174 has 3 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.176....
Feature has been extracted for 3.2.1.176....




Clustering has been done for 3.2.1.176....
Sequence has been collected for 3.2.1.177....
Feature has been extracted for 3.2.1.177....


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clustering has been done for 3.2.1.177....
Sequence has been collected for 3.2.1.178....
Feature has been extracted for 3.2.1.178....




Clustering has been done for 3.2.1.178....
Sequence has been collected for 3.2.1.181....
Feature has been extracted for 3.2.1.181....




Clustering has been done for 3.2.1.181....
Sequence has been collected for 3.2.1.185....
Feature has been extracted for 3.2.1.185....
3.2.1.185 has 4 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.197....
Feature has been extracted for 3.2.1.197....
3.2.1.197 has 4 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.199....
Feature has been extracted for 3.2.1.199....




3.2.1.199 has 8 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.204....
Feature has been extracted for 3.2.1.204....
3.2.1.204 has 4 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.205....
Feature has been extracted for 3.2.1.205....




3.2.1.205 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.206....
Feature has been extracted for 3.2.1.206....




3.2.1.206 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.211....
Feature has been extracted for 3.2.1.211....




3.2.1.211 has 1 samples which is less than 5 min_samples 
Sequence has been collected for 3.2.1.212....
Feature has been extracted for 3.2.1.212....
Clustering has been done for 3.2.1.212....
Mission completed in 651.444 seconds




In [14]:
from tqdm import tqdm
# Header row for the '$'-delimited results file written below.
titl=['EC_number','Method','label_type','lambda','FMI','ex_groups','pred_groups','distribution','total','CAZy_partial','Fasta_partial','X_aa']
# A context manager guarantees the file is flushed and closed even if a write
# fails part-way through; utf-8 matches the encoding used when the input files
# are read at the top of the notebook.
with open('ec_number_cluster_species_edited.txt','w',encoding='utf-8') as outpt:
    outpt.write('$'.join(titl)+'\n')
    # NOTE(review): total_data is built in an earlier cell; each entry is assumed
    # to be a string already joined with '$' in the same column order as titl - confirm.
    for line in tqdm(total_data):
        outpt.write(line+'\n')

100%|████████████████████████████████████████████████████████████████████████████| 972/972 [00:00<00:00, 486093.18it/s]
