In [4]:
from Bio import Entrez
from Bio import SeqIO

In [5]:
# Contact address that Bio.Entrez attaches to the NCBI E-utilities requests
# issued by the cells below.
# NOTE(review): this is a hard-coded personal address; consider loading it from
# an environment variable or config file before sharing the notebook.
Entrez.email = 'minotetrastichus@gmail.com'

In [8]:
# Gather the protein FASTA record of every annotated CDS that falls inside a
# region listed in ./Clusters.txt.
#
# Input format, as consumed below: whitespace-separated fields per line, where
# field 1 is a GenBank nucleotide accession and field 2 is "<start>_<end>"
# (either order). A blank line marks the start of the next cluster.
#
# Result: prot_seqs, a list with one FASTA string per matching CDS, in input
# order. Duplicates are kept; the following cell de-duplicates on write.

# Bases of slack allowed on each side of a listed region when matching a CDS.
# NOTE(review): the printed output shows feature starts one below the listed
# starts, so the slack presumably also absorbs a 0-based/1-based difference
# between Biopython and the input file; confirm against how Clusters.txt is made.
COORD_TOLERANCE = 10

prot_seqs = []
# accession -> parsed GenBank records. Consecutive lines usually repeat the
# same accession, so each record is downloaded and parsed once instead of once
# per line. The records stay in memory for the duration of the cell.
genbank_records = {}
# protein_id -> FASTA text, so a protein hit by several regions is fetched once.
protein_fasta = {}

with open('./Clusters.txt') as clusters_file:
    cluster_number = 0
    print ('Cluster ', cluster_number)
    for line in clusters_file:
        fields = line.split()
        if not fields:
            # Blank line (including one holding only spaces/tabs) separates
            # clusters; it carries no region to look up.
            cluster_number += 1
            print ('\n\n Cluster ', cluster_number)
            continue

        gb_id = fields[1]
        coordinates = fields[2].split('_')
        # The file may list a region as end_start; normalise to start <= end.
        start, end = sorted((int(coordinates[0]), int(coordinates[1])))
        print(f'Gb id: {gb_id}, start: {start}, end: {end}')

        if gb_id not in genbank_records:
            handle = Entrez.efetch(db='nucleotide', rettype="gb", id=gb_id)
            try:
                genbank_records[gb_id] = list(SeqIO.parse(handle, "genbank"))
            finally:
                # Always release the HTTP connection, even if parsing fails.
                handle.close()

        for rec in genbank_records[gb_id]:
            for feature in rec.features:
                # Only coding sequences with a protein accession can be fetched.
                if feature.type != 'CDS' or 'protein_id' not in feature.qualifiers:
                    continue

                # int() yields the numeric coordinate for exact and fuzzy
                # ('<123' / '>456') positions alike, without string surgery.
                fstart = int(feature.location.start)
                fend = int(feature.location.end)
                if fstart > fend:
                    fstart, fend = fend, fstart

                # Keep the CDS only when it lies within the listed region,
                # give or take the tolerance on each side.
                if fstart < start - COORD_TOLERANCE or fend > end + COORD_TOLERANCE:
                    continue

                if 'product' not in feature.qualifiers:
                    product = 'unknown'
                else:
                    product = feature.qualifiers["product"][0]

                prot_id = feature.qualifiers["protein_id"][0]
                print (f'Feature start: {fstart}, feature end: {fend}, product: {product}, prot_id: {prot_id}')

                if prot_id not in protein_fasta:
                    prot_handle = Entrez.efetch(db="protein", id=prot_id, retmode="text", rettype="fasta")
                    try:
                        protein_fasta[prot_id] = prot_handle.read()
                    finally:
                        prot_handle.close()
                prot_seqs.append(protein_fasta[prot_id])

Cluster  0
Gb id: CABMGN010000008.1, start: 240389, end: 241435
Feature start: 240388, feature end: 241435, product: D-inositol-3-phosphate glycosyltransferase, prot_id: VVB80282.1
Gb id: CABMGN010000008.1, start: 242848, end: 244005
Feature start: 242847, feature end: 244005, product: Glycosyltransferase Gtf1, prot_id: VVB80288.1
Gb id: CABMGN010000008.1, start: 244002, end: 244889
Feature start: 244001, feature end: 244889, product: Glycosyltransferase AglE, prot_id: VVB80290.1
Gb id: CABMGN010000008.1, start: 247150, end: 248259
Feature start: 247149, feature end: 248259, product: Trehalose synthase, prot_id: VVB80296.1
Gb id: CABMGN010000008.1, start: 249026, end: 250255
Feature start: 249025, feature end: 250240, product: Polysaccharide biosynthesis protein, prot_id: VVB80300.1


 Cluster  1
Gb id: CABMIH010000075.1, start: 83777, end: 84511
Feature start: 83776, feature end: 84511, product: Sulfite exporter TauE/SafE, prot_id: VVB96841.1
Gb id: CABMIH010000075.1, start: 85252, en

Gb id: CABMCA010000104.1, start: 19759, end: 21363
Feature start: 19758, feature end: 21363, product: Uncharacterised protein, prot_id: VVB55859.1
Gb id: CABMCA010000104.1, start: 21447, end: 22064
Gb id: CABMCA010000104.1, start: 22067, end: 22498
Feature start: 22066, feature end: 22498, product: Uncharacterised protein, prot_id: VVB55861.1
Gb id: CABMCA010000104.1, start: 23312, end: 24037
Feature start: 23311, feature end: 24037, product: Peptidase S24-like protein, prot_id: VVB55863.1
Gb id: CABMCA010000104.1, start: 25599, end: 26612
Feature start: 25598, feature end: 26612, product: Uncharacterised protein, prot_id: VVB55865.1


 Cluster  12
Gb id: CABMCA010000104.1, start: 35498, end: 36100
Feature start: 35497, feature end: 36100, product: Uncharacterised protein, prot_id: VVB55876.1
Gb id: CABMCA010000104.1, start: 36173, end: 36634
Feature start: 36172, feature end: 36634, product: Uncharacterised protein, prot_id: VVB55877.1
Gb id: CABMCA010000104.1, start: 37687, end: 3914

Feature start: 3364, feature end: 4144, product: Uncharacterised protein, prot_id: VVB89876.1
Gb id: CABMIB010000079.1, start: 4162, end: 7431
Feature start: 4161, feature end: 7431, product: Uncharacterised protein, prot_id: VVB89879.1
Gb id: CABMIB010000079.1, start: 8961, end: 9929
Feature start: 8960, feature end: 9929, product: Uncharacterised protein, prot_id: VVB89883.1
Gb id: CABMIB010000079.1, start: 9992, end: 10618
Feature start: 9991, feature end: 10618, product: Uncharacterised protein, prot_id: VVB89885.1


 Cluster  25
Gb id: CABMII010000005.1, start: 10731, end: 12665
Feature start: 10730, feature end: 12665, product: Osmosensitive K+ channel His kinase sensor domain protein, prot_id: VVB92121.1
Gb id: CABMII010000005.1, start: 12806, end: 13372
Feature start: 12805, feature end: 13372, product: Potassium-transporting ATPase KdpC subunit, prot_id: VVB92123.1
Gb id: CABMII010000005.1, start: 14168, end: 16264
Feature start: 14167, feature end: 16249, product: Potassium-t

In [9]:
# Write every distinct FASTA record collected in prot_seqs exactly once.
# dict.fromkeys() drops duplicates while keeping first-seen order, so the file
# is identical from run to run; iterating a set() of strings would emit the
# records in an order that changes with each interpreter's hash seed.
with open("cluster_proteins_unique.fasta", "w") as ouf:
    for protein_seq in dict.fromkeys(prot_seqs):
        ouf.write(protein_seq)