In [21]:
%reload_ext autoreload
%autoreload 2
# BASE
import os
import json
# COOL
from pyeed.core import ProteinRecord
from pyeed.core import DNARecord

In [22]:
# Both data folders live under "<working directory>/data".
current_path = os.getcwd()
data_root = os.path.join(current_path, "data")

# input: per-protein alignment files, output: the filtered / annotated files
input_folder_alignment = os.path.join(data_root, "alignment_protein_ids_fetch_all")
output_folder_filters = os.path.join(data_root, "filtered_protein_fetch_ids_all")

In [23]:
# Read every per-protein JSON file from the input folder into `protein_dic`,
# keyed by the file name without its ".json" extension.
protein_dic = {}

for file in os.listdir(input_folder_alignment):
    if not file.endswith(".json"):
        continue

    # splitext removes only the final extension, so an id that itself contains
    # a dot (e.g. "X.1.json" -> "X.1") is kept intact instead of being cut at
    # the first dot and silently overwriting another entry
    protein_id = os.path.splitext(file)[0]

    with open(os.path.join(input_folder_alignment, file), "r") as f:
        # read in the file as a dic
        file_data = json.load(f)

    # 'protein' and 'dna' are rebuilt from their JSON strings,
    # 'alignment' is taken over unchanged
    protein_dic[protein_id] = {
        'protein': ProteinRecord.from_json_string(file_data['protein']),
        'alignment': file_data['alignment'],
        'dna': DNARecord.from_json_string(file_data['dna']),
    }

print(len(protein_dic))

207


In [24]:
# Mark the TEM-1 beta-lactamase domain on all proteins; the domain is located via the matching region in TEM-1.
# the region id is: PRK15442 --> https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?ascbin=8&maxaln=10&seltype=2&uid=PRK15442&querygi=1132604215&aln=2,6,26,27,33,54,228
# The position of that region inside the TEM-1 alignment is determined first and
# the same alignment range is then applied to every other protein.
# If a protein already has a region with the same id we don't do anything,
# if it doesn't have the region we add it.
set_up_filter = 'TEM-1'
set_up_region = 'PRK15442'

protein_filter = protein_dic[set_up_filter]['protein']
alignment_filter = protein_dic[set_up_filter]['alignment']

# Reset the bounds so that a missing region cannot silently reuse values left
# in the kernel by an earlier run of this cell.
set_up_filter_region_start = None
set_up_filter_region_end = None

for region in protein_filter.regions:
    if region.id == set_up_region:
        print(f"The region is {region.id} and the start is {region.start} and the end is {region.end}")

        # the region as it appears in the reference sequence ...
        tem_region = protein_filter.sequence[region.start:region.end]

        # ... and its position inside the reference alignment
        try:
            set_up_filter_region_start = alignment_filter.index(tem_region)
        except ValueError as err:
            raise ValueError(
                f"The sequence of region {set_up_region} could not be located in the "
                f"alignment of {set_up_filter} (for example because the alignment "
                f"differs from the raw sequence inside the region)"
            ) from err
        set_up_filter_region_end = set_up_filter_region_start + len(tem_region)

# fail fast instead of continuing with undefined bounds
if set_up_filter_region_start is None:
    raise ValueError(f"Region {set_up_region} was not found in the regions of {set_up_filter}")

for protein_id, protein_data in protein_dic.items():
    protein = protein_data['protein']
    alignment = protein_data['alignment']

    # the same alignment range is cut out of every protein
    protein_data['TEM-Domain'] = alignment[set_up_filter_region_start:set_up_filter_region_end]

    # check if the region is already in the protein
    if any(region.id == set_up_region for region in protein.regions):
        print(f"Region {set_up_region} already in protein {protein_id}")
        continue

    # NOTE(review): the check above reads `protein.regions`, while the new entry
    # is added through `add_to_coding_sequence` and uses alignment positions as
    # start/end - confirm that this is the intended target and coordinate system
    # (re-running the cell may add the entry again if it is not reflected in
    # `protein.regions`).
    protein.add_to_coding_sequence(start=set_up_filter_region_start, end=set_up_filter_region_end, id=set_up_region, name=set_up_region)

The region is PRK15442 and the start is 2 and the end is 286
Region PRK15442 already in protein TEM-19
Region PRK15442 already in protein TEM-P116
Region PRK15442 already in protein TEM-21
Region PRK15442 already in protein TEM-60
Region PRK15442 already in protein TEM-219
Region PRK15442 already in protein TEM-186
Region PRK15442 already in protein TEM-209
Region PRK15442 already in protein TEM-136
Region PRK15442 already in protein TEM-80
Region PRK15442 already in protein TEM-220
Region PRK15442 already in protein TEM-55
Region PRK15442 already in protein TEM-132
Region PRK15442 already in protein TEM-234
Region PRK15442 already in protein TEM-133
Region PRK15442 already in protein TEM-78
Region PRK15442 already in protein TEM-226
Region PRK15442 already in protein TEM-146
Region PRK15442 already in protein TEM-232
Region PRK15442 already in protein TEM-151
Region PRK15442 already in protein TEM-229
Region PRK15442 already in protein TEM-111
Region PRK15442 already in protein TEM-24

In [25]:
# With the domain slice stored for every protein, count the number of different
# amino acids inside that slice. Any TEM-n entry can serve as the reference.
set_up_region = 'PRK15442'

# Run the diff counter for all combinations of proteins: every protein is used
# once as the reference and compared against every protein (itself included).
for set_up_counter in protein_dic:
    reference_region = protein_dic[set_up_counter]['TEM-Domain']
    diff_counter_dic = {}

    for protein_id, protein_data in protein_dic.items():
        region = protein_data['TEM-Domain']

        # position-by-position comparison; indexing `region` means a slice that
        # is shorter than the reference raises instead of being truncated
        diff_counter = sum(
            1
            for i, reference_char in enumerate(reference_region)
            if reference_char != region[i]
        )
        diff_counter_dic[protein_id] = diff_counter

    # store the per-protein difference counts on the reference entry
    protein_dic[set_up_counter]['TEM-Domain-Diff'] = diff_counter_dic


In [26]:
# Display the keys stored for the TEM-1 entry (sanity check of the steps above).
protein_dic['TEM-1'].keys()

dict_keys(['protein', 'alignment', 'dna', 'TEM-Domain', 'TEM-Domain-Diff'])

In [27]:
# We now save the entries, including the added domain slice and diff counts, to the output folder.
# exist_ok=True: no error is raised when the folder already exists.
os.makedirs(output_folder_filters, exist_ok=True)

def dumper(obj):
    """Fallback serializer for objects that `json` cannot encode on its own.

    Intended to be passed as ``default=`` to ``json.dump``.

    Args:
        obj: The object the JSON encoder could not serialize.

    Returns:
        The result of ``obj.json()`` when that call succeeds, otherwise the
        object's ``__dict__``.
    """
    try:
        return obj.json()
    except Exception:
        # Best-effort fallback for objects without a usable ``json()``.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate instead of being swallowed.
        return obj.__dict__

# Write one JSON file per protein, named after its id; values the encoder
# cannot handle natively are converted through `dumper`.
for protein_id, protein_data in protein_dic.items():
    target_path = os.path.join(output_folder_filters, f"{protein_id}.json")
    with open(target_path, "w") as f:
        json.dump(protein_data, f, default=dumper)

In [28]:
# with this diff count we now create a network and show the proteins