In [4]:
!pip install Bio

Collecting Bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
     ---------------------------------------- 0.0/276.4 kB ? eta -:--:--
     -------- ------------------------------ 61.4/276.4 kB 1.1 MB/s eta 0:00:01
     -------------------------------- ----- 235.5/276.4 kB 2.4 MB/s eta 0:00:01
     -------------------------------------- 276.4/276.4 kB 2.1 MB/s eta 0:00:00
Collecting pooch
  Downloading pooch-1.7.0-py3-none-any.whl (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ---------------------------------------- 60.9/60.9 kB ? eta 0:00:00
Collecting gprofiler-official
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biopython>=1.80
  Downloading biopython-1.81-cp310-cp310-win_amd64.whl (2.7 MB)
     ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
     ----- ---------------------------------- 0.4/2.7 MB 7.8 MB/s eta 0:00:01


[notice] A new release of pip is available: 23.0.1 -> 23.3
[notice] To update, run: C:\Users\Paulius\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [72]:
from Bio import SeqIO
import numpy as np
from collections import defaultdict
import math
from scipy.spatial.distance import pdist, squareform

In [73]:
from Bio.Seq import Seq

def extract_valid_start_stop_pairs(sequence):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]
    
    valid_pairs = []
    
    def get_pairs(seq):
        pairs = []
        for i in range(len(seq) - 2):
            if seq[i:i+3] == start_codon:
                for j in range(i+3, len(seq) - 2, 3):
                    if seq[j:j+3] in stop_codons and not any(stop in seq[i+3:j] for stop in stop_codons):
                        pairs.append((i, j))
                        break
        return pairs
    
    valid_pairs.extend(get_pairs(sequence))
    
    reverse_seq = str(Seq(sequence).reverse_complement())
    valid_pairs.extend(get_pairs(reverse_seq))

    return valid_pairs
#returns whole fragment
def filter_long_fragments(sequence, pairs, max_length=100):
    fragments = []
    for start, stop in pairs:
        fragment = sequence[start:stop+3]
        if len(fragment) <= max_length:
            fragments.append(fragment)
    return fragments
#returns start and finish pairs
def filter_long_fragment_pairs(sequence, pairs, max_length=100):
    positions = []
    for start, stop in pairs:
        fragment_length = stop + 3 - start
        if fragment_length <= max_length:
            positions.append((start, stop))
    return positions

In [74]:
def convert_to_protein(dna_sequence):
    dna_seq = Seq(dna_sequence)
    protein_seq = dna_seq.translate()
    return protein_seq
def extract_dikodons(protein_sequence):
    dikodons = [protein_sequence[i:i+2] for i in range(0, len(protein_sequence)-1)]
    return dikodons

In [75]:
def codon_frequencies(sequence):
    codons = all_possible_codons()
    frequencies = {codon: 0 for codon in codons}
    
    for i in range(0, len(sequence)-2, 3):
        codon = sequence[i:i+3]
        if codon in frequencies:
            frequencies[codon] += 1

    total_codons = sum(frequencies.values())
    for codon, count in frequencies.items():
        frequencies[codon] = count / total_codons

    return frequencies
def dikodon_frequencies(sequence):
    dikodons = all_possible_dikodons()
    frequencies = {dikodon: 0 for dikodon in dikodons}
    
    for i in range(0, len(sequence)-5, 3):
        dikodon = sequence[i:i+6]
        if dikodon in frequencies:
            frequencies[dikodon] += 1

    total_dikodons = sum(frequencies.values())
    for dikodon, count in frequencies.items():
        frequencies[dikodon] = count / total_dikodons 

    return frequencies

def all_possible_codons():
    bases = 'ATGC'
    return [a+b+c for a in bases for b in bases for c in bases]

def all_possible_dikodons():
    codons = all_possible_codons()
    return [a+b for a in codons for b in codons]

In [76]:
def amino_acid_frequencies(protein_sequences):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    amino_acid_counts = {aa: 0 for aa in amino_acids}

    for protein in protein_sequences:
        for aa in protein:
            if aa in amino_acid_counts:
                amino_acid_counts[aa] += 1

    total_amino_acids = sum(amino_acid_counts.values())

    amino_acid_freqs = {aa: count / total_amino_acids for aa, count in amino_acid_counts.items()}
    return amino_acid_freqs
def print_amino_acid_frequencies(protein_sequences):
    amino_acid_freqs = amino_acid_frequencies(protein_sequences)
    header = "|Amino Acid | Frequency|"
    print("-" * len(header))
    print(header)
    print("-" * len(header))
    for aa, freq in amino_acid_freqs.items():
        print(f"|{aa:^11}| {freq:.6f} |")
    print("-" * len(header))

def dikodon_frequency_matrix(protein_sequences):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    
    matrix = {(aa1, aa2): 0 for aa1 in amino_acids for aa2 in amino_acids}
    total_dikodons = 0

    for protein in protein_sequences:
        for i in range(len(protein) - 1):
            dikodon = (protein[i], protein[i+1])
            if dikodon in matrix:
                matrix[dikodon] += 1
                total_dikodons += 1

    for dikodon, count in matrix.items():
        matrix[dikodon] = count / total_dikodons
    
    return matrix
    
def print_dikodon_frequency_matrix(protein_sequences):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    matrix = dikodon_frequency_matrix(protein_sequences)
    print("-"*190)
    print("|" + " " * 8 + "|", " | ".join([f"{aa:^6}" for aa in amino_acids]), "|")
    print("-" * 190)

    for aa1 in amino_acids:
        row_data = ["| " + f"{matrix[(aa1, aa2)]:>6.4f} " for aa2 in amino_acids]
        print(f"| {aa1:^6}", "".join(row_data), "|")

    print("-" * 190)

In [77]:
def print_results(file_name):
    sequence_record = SeqIO.read(file_name, "fasta")
    sequence = sequence_record.seq

    pairs = extract_valid_start_stop_pairs(sequence)
    #print("\n")


    filtered_fragments = filter_long_fragment_pairs(sequence, pairs)
    #print("\n")

    protein_sequences = []
    for start, stop in filtered_fragments:
        dna_fragment = sequence[start:stop+3]  
        protein_sequence = convert_to_protein(dna_fragment)
        protein_sequences.append(protein_sequence)
        dikodons = extract_dikodons(protein_sequence)
        #print(f"\nDNA Fragment: {dna_fragment}")
        #print(f"Protein sequence: {protein_sequence}")
        #print(f"Dikodons: {dikodons}")
    print_amino_acid_frequencies(protein_sequences)
    print_dikodon_frequency_matrix(protein_sequences)

In [78]:
def calculate_amino_acid_dikodon_frequencies(dna_sequence):
    protein_sequence = convert_to_protein(dna_sequence)

    amino_acid_freqs = amino_acid_frequencies(protein_sequence)  # Assuming you have a function for this
    dikodon_freqs = dikodon_frequencies(protein_sequence)
    
    dikodon_format_conversion = {(dikodon[0], dikodon[1]): freq for dikodon, freq in dikodon_freqs.items()}
    
    return amino_acid_freqs, dikodon_format_conversion

def euclidean_distance(freqs1, freqs2):
    dist = 0
    for k in freqs1.keys():
        dist += (freqs1[k] - freqs2[k]) ** 2
    return math.sqrt(dist)

def compute_distance_matrices(fasta_files):
    sequences = {}
    for file in fasta_files:
        with open(file, 'r') as f:
            for record in SeqIO.parse(f, 'fasta'):
                sequences[record.id] = str(record.seq)
    
    freqs = {id: calculate_codon_dikodon_frequencies(seq) for id, seq in sequences.items()}
    
    distance_matrix_codon = {}
    distance_matrix_dikodon = {}
    for id1 in sequences:
        distance_matrix_codon[id1] = {}
        distance_matrix_dikodon[id1] = {}
        for id2 in sequences:
            dist_codon = euclidean_distance(freqs[id1][0], freqs[id2][0])
            dist_dikodon = euclidean_distance(freqs[id1][1], freqs[id2][1])
            
            distance_matrix_codon[id1][id2] = dist_codon
            distance_matrix_dikodon[id1][id2] = dist_dikodon
    
    return distance_matrix_codon, distance_matrix_dikodon

def print_phylip_format(distance_matrix, title):
    ids = list(distance_matrix.keys())
    n = len(ids)
    
    print(title)
    print(n)
    for id1 in ids:
        row = [id1] + ["{:.3f}".format(distance_matrix[id1][id2]) for id2 in ids]
        print(" ".join(row))

fasta_files = ["data\\mamalian1.fasta", "data\\mamalian2.fasta", "data\\mamalian3.fasta", 
               "data\\mamalian4.fasta", "data\\bacterial1.fasta", "data\\bacterial2.fasta",
               "data\\bacterial3.fasta", "data\\bacterial4.fasta"]
distance_matrix_codon, distance_matrix_dikodon = compute_distance_matrices(fasta_files)
print_phylip_format(distance_matrix_codon, "Codon Distance Matrix")
print_phylip_format(distance_matrix_dikodon, "Dikodon Distance Matrix")

Codon Distance Matrix
8
coronavirus 0.000 0.111 0.067 0.159 0.035 0.068 0.065 0.087
adenovirus 0.111 0.000 0.174 0.053 0.141 0.043 0.116 0.184
U18337.1 0.067 0.174 0.000 0.223 0.042 0.132 0.111 0.082
herpesvirus 0.159 0.053 0.223 0.000 0.191 0.093 0.164 0.231
Lactococcus_phage 0.035 0.141 0.042 0.191 0.000 0.098 0.069 0.067
KM389305.1 0.068 0.043 0.132 0.093 0.098 0.000 0.082 0.143
NC_028697.1 0.065 0.116 0.111 0.164 0.069 0.082 0.000 0.081
KC821626.1 0.087 0.184 0.082 0.231 0.067 0.143 0.081 0.000
Dikodon Distance Matrix
8
coronavirus 0.000 0.019 0.023 0.023 0.020 0.020 0.020 0.025
adenovirus 0.019 0.000 0.023 0.017 0.020 0.016 0.018 0.026
U18337.1 0.023 0.023 0.000 0.025 0.019 0.022 0.022 0.024
herpesvirus 0.023 0.017 0.025 0.000 0.024 0.018 0.022 0.030
Lactococcus_phage 0.020 0.020 0.019 0.024 0.000 0.020 0.018 0.023
KM389305.1 0.020 0.016 0.022 0.018 0.020 0.000 0.019 0.027
NC_028697.1 0.020 0.018 0.022 0.022 0.018 0.019 0.000 0.023
KC821626.1 0.025 0.026 0.024 0.030 0.023 0.027 0.

In [79]:
print_results("data\\mamalian1.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.031453 |
|     C     | 0.042436 |
|     D     | 0.016975 |
|     E     | 0.022966 |
|     F     | 0.050924 |
|     G     | 0.031952 |
|     H     | 0.047928 |
|     I     | 0.045931 |
|     K     | 0.043435 |
|     L     | 0.127309 |
|     M     | 0.127808 |
|     N     | 0.039940 |
|     P     | 0.025462 |
|     Q     | 0.045931 |
|     R     | 0.039940 |
|     S     | 0.069396 |
|     T     | 0.060409 |
|     V     | 0.072391 |
|     W     | 0.017474 |
|     Y     | 0.039940 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [86]:
print_results("data\\mamalian2.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.065159 |
|     C     | 0.036124 |
|     D     | 0.025996 |
|     E     | 0.033086 |
|     F     | 0.046590 |
|     G     | 0.064146 |
|     H     | 0.027684 |
|     I     | 0.037812 |
|     K     | 0.038488 |
|     L     | 0.085078 |
|     M     | 0.082714 |
|     N     | 0.033761 |
|     P     | 0.068535 |
|     Q     | 0.037475 |
|     R     | 0.076975 |
|     S     | 0.090142 |
|     T     | 0.054693 |
|     V     | 0.049966 |
|     W     | 0.022957 |
|     Y     | 0.022620 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [87]:
print_results("data\\mamalian3.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.018357 |
|     C     | 0.030157 |
|     D     | 0.029283 |
|     E     | 0.015297 |
|     F     | 0.067745 |
|     G     | 0.025787 |
|     H     | 0.037150 |
|     I     | 0.111014 |
|     K     | 0.040647 |
|     L     | 0.102710 |
|     M     | 0.102273 |
|     N     | 0.035839 |
|     P     | 0.032343 |
|     Q     | 0.024913 |
|     R     | 0.054633 |
|     S     | 0.086976 |
|     T     | 0.049825 |
|     V     | 0.057255 |
|     W     | 0.013986 |
|     Y     | 0.063811 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [88]:
print_results("data\\mamalian4.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.074598 |
|     C     | 0.035592 |
|     D     | 0.019015 |
|     E     | 0.030717 |
|     F     | 0.032179 |
|     G     | 0.072160 |
|     H     | 0.025353 |
|     I     | 0.035592 |
|     K     | 0.024866 |
|     L     | 0.078986 |
|     M     | 0.074110 |
|     N     | 0.024378 |
|     P     | 0.053145 |
|     Q     | 0.031692 |
|     R     | 0.103364 |
|     S     | 0.108727 |
|     T     | 0.059971 |
|     V     | 0.072160 |
|     W     | 0.023403 |
|     Y     | 0.019990 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [71]:
print_results("data\\bacterial1.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.039463 |
|     C     | 0.029203 |
|     D     | 0.018942 |
|     E     | 0.026835 |
|     F     | 0.061563 |
|     G     | 0.037885 |
|     H     | 0.022099 |
|     I     | 0.064720 |
|     K     | 0.062352 |
|     L     | 0.092344 |
|     M     | 0.138122 |
|     N     | 0.037096 |
|     P     | 0.021310 |
|     Q     | 0.051302 |
|     R     | 0.050513 |
|     S     | 0.089187 |
|     T     | 0.049724 |
|     V     | 0.050513 |
|     W     | 0.012628 |
|     Y     | 0.044199 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [90]:
print_results("data\\bacterial2.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.045950 |
|     C     | 0.031542 |
|     D     | 0.027259 |
|     E     | 0.022586 |
|     F     | 0.043614 |
|     G     | 0.047118 |
|     H     | 0.029595 |
|     I     | 0.061137 |
|     K     | 0.038551 |
|     L     | 0.103583 |
|     M     | 0.091121 |
|     N     | 0.029984 |
|     P     | 0.059579 |
|     Q     | 0.035047 |
|     R     | 0.069704 |
|     S     | 0.090732 |
|     T     | 0.047508 |
|     V     | 0.072430 |
|     W     | 0.020639 |
|     Y     | 0.032321 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [91]:
print_results("data\\bacterial3.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.046138 |
|     C     | 0.023847 |
|     D     | 0.029549 |
|     E     | 0.021773 |
|     F     | 0.031104 |
|     G     | 0.041991 |
|     H     | 0.018144 |
|     I     | 0.052359 |
|     K     | 0.064800 |
|     L     | 0.114567 |
|     M     | 0.131156 |
|     N     | 0.040435 |
|     P     | 0.022291 |
|     Q     | 0.044583 |
|     R     | 0.071021 |
|     S     | 0.076724 |
|     T     | 0.051322 |
|     V     | 0.054432 |
|     W     | 0.025920 |
|     Y     | 0.037843 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [92]:
print_results("data\\bacterial4.fasta")

------------------------
|Amino Acid | Frequency|
------------------------
|     A     | 0.022508 |
|     C     | 0.020900 |
|     D     | 0.017685 |
|     E     | 0.016881 |
|     F     | 0.064309 |
|     G     | 0.040997 |
|     H     | 0.012058 |
|     I     | 0.073151 |
|     K     | 0.081190 |
|     L     | 0.125402 |
|     M     | 0.216238 |
|     N     | 0.045820 |
|     P     | 0.011254 |
|     Q     | 0.035370 |
|     R     | 0.049035 |
|     S     | 0.040193 |
|     T     | 0.023312 |
|     V     | 0.047428 |
|     W     | 0.012058 |
|     Y     | 0.044212 |
------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|        |   A    |   C    |   D    |   E    |   F    |   G    |   H    |   I    |   K    |   L    |   M    |   N    |   P    |   Q    |   R    |   S    |   T    |   V    |   W    |   Y    |
------------------

In [80]:
#aggregate same category viruses to one 
def aggregate_results_by_category(file_names):
    total_amino_acid_counts = {aa: 0 for aa in 'ACDEFGHIKLMNPQRSTVWY'}
    total_dikodon_counts = {(a, b): 0 for a in 'ACDEFGHIKLMNPQRSTVWY' for b in 'ACDEFGHIKLMNPQRSTVWY'}

    protein_sequences = []

    for file_name in file_names:
        sequence_record = SeqIO.read(file_name, "fasta")
        sequence = str(sequence_record.seq)

        pairs = extract_valid_start_stop_pairs(sequence)
        filtered_fragments = filter_long_fragment_pairs(sequence, pairs)

        for start, stop in filtered_fragments:
            dna_fragment = sequence[start:stop+3]
            protein_sequence = convert_to_protein(dna_fragment)
            protein_sequences.append(protein_sequence)

            dikodons = extract_dikodons(protein_sequence)
            for dikodon in dikodons:
                if (dikodon[0], dikodon[1]) in total_dikodon_counts:
                    total_dikodon_counts[(dikodon[0], dikodon[1])] += 1

    amino_acid_freqs = amino_acid_frequencies(protein_sequences)

    total_dikodons = sum(total_dikodon_counts.values())
    dikodon_frequencies = {dikodon: count/total_dikodons for dikodon, count in total_dikodon_counts.items()}
    
    return amino_acid_freqs, dikodon_frequencies

#top 15
def print_categroty_results(amino_acid_frequencies, dikodon_frequencies):
    print("Amino Acid | Frequency")
    
    sorted_amino_acids = sorted(amino_acid_frequencies.items(), key=lambda x: x[1], reverse=True)
    for aa, freq in sorted_amino_acids:
        print(f"{aa} \t {freq:.5f}")

    print("\nDikodon Frequencies:")
    
    sorted_dikodons = sorted(dikodon_frequencies.items(), key=lambda x: x[1], reverse=True)[:15]
    for (a, b), freq in sorted_dikodons:
        print(f"({a}, {b}) \t {freq:.5f}")

In [81]:
#mamalian frequencies
mammalian_filenames = ["data\\mamalian1.fasta", "data\\mamalian2.fasta", "data\\mamalian3.fasta", 
               "data\\mamalian4.fasta"]
mam_amino_acid_freqs, mam_dikodon_freqs = aggregate_results_by_category(mammalian_filenames)
print_categroty_results(mam_amino_acid_freqs, mam_dikodon_freqs)

Amino Acid | Frequency
L 	 0.09716
M 	 0.09534
S 	 0.08899
R 	 0.06933
V 	 0.06148
I 	 0.05707
T 	 0.05589
F 	 0.04955
G 	 0.04955
A 	 0.04847
P 	 0.04697
K 	 0.03708
C 	 0.03590
Y 	 0.03590
Q 	 0.03493
H 	 0.03386
N 	 0.03353
E 	 0.02601
D 	 0.02332
W 	 0.01967

Dikodon Frequencies:
(L, L) 	 0.01440
(S, S) 	 0.01245
(M, L) 	 0.01219
(M, S) 	 0.01089
(R, S) 	 0.00921
(R, R) 	 0.00843
(L, S) 	 0.00817
(S, L) 	 0.00804
(V, L) 	 0.00804
(I, L) 	 0.00778
(S, R) 	 0.00765
(M, V) 	 0.00752
(V, S) 	 0.00752
(L, P) 	 0.00713
(L, F) 	 0.00687


In [82]:
#bacterial frequencies
bacterial_filenames = ["data\\bacterial1.fasta", "data\\bacterial2.fasta",
               "data\\bacterial3.fasta", "data\\bacterial4.fasta"]
bac_amino_acid_freqs, bac_dikodon_freqs = aggregate_results_by_category(bacterial_filenames)
print_categroty_results(bac_amino_acid_freqs, bac_dikodon_freqs)

Amino Acid | Frequency
M 	 0.13285
L 	 0.10845
S 	 0.07763
R 	 0.06293
I 	 0.06150
V 	 0.05908
K 	 0.05765
F 	 0.04709
T 	 0.04466
G 	 0.04295
A 	 0.04067
Q 	 0.04067
Y 	 0.03810
N 	 0.03696
P 	 0.03382
C 	 0.02711
D 	 0.02469
E 	 0.02212
H 	 0.02197
W 	 0.01912

Dikodon Frequencies:
(M, L) 	 0.01717
(L, L) 	 0.01610
(I, L) 	 0.01252
(M, K) 	 0.01234
(L, S) 	 0.01163
(M, I) 	 0.01127
(M, V) 	 0.01127
(M, S) 	 0.00948
(S, S) 	 0.00948
(M, F) 	 0.00912
(M, Q) 	 0.00877
(S, L) 	 0.00877
(V, L) 	 0.00859
(L, V) 	 0.00841
(M, T) 	 0.00841


In [83]:
def compare_amino_acid_frequencies(mam_amino_acid_freqs, bac_amino_acid_freqs):
    amino_acid_difference = {aa: abs(mam_amino_acid_freqs[aa] - bac_amino_acid_freqs[aa]) for aa in mam_amino_acid_freqs}
    
    sorted_aas = sorted(amino_acid_difference.keys(), key=lambda x: abs(amino_acid_difference[x]), reverse=True)
    
    for aa in sorted_aas:
        difference = amino_acid_difference[aa]
        print(f"Amino Acid: {aa} - Difference: {difference:.4f}")
        
compare_amino_acid_frequencies(mam_amino_acid_freqs, bac_amino_acid_freqs)

Amino Acid: M - Difference: 0.0375
Amino Acid: K - Difference: 0.0206
Amino Acid: P - Difference: 0.0132
Amino Acid: H - Difference: 0.0119
Amino Acid: S - Difference: 0.0114
Amino Acid: L - Difference: 0.0113
Amino Acid: T - Difference: 0.0112
Amino Acid: C - Difference: 0.0088
Amino Acid: A - Difference: 0.0078
Amino Acid: G - Difference: 0.0066
Amino Acid: R - Difference: 0.0064
Amino Acid: Q - Difference: 0.0057
Amino Acid: I - Difference: 0.0044
Amino Acid: E - Difference: 0.0039
Amino Acid: N - Difference: 0.0034
Amino Acid: F - Difference: 0.0025
Amino Acid: V - Difference: 0.0024
Amino Acid: Y - Difference: 0.0022
Amino Acid: D - Difference: 0.0014
Amino Acid: W - Difference: 0.0005


In [84]:
def compare_dikodon_frequencies(mam_dikodon_freqs, bac_dikodon_freqs):
    dikodon_difference = {dk: abs(mam_dikodon_freqs[dk] - bac_dikodon_freqs[dk]) for dk in mam_dikodon_freqs}
    
    sorted_dks = sorted(dikodon_difference.keys(), key=lambda x: abs(dikodon_difference[x]), reverse=True)
    
    for dk in sorted_dks:
        difference = dikodon_difference[dk]
        print(f"Dikodon: {dk} - Difference: {difference:.4f}")

compare_dikodon_frequencies(mam_dikodon_freqs, bac_dikodon_freqs)

Dikodon: ('M', 'K') - Difference: 0.0078
Dikodon: ('M', 'Q') - Difference: 0.0059
Dikodon: ('M', 'F') - Difference: 0.0052
Dikodon: ('M', 'L') - Difference: 0.0050
Dikodon: ('I', 'L') - Difference: 0.0047
Dikodon: ('K', 'R') - Difference: 0.0047
Dikodon: ('M', 'I') - Difference: 0.0047
Dikodon: ('R', 'S') - Difference: 0.0040
Dikodon: ('M', 'V') - Difference: 0.0037
Dikodon: ('M', 'T') - Difference: 0.0036
Dikodon: ('M', 'D') - Difference: 0.0035
Dikodon: ('L', 'S') - Difference: 0.0035
Dikodon: ('T', 'S') - Difference: 0.0034
Dikodon: ('K', 'L') - Difference: 0.0032
Dikodon: ('S', 'R') - Difference: 0.0032
Dikodon: ('C', 'L') - Difference: 0.0032
Dikodon: ('P', 'R') - Difference: 0.0031
Dikodon: ('M', 'A') - Difference: 0.0031
Dikodon: ('M', 'Y') - Difference: 0.0031
Dikodon: ('H', 'L') - Difference: 0.0031
Dikodon: ('R', 'G') - Difference: 0.0030
Dikodon: ('M', 'E') - Difference: 0.0030
Dikodon: ('K', 'K') - Difference: 0.0030
Dikodon: ('S', 'S') - Difference: 0.0030
Dikodon: ('R', '