In [2]:
pip install Biopython

Collecting Biopython
  Downloading biopython-1.81-cp39-cp39-win_amd64.whl (2.7 MB)
     ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
      --------------------------------------- 0.1/2.7 MB 1.7 MB/s eta 0:00:02
     --- ------------------------------------ 0.3/2.7 MB 3.2 MB/s eta 0:00:01
     ------ --------------------------------- 0.5/2.7 MB 3.6 MB/s eta 0:00:01
     -------------- ------------------------- 1.0/2.7 MB 6.1 MB/s eta 0:00:01
     -------------------- ------------------- 1.4/2.7 MB 5.8 MB/s eta 0:00:01
     --------------------------- ------------ 1.9/2.7 MB 6.9 MB/s eta 0:00:01
     ------------------------------------- -- 2.5/2.7 MB 8.0 MB/s eta 0:00:01
     ---------------------------------------  2.7/2.7 MB 7.9 MB/s eta 0:00:01
     ---------------------------------------  2.7/2.7 MB 7.9 MB/s eta 0:00:01
     ---------------------------------------  2.7/2.7 MB 7.9 MB/s eta 0:00:01
     ---------------------------------------- 2.7/2.7 MB 5.6 MB/s 

In [3]:
from Bio import SeqIO
from Bio.SeqUtils import ProtParam

# Define a function to calculate amino acid composition from a DNA sequence
def calculate_amino_acid_composition(fasta_file):
    # Read the FASTA file
    records = SeqIO.parse(fasta_file, "fasta")

    for record in records:
        # Create a protein sequence by translating the DNA sequence
        protein_sequence = record.seq.translate()

        # Calculate amino acid composition
        protein_analyzer = ProtParam.ProteinAnalysis(str(protein_sequence))
        aa_composition = protein_analyzer.get_amino_acids_percent()

        return aa_composition

# Specify the path to your FASTA file
fasta_file_path = "Downloads/test/ncbi_dataset/data/GCA_000931575.1/GCA_000931575.1_ASM93157v1_genomic.fna"

# Call the function to calculate amino acid composition
amino_acid_composition = calculate_amino_acid_composition(fasta_file_path)

# Print the amino acid composition
print("Amino Acid Composition:")
for aa, percentage in amino_acid_composition.items():
    print(f"{aa}: {percentage}%")




Amino Acid Composition:
A: 0.05194347265846521%
C: 0.03272729636231576%
D: 0.024108777922033607%
E: 0.02714735814136385%
F: 0.06697388283429663%
G: 0.03598198950633633%
H: 0.030008823257000515%
I: 0.07464832902461575%
K: 0.06427653354868797%
L: 0.10012203068153568%
M: 0.015709622224858187%
N: 0.0555897689216615%
P: 0.03707555340345358%
Q: 0.03666120155536309%
R: 0.06304485236887389%
S: 0.07843111766130068%
T: 0.05184435319676513%
V: 0.047747957082897995%
W: 0.014643681784280303%
Y: 0.034064596640662706%


In [1]:
# Define a function to calculate amino acid composition from a protein sequence
def calculate_amino_acid_composition(sequence):
    # Initialize a dictionary to store the counts of each amino acid
    amino_acid_counts = {}
    
    # Loop through the sequence and count amino acids
    for amino_acid in sequence:
        if amino_acid.isalpha():
            amino_acid = amino_acid.upper()
            if amino_acid in amino_acid_counts:
                amino_acid_counts[amino_acid] += 1
            else:
                amino_acid_counts[amino_acid] = 1
    
    return amino_acid_counts

# Initialize a dictionary to store the amino acid composition for all sequences
total_amino_acid_counts = {}

# Open the protein FASTA file
with open("Downloads/test/ncbi_dataset/data/GCF_000931575.1/protein.faa", "r") as file:
    current_sequence = ""
    
    for line in file:
        line = line.strip()
        if line.startswith(">"):
            # Calculate amino acid composition for the previous sequence
            if current_sequence:
                amino_acid_counts = calculate_amino_acid_composition(current_sequence)
                
                # Update the total counts
                for amino_acid, count in amino_acid_counts.items():
                    if amino_acid in total_amino_acid_counts:
                        total_amino_acid_counts[amino_acid] += count
                    else:
                        total_amino_acid_counts[amino_acid] = count
                
            # Start a new sequence
            current_sequence = ""
        else:
            # Accumulate the sequence data
            current_sequence += line

# Calculate the total number of amino acids
total_amino_acids = sum(total_amino_acid_counts.values())

# Print the amino acid composition
print("Amino Acid Composition:")
for amino_acid, count in total_amino_acid_counts.items():
    percentage = (count / total_amino_acids) * 100
    print(f"{amino_acid}: {count} ({percentage:.2f}%)")

print(f"Total Amino Acids: {total_amino_acids}")


Amino Acid Composition:
M: 12671 (2.38%)
Q: 25002 (4.70%)
N: 26447 (4.97%)
R: 23954 (4.50%)
I: 37751 (7.09%)
L: 55378 (10.41%)
K: 33811 (6.35%)
A: 43545 (8.18%)
F: 23618 (4.44%)
D: 27034 (5.08%)
H: 10901 (2.05%)
S: 31349 (5.89%)
T: 27657 (5.20%)
E: 34853 (6.55%)
V: 35264 (6.63%)
G: 35120 (6.60%)
P: 19616 (3.69%)
Y: 16764 (3.15%)
W: 6072 (1.14%)
C: 5409 (1.02%)
U: 2 (0.00%)
Total Amino Acids: 532218


In [4]:
# Define a dictionary to map one-letter codes to three-letter codes
one_to_three_letter_mapping = {
    "A": "Ala",
    "R": "Arg",
    "N": "Asn",
    "D": "Asp",
    "C": "Cys",
    "Q": "Gln",
    "E": "Glu",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "L": "Leu",
    "K": "Lys",
    "M": "Met",
    "F": "Phe",
    "P": "Pro",
    "S": "Ser",
    "T": "Thr",
    "W": "Trp",
    "Y": "Tyr",
    "V": "Val"
}

# Define a function to calculate amino acid composition from a protein sequence
def calculate_amino_acid_composition(sequence):
    # Initialize a dictionary to store the counts of each amino acid
    amino_acid_counts = {}
    
    # Loop through the sequence and count amino acids
    for amino_acid in sequence:
        if amino_acid.isalpha():
            amino_acid = amino_acid.upper()
            if amino_acid in amino_acid_counts:
                amino_acid_counts[amino_acid] += 1
            else:
                amino_acid_counts[amino_acid] = 1
    
    return amino_acid_counts

# Initialize a dictionary to store the amino acid composition for all sequences
total_amino_acid_counts = {}

# Open the protein FASTA file
with open("Downloads/test/ncbi_dataset/data/GCF_000931575.1/protein.faa", "r") as file:
    current_sequence = ""
    
    for line in file:
        line = line.strip()
        if line.startswith(">"):
            # Calculate amino acid composition for the previous sequence
            if current_sequence:
                amino_acid_counts = calculate_amino_acid_composition(current_sequence)
                
                # Update the total counts
                for amino_acid, count in amino_acid_counts.items():
                    if amino_acid in total_amino_acid_counts:
                        total_amino_acid_counts[amino_acid] += count
                    else:
                        total_amino_acid_counts[amino_acid] = count
                
            # Start a new sequence
            current_sequence = ""
        else:
            # Accumulate the sequence data
            current_sequence += line

# Calculate the total number of amino acids
total_amino_acids = sum(total_amino_acid_counts.values())

# Print the amino acid composition with three-letter codes
print("Amino Acid Composition:")
for one_letter_code, count in total_amino_acid_counts.items():
    three_letter_code = one_to_three_letter_mapping.get(one_letter_code, "Unknown")
    percentage = (count / total_amino_acids) * 100
    print(f"{three_letter_code}: {count} {percentage:.2f}")

print(f"Total Amino Acids: {total_amino_acids}")


Amino Acid Composition:
Met: 12671 2.38
Gln: 25002 4.70
Asn: 26447 4.97
Arg: 23954 4.50
Ile: 37751 7.09
Leu: 55378 10.41
Lys: 33811 6.35
Ala: 43545 8.18
Phe: 23618 4.44
Asp: 27034 5.08
His: 10901 2.05
Ser: 31349 5.89
Thr: 27657 5.20
Glu: 34853 6.55
Val: 35264 6.63
Gly: 35120 6.60
Pro: 19616 3.69
Tyr: 16764 3.15
Trp: 6072 1.14
Cys: 5409 1.02
Unknown: 2 0.00
Total Amino Acids: 532218


# hi467


In [1]:
# Define a dictionary to map one-letter codes to three-letter codes
one_to_three_letter_mapping = {
    "A": "Ala",
    "R": "Arg",
    "N": "Asn",
    "D": "Asp",
    "C": "Cys",
    "Q": "Gln",
    "E": "Glu",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "L": "Leu",
    "K": "Lys",
    "M": "Met",
    "F": "Phe",
    "P": "Pro",
    "S": "Ser",
    "T": "Thr",
    "W": "Trp",
    "Y": "Tyr",
    "V": "Val"
}

# Define a function to calculate amino acid composition from a protein sequence
def calculate_amino_acid_composition(sequence):
    # Initialize a dictionary to store the counts of each amino acid
    amino_acid_counts = {}
    
    # Loop through the sequence and count amino acids
    for amino_acid in sequence:
        if amino_acid.isalpha():
            amino_acid = amino_acid.upper()
            if amino_acid in amino_acid_counts:
                amino_acid_counts[amino_acid] += 1
            else:
                amino_acid_counts[amino_acid] = 1
    
    return amino_acid_counts

# Initialize a dictionary to store the amino acid composition for all sequences
total_amino_acid_counts = {}

# Open the protein FASTA file
with open("Downloads/hi467/ncbi_dataset/data/GCA_001975845.1/protein.faa", "r") as file:
    current_sequence = ""
    
    for line in file:
        line = line.strip()
        if line.startswith(">"):
            # Calculate amino acid composition for the previous sequence
            if current_sequence:
                amino_acid_counts = calculate_amino_acid_composition(current_sequence)
                
                # Update the total counts
                for amino_acid, count in amino_acid_counts.items():
                    if amino_acid in total_amino_acid_counts:
                        total_amino_acid_counts[amino_acid] += count
                    else:
                        total_amino_acid_counts[amino_acid] = count
                
            # Start a new sequence
            current_sequence = ""
        else:
            # Accumulate the sequence data
            current_sequence += line

# Calculate the total number of amino acids
total_amino_acids = sum(total_amino_acid_counts.values())

# Print the amino acid composition with three-letter codes
print("Amino Acid Composition:")
for one_letter_code, count in total_amino_acid_counts.items():
    three_letter_code = one_to_three_letter_mapping.get(one_letter_code, "Unknown")
    percentage = (count / total_amino_acids) * 100
    print(f"{three_letter_code}: {count} {percentage:.2f}")

print(f"Total Amino Acids: {total_amino_acids}")


Amino Acid Composition:
Gln: 24501 4.62
Gly: 35327 6.66
Cys: 5379 1.01
Tyr: 16661 3.14
His: 10896 2.05
Phe: 23592 4.45
Leu: 55377 10.44
Trp: 6033 1.14
Ala: 43703 8.24
Glu: 34407 6.49
Asn: 26171 4.93
Pro: 19867 3.74
Ser: 30934 5.83
Val: 35529 6.70
Thr: 27894 5.26
Arg: 23635 4.46
Asp: 26875 5.07
Lys: 33525 6.32
Ile: 37477 7.06
Met: 12725 2.40
Total Amino Acids: 530508


In [5]:
# Define a dictionary to map one-letter codes to three-letter codes
one_to_three_letter_mapping = {
    "A": "Ala",
    "R": "Arg",
    "N": "Asn",
    "D": "Asp",
    "C": "Cys",
    "Q": "Gln",
    "E": "Glu",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "L": "Leu",
    "K": "Lys",
    "M": "Met",
    "F": "Phe",
    "P": "Pro",
    "S": "Ser",
    "T": "Thr",
    "W": "Trp",
    "Y": "Tyr",
    "V": "Val"
}

# Define a function to calculate amino acid composition from a protein sequence
def calculate_amino_acid_composition(sequence):
    # Initialize a dictionary to store the counts of each amino acid
    amino_acid_counts = {}
    
    # Loop through the sequence and count amino acids
    for amino_acid in sequence:
        if amino_acid.isalpha():
            amino_acid = amino_acid.upper()
            if amino_acid in amino_acid_counts:
                amino_acid_counts[amino_acid] += 1
            else:
                amino_acid_counts[amino_acid] = 1
    
    return amino_acid_counts

# Initialize a dictionary to store the amino acid composition for all sequences
total_amino_acid_counts = {}

# Open the protein FASTA file
with open("Downloads/rdkw20/ncbi_dataset/data/GCA_000027305.1/protein.faa", "r") as file:
    current_sequence = ""
    
    for line in file:
        line = line.strip()
        if line.startswith(">"):
            # Calculate amino acid composition for the previous sequence
            if current_sequence:
                amino_acid_counts = calculate_amino_acid_composition(current_sequence)
                
                # Update the total counts
                for amino_acid, count in amino_acid_counts.items():
                    if amino_acid in total_amino_acid_counts:
                        total_amino_acid_counts[amino_acid] += count
                    else:
                        total_amino_acid_counts[amino_acid] = count
                
            # Start a new sequence
            current_sequence = ""
        else:
            # Accumulate the sequence data
            current_sequence += line

# Calculate the total number of amino acids
total_amino_acids = sum(total_amino_acid_counts.values())

# Print the amino acid composition with three-letter codes
print("Amino Acid Composition:")
for one_letter_code, count in total_amino_acid_counts.items():
    three_letter_code = one_to_three_letter_mapping.get(one_letter_code, "Unknown")
    percentage = (count / total_amino_acids) * 100
    print(f"{three_letter_code}: {count} {percentage:.2f}")

print(f"Total Amino Acids: {total_amino_acids}")


Amino Acid Composition:
Met: 12734 2.44
Ala: 42771 8.21
Ile: 37014 7.11
Lys: 32904 6.32
Gly: 34619 6.65
Asn: 25410 4.88
Phe: 23314 4.48
Arg: 23317 4.48
Val: 34778 6.68
Gln: 24183 4.64
His: 10702 2.05
Asp: 25932 4.98
Glu: 33763 6.48
Leu: 54787 10.52
Tyr: 16366 3.14
Ser: 30438 5.84
Thr: 27105 5.20
Pro: 19372 3.72
Trp: 5902 1.13
Cys: 5408 1.04
Unknown: 48 0.01
Unknown: 1 0.00
Unknown: 1 0.00
Total Amino Acids: 520869
