In [10]:
# Import the necessary libraries
from itertools import islice

from Bio import SeqIO

In [11]:
# --- Settings and running tallies used by the cells below ---

# Location of the FASTA file to analyse (a file path, despite the name).
data_dir = "../data/uniprot_sprot.fasta"

# One-letter codes of the 20 standard amino acids.
standard_aa = "ACDEFGHIKLMNPQRSTVWY"

# Number of sequences made up only of standard amino acids.
standard_count = 0
# Number of sequences holding at least one non-standard amino acid.
non_standard_count = 0

# Every distinct amino-acid letter found in the sequences.
vocabulary = set()

In [12]:
# Print a few records to check the format of the FASTA file.
# The file is opened explicitly so that it is closed again as soon as the samples
# have been read; a partially consumed SeqIO.parse(path, ...) iterator kept in the
# namespace would otherwise hold the file open. islice simply yields fewer records
# when the file contains less than the requested number, instead of raising
# StopIteration.
with open(data_dir) as fasta_handle:
    record_samples = list(islice(SeqIO.parse(fasta_handle, "fasta"), 3))

for record in record_samples:
    print(f"Record ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Length: {len(record)}")
    print(f"Description: {record.description}")

Record ID: sp|Q6GZX4|001R_FRG3G
Sequence: MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL
Length: 256
Description: sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
Record ID: sp|Q6GZX3|002L_FRG3G
Sequence: MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW
Length: 320
Description: sp|Q6GZX3|002L_FRG3G Uncharacterized protein 002L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-002L PE=4 SV=1
Record ID: sp|Q197F8|002R_IIV3
Sequence: MASNTVSAQGG

In [13]:
# Count the number of standard and non-standard sequences, and vocabulary size in the FASTA file.
# The tallies are (re)initialised here so that re-running this cell starts from
# zero instead of adding to the totals left behind by a previous run.
standard_count = 0
non_standard_count = 0
vocabulary = set()
standard_residues = set(standard_aa)  # built once, reused for every subset check

for record in SeqIO.parse(data_dir, "fasta"):
    residues = set(record.seq)  # distinct letters of this sequence, computed once
    vocabulary |= residues
    if residues <= standard_residues:
        standard_count += 1
    else:
        non_standard_count += 1

print(f"Total standard sequences: {standard_count}")
print(f"Total non-standard sequences: {non_standard_count}")
print(f"Unique amino acids found: {len(vocabulary)}")
# Sorted so the printed vocabulary is identical from run to run (set order is arbitrary).
print(f"Vocabulary: {sorted(vocabulary)}")

Total standard sequences: 571959
Total non-standard sequences: 2668
Unique amino acids found: 25
Vocabulary: {'D', 'H', 'G', 'K', 'F', 'R', 'C', 'Q', 'B', 'P', 'M', 'L', 'Z', 'U', 'I', 'S', 'N', 'Y', 'T', 'O', 'V', 'E', 'A', 'X', 'W'}
