In [None]:
# Python script to compare libraries of antibody DNA sequences to the PG9 VDJ wild type.
# Sequences are converted to amino acids to look for enrichment of mutations in libraries (1 - 4)
# compared to the control library.

In [1]:
# Extract the PG9 DNA sequence (a string made up of the genetic code consisting of A, T, C, G)
# from a FASTA file: lines starting with '>' are headers, other non-blank lines are sequence data.

# Sequence lines are collected and joined so that a sequence wrapped over several
# lines is read in full, and a blank line can no longer overwrite it with ''.
sequence_lines = []

with open("source_sequence.fasta.txt", "r") as source:
    for line in source:
        line = line.strip()
        if line.startswith('>'):
            sequence_lines = []  # a new record starts; keep only the last record, as before
        elif line:
            sequence_lines.append(line)

# Fail with a clear message instead of a NameError further down.
if not sequence_lines:
    raise ValueError("No sequence data found in source_sequence.fasta.txt")

PG9_seq = "".join(sequence_lines)

print(PG9_seq)

CAGCGATTAGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGTCGTCCCTGAGACTCTCCTGTGCAGCGTCCGGATTCGACTTCAGTAGACAAGGCATGCACTGGGTCCGCCAGGCTCCAGGCCAGGGGCTGGAGTGGGTGGCATTTATTAAATATGATGGAAGTGAGAAATATCATGCTGACTCCGTATGGGGCCGACTCAGCATCTCCAGAGACAATTCCAAGGATACGCTTTATCTCCAAATGAATAGCCTGAGAGTCGAGGACACGGCTACATATTTTTGTGTGAGAGAGGCTGGTGGGCCCGACTACCGTAATGGGTACAACTATTACGATTTCTATGATGGTTATTATAACTACCACTATATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCGAGC


In [2]:
# Extract DNA sequences from a control or mutated library in FASTA format
# (each DNA sequence is preceded by a '>' header line carrying its fasta name).

# Build a dictionary with fasta names as keys and DNA sequences as values.

keys = []    # fasta names, in file order
values = []  # DNA sequences, kept parallel to `keys`

# Loop through the library. Sequence lines are accumulated per record and joined
# when the next header (or the end of the file) is reached, so wrapped sequences
# and blank lines cannot shift `values` out of step with `keys`.

with open("control_library.fasta.txt", "r") as lib:
    record_lines = None  # sequence lines of the current record; None until the first header
    for line in lib:
        line = line.strip()
        if line.startswith('>'):
            if record_lines is not None:
                values.append("".join(record_lines))
            keys.append(line.lstrip(">"))
            record_lines = []
        elif line and record_lines is not None:
            # Sequence text that appears before any header has no name and is ignored.
            record_lines.append(line)
    if record_lines is not None:
        values.append("".join(record_lines))  # flush the final record

# Library in dictionary format with fasta names as keys and sequences as values.
# If a fasta name occurs more than once, the dictionary keeps only its last sequence.
DNA_lib = dict(zip(keys, values))

# Number of header lines read; this can exceed len(DNA_lib) when names repeat.
print ("Total number of sequences in file",len(keys))

Total number of sequences in file 100000


In [3]:
# Build the codon -> amino acid lookup table: each key is a DNA triplet (codon)
# and each value is the single-letter amino acid it encodes ('*' marks STOP).

bases = list('TCAG')

# Codons are generated with the first base varying slowest and the third fastest:
#   TTT, TTC, TTA, TTG, TCT, TCC, TCA, TCG, ...
# which is the order the amino acid string below is written in.
dinucleotides = [first + second for first in bases for second in bases]
codons = [pair + third for pair in dinucleotides for third in bases]

amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'  # STOP = *

# Pair the i-th codon with the i-th amino acid letter.
codon_table = {codon: amino_acids[index] for index, codon in enumerate(codons)}

print(codon_table)

{'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*', 'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}


In [4]:
# Loop through the dictionary of library DNA sequences (100000 in a library).
# Each sequence is compared codon by codon with the PG9 sequence; for every codon
# that differs, record the sequence name, the amino acid the mutated codon encodes
# and the 1-based amino acid position of the mutation.

identifier = []  # fasta name of the sequence carrying each recorded mutation
aa = []          # amino acid encoded by each mutated codon
pos_aa = []      # 1-based amino acid (codon) position of each mutation


for key, value in DNA_lib.items():  # loop through library of DNA sequences
    # Visit complete codons only. The previous upper bound (len(value)+1) produced an
    # extra empty slice past the end, and incomplete trailing codons, which raised
    # KeyError in codon_table whenever they differed from the PG9 slice.
    for pos in range(0, len(value) - 2, 3):
        codon = value[pos:pos+3]
        if codon != PG9_seq[pos:pos+3]:
            identifier.append(key)
            aa.append(codon_table[codon])  # KeyError here means a codon with a non-ACGT base
            pos_aa.append(pos // 3 + 1)

# NOTE(review): dict(zip(...)) keeps one entry per fasta name, so a sequence with
# several mutated codons retains only its LAST mutation here, while `pos_aa` above
# still lists all of them. Confirm each library sequence carries a single mutation.
aa_mut = dict(zip(identifier, aa))       # fasta name -> mutated amino acid
pos_mut = dict(zip(identifier, pos_aa))  # fasta name -> amino acid position of that mutation

In [5]:
# Tally how often each amino acid position appears among the recorded mutations:
# `unique` maps position -> number of recorded mutations at that position.

from collections import Counter

unique = Counter()
unique.update(pos_aa)

# Distinct mutated positions, in order of first appearance in `pos_aa`.

track = list(unique)

print("Amino acid positions at which mutations occur:",track)
print("Total number of positions mutated in every sequence:",len(track))

Amino acid positions at which mutations occur: [94, 28, 112, 79, 76, 10, 13, 128, 55, 78]
Total number of positions mutated in every sequence: 10


In [6]:
# One empty list per mutated position, keyed by the position number.
# These lists later receive the identifiers of the sequences mutated at that position.

pos_keys_lists = {position: [] for position in track}


print (pos_keys_lists)

{94: [], 28: [], 112: [], 79: [], 76: [], 10: [], 13: [], 128: [], 55: [], 78: []}


In [7]:
# Group sequence identifiers by mutated position: for every position in `track`,
# collect the fasta names whose entry in `pos_mut` equals that position and store
# them in the list held under that position in `pos_keys_lists`.

for position in track:
    matching_ids = [
        seq_id
        for seq_id, mutated_position in pos_mut.items()  # fasta name -> amino acid position
        if mutated_position == position
    ]
    if matching_ids:
        pos_keys_lists[position].extend(matching_ids)


In [8]:
# One empty list per mutated position, keyed by the position number.
# These lists later receive the mutated amino acids observed at that position.

aa_lists = {position: [] for position in track}


In [9]:
# Using the identifiers grouped by position, collect all the mutated amino acids for each position.

for x in track:
    # Turn the identifier list into a set once per position: testing membership in the
    # list for every sequence made this loop scale with
    # positions x sequences x list length.
    ids_at_position = set(pos_keys_lists[x])
    aa_lists[x].extend(
        residue
        for seq_id, residue in aa_mut.items()  # fasta name -> mutated amino acid
        if seq_id in ids_at_position
    )


In [10]:
# function to get amino acid frequency as a percentage

amino_acid_list = ['F', 'L', 'S', 'Y', 'C', 'W', 'P', 'H', 'Q', 'R', 'I', 'M', 'T', 'N', 'K', 'V', 'A', 'D', 'E', 'G']

def freq_aa100(aa):
    """Return the percentage frequency of each amino acid in ``aa``.

    Parameters
    ----------
    aa : list
        Single-letter amino acid codes observed at one position.

    Returns
    -------
    dict
        Maps every letter of ``amino_acid_list`` (in that order) to a one-element
        list holding its frequency formatted as ``"%.4f%%"``, e.g. ``['12.5000%']``.
        Amino acids that do not occur are reported as ``['0.0000%']`` (previously
        the bare float ``0.0``), so every value has the same type.

    Notes
    -----
    The denominator is ``len(aa)``, so entries that are not in ``amino_acid_list``
    (such as the stop symbol ``'*'``) count towards the total but get no key of
    their own. An empty ``aa`` yields ``'0.0000%'`` for every amino acid.
    """
    total = len(aa)  # total number of mutated amino acids at that position

    # Count all amino acids in a single pass over the input.
    counts = dict.fromkeys(amino_acid_list, 0)
    for residue in aa:
        if residue in counts:
            counts[residue] += 1

    aa_freq100 = {}
    for residue, count in counts.items():
        percentage = 100 * (count / total) if total else 0.0  # guard against empty input
        aa_freq100[residue] = ["%.4f%%" % percentage]
    return aa_freq100

In [11]:
# Amino acid frequencies (as percentages) at each mutated position of the control library.
# These are the baseline frequencies that the mutated libraries are then compared against.


# position -> list holding the frequency dictionary returned by freq_aa100 for that
# position (the list stays empty if the position has no entry in aa_lists)
aa_freq_lib100 = {}

for position in track:
    if position in aa_lists:
        aa_freq_lib100[position] = [freq_aa100(aa_lists[position])]
    else:
        aa_freq_lib100[position] = []


print ("Amino acid frequencies at each position:")
for position in track:
    print("Position", position)
    print(aa_freq_lib100[position])
    print(" ")

Amino acid frequencies at each position:
Position 94
[{'F': ['1.6984%'], 'L': ['10.2207%'], 'S': ['9.6817%'], 'Y': ['3.2238%'], 'C': ['3.2645%'], 'W': ['1.5255%'], 'P': ['6.7121%'], 'H': ['3.5696%'], 'Q': ['2.9391%'], 'R': ['9.8546%'], 'I': ['4.9934%'], 'M': ['1.8407%'], 'T': ['6.6409%'], 'N': ['3.3866%'], 'K': ['3.3255%'], 'V': ['6.8240%'], 'A': ['6.9968%'], 'D': ['3.1933%'], 'E': ['3.8137%'], 'G': ['6.2951%']}]
 
Position 28
[{'F': ['1.5598%'], 'L': ['9.9195%'], 'S': ['9.8583%'], 'Y': ['3.2623%'], 'C': ['3.0278%'], 'W': ['1.6617%'], 'P': ['6.9120%'], 'H': ['3.3133%'], 'Q': ['3.3846%'], 'R': ['9.4403%'], 'I': ['5.0770%'], 'M': ['1.5700%'], 'T': ['6.4838%'], 'N': ['3.5478%'], 'K': ['3.3235%'], 'V': ['6.3105%'], 'A': ['6.9222%'], 'D': ['3.5987%'], 'E': ['3.5783%'], 'G': ['7.2484%']}]
 
Position 112
[{'F': ['3.3087%'], 'L': ['10.3165%'], 'S': ['9.7719%'], 'Y': ['3.5245%'], 'C': ['3.7402%'], 'W': ['1.7160%'], 'P': ['6.2166%'], 'H': ['3.2470%'], 'Q': ['2.9696%'], 'R': ['9.8438%'], 'I': ['4