In [1]:
# ANTIBODY CONTROL LIBRARY

In [2]:
# Etract PG9 DNA sequence (a string)

with open("source_sequence.fasta.txt", "r") as source:
    for line in source:
        if not line.startswith('>'):
            PG9_seq = line.strip()
            
print(PG9_seq)

CAGCGATTAGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGTCGTCCCTGAGACTCTCCTGTGCAGCGTCCGGATTCGACTTCAGTAGACAAGGCATGCACTGGGTCCGCCAGGCTCCAGGCCAGGGGCTGGAGTGGGTGGCATTTATTAAATATGATGGAAGTGAGAAATATCATGCTGACTCCGTATGGGGCCGACTCAGCATCTCCAGAGACAATTCCAAGGATACGCTTTATCTCCAAATGAATAGCCTGAGAGTCGAGGACACGGCTACATATTTTTGTGTGAGAGAGGCTGGTGGGCCCGACTACCGTAATGGGTACAACTATTACGATTTCTATGATGGTTATTATAACTACCACTATATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCGAGC


In [6]:
# extract sequences from control library - FASTA files

#create a dictionary with fasta names and sequences

keys = []
values = []

# loop through library and extract fasta names as keys for dictionary and then the sequences as the values

with open("control_library.fasta.txt", "r") as control_lib:
    for line in control_lib:
        if line.startswith('>'):
            keys.append(line.strip().lstrip(">"))
        else:
            values.append(line.strip())

control = dict(zip(keys, values)) # control library in dictionary format with fasta names as keys and sequences as values

print (control['74a74b4e-e0c6-4fb7-bf10-cf36fa80638c'])
print (len(keys)) #total number of DNA sequences (strings) in the file

CAGCGATTAGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGTCGTCCCTGAGACTCTCCTGTGCAGCGTCCGGATTCGACTTCAGTAGACAAGGCATGCACTGGGTCCGCCAGGCTCCAGGCCAGGGGCTGGAGTGGGTGGCATTTATTAAATATGATGGAAGTGAGAAATATCATGCTGACTCCGTATGGGGCCGACTCAGCATCTCCAGAGACAATTCCAAGGATACGCTTTATCTCCAAATGAATAGCCTGAGAGTCGAGGACACGGCTACATATGTATGTGTGAGAGAGGCTGGTGGGCCCGACTACCGTAATGGGTACAACTATTACGATTTCTATGATGGTTATTATAACTACCACTATATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCGAGC
100000


In [8]:
# create codon:amino acid dictionary (codons as keys in the dictionary)

bases = ['T', 'C', 'A', 'G']
codons = [a+b+c for a in bases for b in bases for c in bases]
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'  #STOP = *
codon_table = dict(zip(codons, amino_acids))

# How codon loop works:
# first loop - a and b remain constant (T) and c changes: TTT, TTC, TTA, TTG
# second loop - a is T, b is C and c changes: TCT TCC TCA TCG

print(codons)

['TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TAA', 'TAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC', 'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT', 'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG', 'GGT', 'GGC', 'GGA', 'GGG']


In [11]:
# function to convert codon to single letter amino acid code

def translate(codon):
    for key, value in codon_table.items():
        if codon == key:
            return value
    
print(translate('ATG'))

M


In [12]:
# loop through dictionary containing many DNA sequences (control library)
# compare sequences to PG9 sequence and output mutated codons, translate to amino acids and get positions

key_c = []  #store keys - fasta names
aa_c = []   #store amino acid (translate codon after moving through sequence every 3 nucleotides and compare to PG9)
pos_c = []  #store amino acid position


for key, value in control.items():  #loop through control library of sequences
    for pos in range(0, len(value)+1, 3):  # move through sequence (values) every 3 nucleotides (codons)
        if value[pos:pos+3] != PG9_seq[pos:pos+3]:
            #print key
            #print "mutated codon:", value[pos:pos+3] 
            #print "mutated amino acid:", translate(value[pos:pos+3]) #translate function - codon to single letter aa
            #print "amino acid position:", (pos+3)/3
            key_c.append(key)
            aa_c.append(translate(value[pos:pos+3]))
            pos_c.append((pos+3)/3)
            
aa_mut_c = dict(zip(key_c, aa_c))  #keys (fasta file name) with corresponding mutated amino acid
pos_mut_c = dict(zip(key_c, pos_c))#same keys as above with corresponding amino acid position

In [14]:
# Count the number of times each mutated postion occurs - 
# output is (position, number of sequences with a mutation at that position)

from collections import Counter

unique = Counter(pos_c)
print (unique.items())

print(" ")
print ("Total number of sequences in control library:", len(keys))
print ("Total number of sequences with mutations:", len(key_c))
print ("Number of unmutated sequences:", len(keys)-len(key_c))

dict_items([(94.0, 9833), (28.0, 9809), (112.0, 9732), (79.0, 9751), (76.0, 9841), (10.0, 9742), (13.0, 9753), (128.0, 9950), (55.0, 9890), (78.0, 10034)])
 
Total number of sequences in control library: 100000
Total number of sequences with mutations: 98335
Number of unmutated sequences: 1665


In [15]:
# function to output a list of positions mutated - unique

def pos_mutated(pos):
    unique = []
    for x in pos:
        if x not in unique:
            unique.append(x)
    return unique


track = pos_mutated(pos_c) #track the positions mutated
print (track)
print (len(track))

[94.0, 28.0, 112.0, 79.0, 76.0, 10.0, 13.0, 128.0, 55.0, 78.0]
10


In [16]:
# generate multiple empty lists using the position number as the key - 
# to append keys (fasta name) sorted by position of mutation

pos_keys_lists = {}

for i in range(len(track)):
    pos_keys_lists[track[i]] = []
    
    
print (pos_keys_lists)

{94.0: [], 28.0: [], 112.0: [], 79.0: [], 76.0: [], 10.0: [], 13.0: [], 128.0: [], 55.0: [], 78.0: []}


In [18]:
# extract keys/fasta file names for each position and store in pos_keys_lists dictionary
# store all keys/fasta file names corresponding to a particular position where position is the key 
# and keys (values) are stored as a list

for x in track:
    for key, value in pos_mut_c.items(): #dictionary of fasta file names (keys) with associated aa positions
        if pos_mut_c[key] == x:              #if value (aa position) = position from track (x)
            pos_keys_lists[x].append(key)    #save fasta file name (key) in list corresponding position key of new dict

print (len(pos_keys_lists[78]))
print (len(pos_keys_lists[128]))
print (len(pos_keys_lists[94]))

10034
9950
9833


In [19]:
# generate multiple empty lists using position number as key - store mutated ammino acids for that position

aa_lists = {}

for i in range(len(track)):
    aa_lists[track[i]] = []
    
    
print (aa_lists)

{94.0: [], 28.0: [], 112.0: [], 79.0: [], 76.0: [], 10.0: [], 13.0: [], 128.0: [], 55.0: [], 78.0: []}


In [20]:
# using keys extracted according to position, now grab all the mutated amino acids for that position

for x in track:
    for key, value in aa_mut_c.items(): #dict of fasta file names/keys with corresponding aa
        if key in pos_keys_lists[x]:        #if fasta file name (key) in list corresponding to position key (x)
            aa_lists[x].append(aa_mut_c[key]) #grab the values (mutated aa) into list corresponding to position key (x)
            
            
# use Counter to get number of each amino acid for positions
for x in track:
    aa_pos = Counter(aa_lists[x])
    print ("Position:", x)
    print ("Total number of sequences for position", x, ":", len(aa_lists[x]))
    print (aa_pos.items())
    print(" ")

Position: 94.0
Total number of sequences for position 94.0 : 9833
dict_items([('V', 671), ('M', 181), ('H', 351), ('I', 491), ('L', 1005), ('S', 952), ('G', 619), ('N', 333), ('R', 969), ('Y', 317), ('T', 653), ('C', 321), ('A', 688), ('E', 375), ('Q', 289), ('P', 660), ('K', 327), ('D', 314), ('W', 150), ('F', 167)])
 
Position: 28.0
Total number of sequences for position 28.0 : 9809
dict_items([('R', 926), ('E', 351), ('T', 636), ('Y', 320), ('W', 163), ('C', 297), ('Q', 332), ('L', 973), ('I', 498), ('H', 325), ('A', 679), ('F', 153), ('G', 711), ('P', 678), ('V', 619), ('N', 348), ('S', 967), ('D', 353), ('K', 326), ('M', 154)])
 
Position: 112.0
Total number of sequences for position 112.0 : 9732
dict_items([('L', 1004), ('Y', 343), ('N', 314), ('A', 673), ('C', 364), ('V', 646), ('P', 605), ('S', 951), ('K', 323), ('H', 316), ('D', 154), ('F', 322), ('T', 679), ('R', 958), ('W', 167), ('G', 631), ('I', 473), ('Q', 289), ('E', 324), ('M', 196)])
 
Position: 79.0
Total number of se

In [21]:
# function to count amino acids in a list

a_list = ['F', 'L', 'S', 'Y', 'C', 'W', 'P', 'H', 'Q', 'R', 'I', 'M', 'T', 'N', 'K', 'V', 'A', 'D', 'E', 'G']


def count_aa(aa):
    aa_count = {} # generate dictionary with amnio acids as key to keep count of each aa
    for i in range(len(a_list)):
        aa_count[a_list[i]] = []
        
    for x in a_list:
        count = 0
        for y in aa:
            if y == x:
                count += 1
        aa_count[x].append(count)
    return aa_count
        
aa_test = ['F', 'F', 'A', 'H', 'H', 'H']
print (count_aa(aa_test))

{'F': [2], 'L': [0], 'S': [0], 'Y': [0], 'C': [0], 'W': [0], 'P': [0], 'H': [3], 'Q': [0], 'R': [0], 'I': [0], 'M': [0], 'T': [0], 'N': [0], 'K': [0], 'V': [0], 'A': [1], 'D': [0], 'E': [0], 'G': [0]}


In [22]:
# count the number of amino acids at each position

aa_count_c = {} # generate dictionary with positions as key to store lists of amino acid total counts for each position

for i in range(len(track)):
    aa_count_c[track[i]] = []


for x in track:                                #track positions - keys in aa_count_c
    for key, value in aa_lists.items():    #dict with positions as keys and list of mutated amino acids at that position
        if key == x:
            aa_count_c[x].append(count_aa(value)) #count amino acids at position x
            
print ("Amino acid counts at each position:" )           
for x in track:
    print ("Position", x)
    print (aa_count_c[x])
    print ("Total number of sequences for position", x ,":", len(aa_lists[x]))
    print(" ")

Amino acid counts at each position:
Position 94.0
[{'F': [167], 'L': [1005], 'S': [952], 'Y': [317], 'C': [321], 'W': [150], 'P': [660], 'H': [351], 'Q': [289], 'R': [969], 'I': [491], 'M': [181], 'T': [653], 'N': [333], 'K': [327], 'V': [671], 'A': [688], 'D': [314], 'E': [375], 'G': [619]}]
Total number of sequences for position 94.0 : 9833
 
Position 28.0
[{'F': [153], 'L': [973], 'S': [967], 'Y': [320], 'C': [297], 'W': [163], 'P': [678], 'H': [325], 'Q': [332], 'R': [926], 'I': [498], 'M': [154], 'T': [636], 'N': [348], 'K': [326], 'V': [619], 'A': [679], 'D': [353], 'E': [351], 'G': [711]}]
Total number of sequences for position 28.0 : 9809
 
Position 112.0
[{'F': [322], 'L': [1004], 'S': [951], 'Y': [343], 'C': [364], 'W': [167], 'P': [605], 'H': [316], 'Q': [289], 'R': [958], 'I': [473], 'M': [196], 'T': [679], 'N': [314], 'K': [323], 'V': [646], 'A': [673], 'D': [154], 'E': [324], 'G': [631]}]
Total number of sequences for position 112.0 : 9732
 
Position 79.0
[{'F': [307], 'L

In [23]:
# function to get amino acid frequency

def freq_aa(aa):
    aa_freq = {} # generate dictionary with amnio acids as key to keep count of each aa
    for i in range(len(a_list)):
        aa_freq[a_list[i]] = []
        
    for x in a_list:
        count = 0.0
        for y in aa:
            total = len(aa)  #length of the list, that is, total number of mutated amino acids at that position
            if y == x:
                count += 1.0
        aa_freq[x].append("%.4f"%(count/total))
    return aa_freq
        
aa_test = ['F', 'F', 'A', 'H', 'H', 'H']
print (freq_aa(aa_test))

{'F': ['0.3333'], 'L': ['0.0000'], 'S': ['0.0000'], 'Y': ['0.0000'], 'C': ['0.0000'], 'W': ['0.0000'], 'P': ['0.0000'], 'H': ['0.5000'], 'Q': ['0.0000'], 'R': ['0.0000'], 'I': ['0.0000'], 'M': ['0.0000'], 'T': ['0.0000'], 'N': ['0.0000'], 'K': ['0.0000'], 'V': ['0.0000'], 'A': ['0.1667'], 'D': ['0.0000'], 'E': ['0.0000'], 'G': ['0.0000']}


In [25]:
# get the frequency of amino acids at each position for control library - baseline frequencies

aa_freq_c = {} # generate dictionary with positions as key to store lists of amino acid total counts for each position

for i in range(len(track)):
    aa_freq_c[track[i]] = []


for x in track:
    for key, value in aa_lists.items():
        if key == x:
            aa_freq_c[x].append(freq_aa(value))
            

print ("Amino acid frequencies at each position:")            
for x in track:
    print ("Position", x)
    print (aa_freq_c[x])
    print(" ")

Amino acid frequencies at each position:
Position 94.0
[{'F': ['0.0170'], 'L': ['0.1022'], 'S': ['0.0968'], 'Y': ['0.0322'], 'C': ['0.0326'], 'W': ['0.0153'], 'P': ['0.0671'], 'H': ['0.0357'], 'Q': ['0.0294'], 'R': ['0.0985'], 'I': ['0.0499'], 'M': ['0.0184'], 'T': ['0.0664'], 'N': ['0.0339'], 'K': ['0.0333'], 'V': ['0.0682'], 'A': ['0.0700'], 'D': ['0.0319'], 'E': ['0.0381'], 'G': ['0.0630']}]
 
Position 28.0
[{'F': ['0.0156'], 'L': ['0.0992'], 'S': ['0.0986'], 'Y': ['0.0326'], 'C': ['0.0303'], 'W': ['0.0166'], 'P': ['0.0691'], 'H': ['0.0331'], 'Q': ['0.0338'], 'R': ['0.0944'], 'I': ['0.0508'], 'M': ['0.0157'], 'T': ['0.0648'], 'N': ['0.0355'], 'K': ['0.0332'], 'V': ['0.0631'], 'A': ['0.0692'], 'D': ['0.0360'], 'E': ['0.0358'], 'G': ['0.0725']}]
 
Position 112.0
[{'F': ['0.0331'], 'L': ['0.1032'], 'S': ['0.0977'], 'Y': ['0.0352'], 'C': ['0.0374'], 'W': ['0.0172'], 'P': ['0.0622'], 'H': ['0.0325'], 'Q': ['0.0297'], 'R': ['0.0984'], 'I': ['0.0486'], 'M': ['0.0201'], 'T': ['0.0698'], 'N'

In [26]:
def freq_aa100(aa):
    aa_freq100 = {} # generate dictionary with amnio acids as key to keep count of each aa
    for i in range(len(a_list)):
        aa_freq100[a_list[i]] = []
        
    for x in a_list:
        count = 0.0
        freq100 = 0.0
        for y in aa:
            total = len(aa)  #length of the list, that is, total number of mutated amino acids at that position
            if y == x:
                count += 1.0
                freq100 = "%.4f%%" % (100 * (count/total))
        aa_freq100[x].append(freq100)
    return aa_freq100
        
aa_test = ['F', 'F', 'A', 'H', 'H', 'H']
print (freq_aa100(aa_test))

{'F': ['33.3333%'], 'L': [0.0], 'S': [0.0], 'Y': [0.0], 'C': [0.0], 'W': [0.0], 'P': [0.0], 'H': ['50.0000%'], 'Q': [0.0], 'R': [0.0], 'I': [0.0], 'M': [0.0], 'T': [0.0], 'N': [0.0], 'K': [0.0], 'V': [0.0], 'A': ['16.6667%'], 'D': [0.0], 'E': [0.0], 'G': [0.0]}


In [28]:
# get the frequency of amino acids at each position for control library - baseline frequencies, as a percentage

aa_freq_c100 = {} # generate dictionary with positions as key to store lists of amino acid total counts for each position

for i in range(len(track)):
    aa_freq_c100[track[i]] = []


for x in track:
    for key, value in aa_lists.items():
        if key == x:
            aa_freq_c100[x].append(freq_aa100(value))
            

print ("Amino acid frequencies at each position:")            
for x in track:
    print ("Position", x)
    print (aa_freq_c100[x])
    print(" ")

Amino acid frequencies at each position:
Position 94.0
[{'F': ['1.6984%'], 'L': ['10.2207%'], 'S': ['9.6817%'], 'Y': ['3.2238%'], 'C': ['3.2645%'], 'W': ['1.5255%'], 'P': ['6.7121%'], 'H': ['3.5696%'], 'Q': ['2.9391%'], 'R': ['9.8546%'], 'I': ['4.9934%'], 'M': ['1.8407%'], 'T': ['6.6409%'], 'N': ['3.3866%'], 'K': ['3.3255%'], 'V': ['6.8240%'], 'A': ['6.9968%'], 'D': ['3.1933%'], 'E': ['3.8137%'], 'G': ['6.2951%']}]
 
Position 28.0
[{'F': ['1.5598%'], 'L': ['9.9195%'], 'S': ['9.8583%'], 'Y': ['3.2623%'], 'C': ['3.0278%'], 'W': ['1.6617%'], 'P': ['6.9120%'], 'H': ['3.3133%'], 'Q': ['3.3846%'], 'R': ['9.4403%'], 'I': ['5.0770%'], 'M': ['1.5700%'], 'T': ['6.4838%'], 'N': ['3.5478%'], 'K': ['3.3235%'], 'V': ['6.3105%'], 'A': ['6.9222%'], 'D': ['3.5987%'], 'E': ['3.5783%'], 'G': ['7.2484%']}]
 
Position 112.0
[{'F': ['3.3087%'], 'L': ['10.3165%'], 'S': ['9.7719%'], 'Y': ['3.5245%'], 'C': ['3.7402%'], 'W': ['1.7160%'], 'P': ['6.2166%'], 'H': ['3.2470%'], 'Q': ['2.9696%'], 'R': ['9.8438%'], 'I