# Diversification of sequence pool

In this activity we will be generating *in silico* diversity using different methods and assessing each method together.

Please download this notebook and add it to your folder.

<font color='grey'>Created by Parisa Hosseinzadeh for *Protein Engineering and Design*, Winter 2022</font>

In [1]:
#@title Importing necessary modules
#@markdown Run this cell to download 
#@markdown necessary modules to run the code.

#importing modules necessary for plotting
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random

In [2]:
#@title Generating our sequence-fitness landscape
#@markdown Run this cell to generate 
#@markdown your sequence-fitness landscape

#Sample the fitness landscape on 100 evenly spaced points in [0, 4]
x = np.linspace(0, 4, num=100)
fx = [
    (-0.8*(x_val-2)**6+4*(x_val-2)**4-0.5*(x_val-1)**3-3.5*(x_val-2)**2+2*x_val)*3
    for x_val in x
]

def plot(x,fx):
  '''
  Draws the fitness curve as a solid red line on the current axes.

  x  -- sequence values for the horizontal axis
  fx -- fitness values matching x

  The view is fixed to x in [0, 4] and fitness in [0, 15], with a grid.
  The figure is not shown here; the caller decides when to call plt.show().
  '''
  axes = plt.gca()
  axes.plot(x, fx, '-r')
  axes.grid()
  axes.set_xlim(0, 4)
  axes.set_ylim(0, 15)

#Hidden for now. You can remove the comment to see.
#plot(x,fx)

def get_fitness(x):
  '''
  Returns the fitness of sequence value x.

  The landscape is a degree-6 polynomial in x scaled by 3. The terms are
  added in the same left-to-right order as the plotted curve so both
  give the same floating-point result.
  '''
  shifted = x-2
  sextic = -0.8*shifted**6
  quartic = 4*shifted**4
  cubic = 0.5*(x-1)**3
  quadratic = 3.5*shifted**2
  linear = 2*x
  return (sextic+quartic-cubic-quadratic+linear)*3


# Random mutagenesis

In the next steps, you'll work your way through a random mutagenesis scenario. Follow along with the in-class assignment and answer the questions.

In [None]:
#@title Our starting population
#@markdown By running this cell, you'll see your original
#@markdown population of choice.
#@markdown Which parent sequence will you choose
#@markdown to start your random mutagenesis?

#Number of candidate parents offered to the student (P1..P5)
n_parents = 5
#Safety cap so an impossible fitness window cannot loop forever
max_attempts = 100000

#One slot per parent; each slot holds a single (sequence, fitness) tuple
pop_dict = {key: [] for key in range(1, n_parents + 1)}

counter = 1
attempts = 0
#Keep drawing until every slot is filled. A fixed number of draws could
#leave a slot empty and crash the report below with an IndexError.
#Candidates get their own name so the plotting grid `x` is not overwritten.
while counter <= n_parents:
  attempts += 1
  if attempts > max_attempts:
    raise RuntimeError(
        'Could not find {} parents with fitness between 0 and 7.5'.format(n_parents)
        )
  candidate = random.random()*4
  fitness = get_fitness(candidate)
  #parents must be functional (fitness > 0) but leave room to improve (< 7.5)
  if 0 < fitness < 7.5:
    pop_dict[counter].append((candidate, fitness))
    counter += 1


for key in pop_dict:
  print('Parent sequence P{}'.format(key),'has a fitness of:',pop_dict[key][0][1])


In [None]:
#@title Random mutagenesis of sequence ⬇️

# Form inputs for this cell: the parent to mutate (looked up in pop_dict
# below) and the two settings bound as defaults of generate_lib.
parent = 'P2'#@param {type:"string"}
mutation_rate = 1.5 #@param {type:"number"}
library_size = 100 #@param {type:"number"}


#@markdown - `parent` Write down the name of the parent sequence (P1, P2, etc).
#@markdown -  `mutation_rate` Upper limit on mutations per sequence: each offspring receives between 0 and this value (rounded) mutations.
#@markdown -  `library_size` How many offspring to generate.

#@markdown Run this cell to create your mutated library
#@markdown and see the results.

def mutate_sequence(num,mut_loc=-1,new_num=-2):
  '''
  Applies a single mutation to a sequence and returns the mutant as a float.

  The sequence is handled as the string format(num, '.9f'): index 0 is the
  integer digit, index 1 the decimal point, indices 2-10 the decimals.

  num     -- sequence value to mutate
  mut_loc -- string index to mutate; -1 picks one at random from 0-10
  new_num -- replacement digit (or, at index 0, the amount added to num);
             -2 picks one at random
  '''
  digits = format(num, '.9f')
  position = random.randint(0,10) if mut_loc == -1 else mut_loc

  if position == 0:
    #the integer digit moves by -1, 0 or +1 instead of being replaced
    step = new_num
    if step == -2:
      step = random.randint(-1,1)
      #cancel random steps that would leave the allowed range
      leaves_range = (num < 1 and step == -1) or (num > 3 and step == 1)
      if leaves_range:
        step = 0
    return float(format(num+step, '.9f'))

  if position == 1:
    #index 1 is the decimal point: move on to one of the decimals instead
    position += random.randint(1,8)

  #any digit 0-9 is allowed at a decimal position
  replacement = str(random.randint(0,9)) if new_num == -2 else str(new_num)
  return float(digits[:position] + replacement + digits[position+1:])

#getting the actual number that the parent has.
seq = ['P1','P2','P3','P4','P5']
#tolerate stray spaces / lower case in the form field and fail with a
#readable message instead of an opaque "... is not in list" error
parent_name = parent.strip().upper()
if parent_name not in seq:
  raise ValueError(
      'Unknown parent {!r}; choose one of {}'.format(parent, ', '.join(seq))
      )
#pop_dict keys are 1-based; each slot holds one (sequence, fitness) tuple
seq_num = pop_dict[seq.index(parent_name)+1][0][0]

def generate_lib(seq,
                 library_size=library_size, 
                 mutation_rate=mutation_rate, ):
  '''
  Builds a library of mutants from one parent sequence.

  seq           -- parent sequence value
  library_size  -- number of variants to produce
  mutation_rate -- each variant receives a random number of mutations
                   between 0 and round(mutation_rate), inclusive

  Returns a list of (sequence, fitness) tuples.
  '''
  library = []
  for _ in range(library_size):
    variant = seq
    #apply the drawn number of single mutations one after another
    for _step in range(random.randint(0,round(mutation_rate))):
      variant = mutate_sequence(variant, -1,-2)
    library.append((variant, get_fitness(variant)))

  return library

#first-round library: mutants of the chosen parent, built with the form defaults
new_vars = generate_lib(seq_num)

def dist_from_native(wt, mut):
  '''
  Calculates the number of differences between
  wt and mutant sequence.

  Both values are compared as format(value, '.9f') strings, position by
  position, and the number of differing characters is returned.
  '''
  #the unused alias `seq_str` from the original assignment was dropped
  wt_str = format(wt, '.9f')
  mut_str = format(mut, '.9f')

  return sum(c1!=c2 for c1,c2 in zip(wt_str,mut_str))

def plot_lib(evo_lib,wt):
  '''
  Scatter-plots a library: mutations away from wt (horizontal axis)
  against fitness (vertical axis), as red points.

  evo_lib -- list of (sequence, fitness) tuples
  wt      -- reference sequence the distances are measured from
  '''
  distances = [dist_from_native(wt, entry[0]) for entry in evo_lib]
  fitnesses = [entry[1] for entry in evo_lib]

  plt.xlabel("Mutations from WT")
  plt.ylabel("Fitness")
  plt.scatter(distances, fitnesses, c='red')

#show the first-round library relative to the chosen parent
plot_lib(new_vars, seq_num)

In [None]:
#@title Continue more rounds

# define parameters
# Form input; bound as the default number of rounds of new_ev_round below.
num_rounds =  4#@param {type:"number"}

#@markdown -  `num_rounds` How many more rounds to go.

#@markdown Let's repeat this `num_rounds` more times on the top sequence
#@markdown from each round and check the results.

def new_ev_round(evo_lib,num_rounds=num_rounds):
  '''
  Runs further rounds of evolution and returns the last library.

  In every round the fittest sequence of the current library (the first
  one, if several tie) becomes the parent of a fresh library made with
  generate_lib.

  evo_lib    -- starting library, a list of (sequence, fitness) tuples
  num_rounds -- how many rounds to run
  '''
  current = evo_lib
  for _ in range(num_rounds):
    fitnesses = [entry[1] for entry in current]
    #index() returns the first occurrence of the top fitness
    champion = current[fitnesses.index(max(fitnesses))][0]
    current = generate_lib(champion)

  return current
  
#run the extra rounds starting from the first-round library, then plot the
#last library; distances are still measured from the original parent
final_round = new_ev_round(new_vars)
plot_lib(final_round, seq_num)

In [None]:
#@title Checkout the fitness landscape
#@markdown Let's take a look at where in the energy landscape
#@markdown we are (blue dots are sequences from final round).

#@markdown What can we do to improve our method?

#Fitness curve on an evenly spaced grid, evaluated with get_fitness
x = np.linspace(0, 4, num=100)
fx = [get_fitness(x_val) for x_val in x]

#Split the final-round library into sequence and fitness coordinates
x2 = [entry[0] for entry in final_round]
y = [entry[1] for entry in final_round]

plt.xlabel("Sequence")
plt.ylabel("Fitness")
plot(x,fx)
plt.scatter(x2,y,c='blue',s=75)
plt.show()


# Iterative site directed mutagenesis

In the following steps, we will be mimicking ISD to generate more functional proteins. Follow the in-class activity questions and the steps to learn more about this technique.

In [7]:
#@title Choosing starting positions
#@markdown From the previous work, you know that the following
#@markdown locations can be important for activity:
#@markdown - location **A**: A residue in proximity of the substrate binding site
#@markdown - location **B**: A hydrophobic residue at the surface 
#@markdown - location **C**: A site identified with previous experiments to be important

#@markdown choose two sites to consider for ISD:

# Form inputs: the two site labels (keys of loc_dict: 'A', 'B' or 'C')
# that the following ISD cells saturate.
loc1 = 'A'#@param {type:"string"}
loc2 = 'B' #@param {type:"string"}


In [None]:
#@title Running first round of ISD
#@markdown Let's perform the first round of ISD

# Maps each site label to a character index in the format(num, '.9f')
# string that mutate_sequence edits: index 0 is the integer digit, index 1
# the decimal point, so 2 is the first decimal and 7 the sixth.
# saturate() below handles 'A' in its own branch and only looks up the
# other labels here.
loc_dict = {
    'A':0,
    'B':7,
    'C':2,
}


def saturate(seq_num, mut_loc):
  '''
  Saturation mutagenesis of one site.

  seq_num -- sequence value to mutate
  mut_loc -- site label: 'A' tries integer digits 0-3; any other label is
             looked up in loc_dict and tries digits 0-9 at that position

  Returns a list of (sequence, fitness) tuples, one per substitution, in
  order of the substituted digit.
  '''
  if mut_loc == 'A':
    #keep everything after the integer digit and swap in each allowed digit
    tail = format(seq_num, '.9f')[1:]
    variants = [float(str(digit)+tail) for digit in range(4)]
  else:
    position = loc_dict[mut_loc]
    variants = [mutate_sequence(seq_num,position,digit) for digit in range(10)]

  return [(variant, get_fitness(variant)) for variant in variants]
  
p1_lib = saturate(seq_num, loc1)
p2_lib = saturate(seq_num, loc2)

#Report both libraries with the same banner and per-substitution layout
for site, library in ((loc1, p1_lib), (loc2, p2_lib)):
  print ('--------Saturation Mutagenesis at Location {}--------'.format(site))
  for i, entry in enumerate(library):
    print(
      'fitness for substittuion-{} at location-{} is {}'.format(
          i, site, entry[1])
      )

In [None]:
#@title Performing iteration
#@markdown Which substitution would you like to use
#@markdown for the next round of site saturation mutagenesis?

# Form inputs: which first-round substitution at each location is carried
# into the second round of saturation.
s1_2 = 1 #@param {type:"number"}
s2_1 = 1 #@param {type:"number"}

#@markdown - `s1_2` Substitution in location 1 to be mutated at location 2.
#@markdown - `s2_1` Substitution in location 2 to be mutated at location 1.

#The first round prints substitutions numbered from 0, so the chosen number
#is used directly as the library index (the previous `s-1` picked a
#different variant than the one the student selected, and 0 wrapped around
#to the last entry).
for choice_name, choice, library in (('s1_2', s1_2, p1_lib), ('s2_1', s2_1, p2_lib)):
  if choice != int(choice) or not 0 <= choice < len(library):
    raise ValueError(
        '{} must be a whole number between 0 and {}'.format(
            choice_name, len(library)-1)
        )

seq1 = p1_lib[int(s1_2)][0]
seq2 = p2_lib[int(s2_1)][0]

#second round: saturate the *other* location on each chosen sequence
p1_2 = saturate(seq1, loc2)
p2_1 = saturate(seq2, loc1)

print ('--------SDM of location {} on chosen sequence of location {}--------'.format(loc2, loc1))
for i in range(len(p1_2)):
  #p1_2 was saturated at loc2, so the substitution is reported at loc2
  print(
    'fitness for substittuion-{} at location-{} is {}'.format(
        i, loc2, p1_2[i][1])
    )
print ('--------SDM of location {} on chosen sequence of location {}--------'.format(loc1, loc2))

for i in range(len(p2_1)):
  #p2_1 was saturated at loc1, so the substitution is reported at loc1
  print(
    'fitness for substittuion-{} at location-{} is {}'.format(
        i, loc1, p2_1[i][1])
    )

In [None]:
#@title Analyzing results
#@markdown let's take a look at the trajectory of changes
#@markdown at each round of SDM.
#@markdown Plot color code:
#@markdown - square: first round
#@markdown - circle: second round
#@markdown - shades of blue: loc 1 --> loc 2
#@markdown - shades of purple: loc2 --> loc 1

#@markdown How do you compare ISD to random mutagenesis?

#Fitness curve on an evenly spaced grid, evaluated with get_fitness
x = np.linspace(0, 4, num=100)
fx = [get_fitness(x_val) for x_val in x]


def get_x_y(evo_lib):
  '''
  Splits a library of (sequence, fitness) tuples into two lists.

  Returns (sequences, fitnesses), each in library order.
  '''
  sequences = [entry[0] for entry in evo_lib]
  fitnesses = [entry[1] for entry in evo_lib]
  return sequences, fitnesses

x1, y1 = get_x_y(p1_lib)
x1_2, y1_2 = get_x_y(p1_2)
x2, y2 = get_x_y(p2_lib)
x2_1, y2_1 = get_x_y(p2_1)

#One figure per trajectory: (first round, second round) as (x, y, colour)
panels = (
    ((x1, y1, 'cyan'), (x1_2, y1_2, 'blue')),
    ((x2, y2, 'magenta'), (x2_1, y2_1, 'purple')),
)

for first_round, second_round in panels:
  plt.xlabel("Sequence")
  plt.ylabel("Fitness")
  plot(x,fx)
  #large squares: first round; default circles: second round
  plt.scatter(first_round[0],first_round[1],c=first_round[2],s=150, marker='s')
  plt.scatter(second_round[0],second_round[1],c=second_round[2],s=75)
  plt.show()

# Gene shuffling
The following steps take you through a mimic of gene shuffling for improving your protein's activity.

Walk through these steps and the in-class activity, and answer the questions.

In [None]:
#@title Starting population
#@markdown By running this cell, you'll see the fitness distribution 
#@markdown of your original population (default size = 25). 
#@markdown Let's take a look at starting fitnesses.

# Form input: how many parent sequences to collect for shuffling.
orig_pop = 25 #@param {type:"number"}

#@markdown `orig_pop` : Size of original population. Should be < 500.


population = []

#candidate sequences, drawn uniformly from [0, 4)
init_x=[random.random()*4 for _ in range(5000)]

for candidate in init_x:
  if len(population) >= orig_pop:
    break
  #evaluate once and keep parents that are functional but improvable
  fitness = get_fitness(candidate)
  if 0 < fitness < 7.5:
    population.append((candidate, fitness))

if len(population) < orig_pop:
  #make a short population visible instead of silently continuing
  print('Warning: only {} of the requested {} parents were found.'.format(
      len(population), orig_pop))

x1,y1 = get_x_y(population)
#a histogram puts the binned values (fitness) on x and the counts on y
plt.xlabel("Fitness")
plt.ylabel("Number of sequences")
plt.hist(y1)

In [12]:
#@title Shuffling
#@markdown Let's shuffle our genes. 
#@markdown There are two parameters you can set below:

# Form inputs; bound as defaults of break_seq (shuff_rate) and
# combine_frag (pop_size) below.
shuff_rate = 3 #@param {type:"number"}
pop_size = 1000 #@param {type:"number"}

#@markdown - `shuff_rate` Max number of cuts in gene
#@markdown - `pop_size` Final population size

#@markdown Let's run this cell to generate 
#@markdown our shuffled population.

def break_seq(seq, shuff_rate=shuff_rate):
  '''
  Cuts a sequence into fragments at random positions.

  seq        -- sequence value, handled as the string format(seq, '.9f')
  shuff_rate -- number of cut positions drawn (each from 2-7); repeated
                draws collapse, so this is the maximum number of cuts

  Returns a list of (end_position, value) tuples. Each fragment keeps its
  digits at their original decimal places (earlier places are zero-filled),
  so the fragment values add back up to the sequence. The trailing
  fragment is labelled with position 10. With no cuts the list is empty.
  '''
  digits = format(seq, '.9f')
  cuts = sorted({random.randint(2,7) for _ in range(shuff_rate)})

  fragments = []
  previous = 0
  for order, cut in enumerate(cuts):
    if order == 0:
      #leading fragment: integer digit plus any decimals before the cut
      fragments.append((cut, float(digits[:cut])))
    else:
      #zero-fill the decimal places that belong to earlier fragments
      padding = '0.' + '0'*(previous-2)
      fragments.append((cut, float(padding+digits[previous:cut])))
    previous = cut

  if cuts:
    #everything after the last cut forms the trailing fragment
    padding = '0.' + '0'*(cuts[-1]-2)
    fragments.append((10, float(padding+digits[cuts[-1]:])))

  return fragments

def combine_frag(frag_dict, pop_size=pop_size):
  '''
  Assembles shuffled sequences from fragment pools.

  frag_dict -- dict with keys 'p1'-'p4', each a list of fragment values
  pop_size  -- number of sequences to assemble

  Each new sequence is the sum of one randomly picked fragment from every
  pool. Returns a list of (sequence, fitness) tuples.
  '''
  def pick(pool_name):
    '''Draws one fragment at random from the named pool.'''
    pool = frag_dict[pool_name]
    return pool[random.randint(0,len(pool)-1)]

  shuffled = []
  for _ in range(pop_size):
    #draw in pool order p1..p4 and add in that same order
    first, second, third, fourth = [pick(name) for name in ('p1','p2','p3','p4')]
    child = first+second+third+fourth
    shuffled.append((child, get_fitness(child)))

  return shuffled

frag_dict ={
    'p1':[],
    'p2':[],
    'p3':[],
    'p4':[],
}

#Shuffle the parents collected in `population`. The previous loop iterated
#over `x`, which at this point is the 100-point plotting grid left over from
#an earlier cell, so the selected parents never entered the shuffle.
#The loop variable is not called `seq`, to leave the global parent-name
#list from the random mutagenesis section intact.
for parent_seq, _parent_fitness in population:
  for _cut_position, frag_value in break_seq(parent_seq):
    #sort fragments into pools by order of magnitude
    if frag_value > 0.5:
      frag_dict['p1'].append(frag_value)
    elif frag_value > 0.005:
      frag_dict['p2'].append(frag_value)
    elif frag_value > 0.00005:
      frag_dict['p3'].append(frag_value)
    else:
      frag_dict['p4'].append(frag_value)

#combine_frag draws one fragment from every pool, so none may be empty
empty_pools = [name for name in frag_dict if not frag_dict[name]]
if empty_pools:
  raise ValueError(
      'No fragments in pool(s) {}; use a larger starting population '
      'or a higher shuff_rate.'.format(', '.join(empty_pools))
      )

evo_lib = combine_frag(frag_dict)






In [None]:
#@title Analyzing the results
#@markdown Let's take a look at the results of our shuffling.
#@markdown - Blue circles: Shuffled population
#@markdown - magenta squares: Original parents

#@markdown What is your conclusion?

#Fitness curve on an evenly spaced grid, evaluated with get_fitness so the
#landscape formula lives in one place
x = np.linspace(0, 4, num=100)
fx = [get_fitness(x_val) for x_val in x]

#Split the shuffled library into sequence and fitness coordinates
x2 = [entry[0] for entry in evo_lib]
y2 = [entry[1] for entry in evo_lib]

plt.xlabel("Sequence")
plt.ylabel("Fitness")
plot(x,fx)
plt.scatter(x2,y2,c='blue',s=75)
#square marker so the parents match the "magenta squares" description of
#this cell (the previous 'D' marker draws diamonds)
plt.scatter(x1,y1,c='magenta',s=50, marker='s')
plt.show()