# Generating LacI constructs

(c) 2020 Tom Röschinger. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

In [1]:
import wgregseq
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import copy

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource

import bokeh.io

bokeh.io.output_notebook()

Load the wild type sequences and the energy matrix from Barnes 2019.

In [2]:
O1 = 'AATTGTGAGCGGATAACAATT'
lacUV5 = 'TCGAGTTTACACTTTATGCTTCCGGCTCGTATAATGTGTGG'

lacUV5_O1 = lacUV5 + O1

O1_matrix = np.array([[ 0.      ,  0.032177,  0.009131,  0.091165],
       [ 0.      ,  0.114247,  0.071831,  0.03971 ],
       [-0.000628,  0.064351,  0.086849,  0.      ],
       [ 0.20005 ,  0.177245,  0.230581,  0.      ],
       [ 0.425216,  0.447047,  0.      ,  0.485461],
       [ 0.326155,  0.334896,  0.211105,  0.      ],
       [ 0.359728,  0.359606,  0.      ,  0.332162],
       [ 0.      ,  0.401932,  0.351278,  0.382362],
       [ 0.199617,  0.140798,  0.      ,  0.306306],
       [ 0.385491,  0.      ,  0.208968,  0.422716],
       [ 0.363034,  0.485714,  0.      ,  0.510156],
       [ 0.088837,  0.004424,  0.      ,  0.033584],
       [ 0.      ,  0.135131,  0.018889,  0.217285],
       [ 0.202876,  0.065591,  0.232629,  0.      ],
       [ 0.      ,  0.003501,  0.100287,  0.114927],
       [ 0.      ,  0.067364,  0.150227,  0.166777],
       [ 0.181258,  0.      ,  0.16264 ,  0.210426],
       [ 0.      ,  0.149875,  0.237717,  0.187568],
       [ 0.      ,  0.107793,  0.143145,  0.029033],
       [-0.066676,  0.082292,  0.070096,  0.      ],
       [-0.001548, -0.063899,  0.018943,  0.      ]])

Import the dictionary which transforms DNA into integers.

In [3]:
seq_dict, _ = wgregseq.choose_dict("dna")

Let's define a function which evaluates the energy matrix for a sequence.

In [4]:
def energy_from_sequence(sequence, matrix):
    seq_list = list(sequence.upper())
    num_seq = [seq_dict[x] for x in seq_list]
    energy = sum([matrix[i, num_seq[i]] for i in range(len(sequence))])
    return energy

This energy matrix gives energy contributions compared to wildtype, therefore the wild type sequence should have an energy of 0.

In [5]:
energy_from_sequence(O1, O1_matrix)

0.0

Now we can generate mutants. Therefore we generate all single and double mutants, as well as 10000 triple mutants using the function which creates all possible mutants and then chooses, to prevent duplicates.

In [6]:
mutants_single = wgregseq.mutations_det(O1, mut_per_seq=1)
mutants_double = wgregseq.mutations_det(O1, mut_per_seq=2)
mutants_triple = wgregseq.mutations_det(O1, mut_per_seq=3, num_mutants=10000)



In [7]:
import itertools
mutants = []

mutants = [(j, i) for i in range(3) for j in range(3)]

somelists = 2 * [mutants]
elements = np.array(list(itertools.product(*somelists)))


For higher order mutants we don't have to worried about duplicates and can randomly generate mutants (while keeping the number of mutations fixed).

In [8]:
mutants_quadruple = wgregseq.mutations_rand(O1, rate=0.2, num_mutants=10000, number_fixed=True)
mutants_quintuple = wgregseq.mutations_rand(O1, rate=0.25, num_mutants=10000, number_fixed=True)
mutants_sextuple = wgregseq.mutations_rand(O1, rate=0.3, num_mutants=10000, number_fixed=True)

Let's write all the mutants into a dataframe. We exclude the single mutants for now, since we want to use all of those anyways, so we add them back to the pool in the end.

In [9]:
df_1 = pd.DataFrame({"seq": mutants_single, "mutations": 1})
df_2 = pd.DataFrame({"seq": mutants_double, "mutations": 2})
df_3 = pd.DataFrame({"seq": mutants_triple, "mutations": 3})
df_4 = pd.DataFrame({"seq": mutants_quadruple, "mutations": 4})
df_5 = pd.DataFrame({"seq": mutants_quintuple, "mutations": 5})
df_6 = pd.DataFrame({"seq": mutants_sextuple, "mutations": 6})
df = pd.concat([df_2, df_3, df_4, df_5, df_6], ignore_index=True)

Now we can compute the difference in energy matrix for every mutant and add it to the dataframe.

In [10]:
df["energy"] = df['seq'].apply(energy_from_sequence, args= (O1_matrix, ))
df.head()

Unnamed: 0,seq,mutations,energy
0,ccTTGTGAGCGGATAACAATT,2,0.146424
1,cAaTGTGAGCGGATAACAATT,2,0.031549
2,cATaGTGAGCGGATAACAATT,2,0.232227
3,cATTaTGAGCGGATAACAATT,2,0.457393
4,cATTGaGAGCGGATAACAATT,2,0.358332


Let's have a look on how the mutants are distributed.

In [11]:
p = figure(x_axis_label="mutations", y_axis_label="ΔE [k_BT]")
ColumnDataSource(df)
p.scatter(x='mutations', y='energy', source=df)
bokeh.io.show(p)

To choose mutants, we can select bins of energy differences, here is an example.

In [12]:
bins = [(-0.05, 0.05), (0.4, 0.5), (0.85, .95)]
for (b1, b2) in bins:
    p.line(x=[2, 6], y=[b1, b1], color="orange")
    p.line(x=[2, 6], y=[b2, b2], color="orange")
    p.varea(x=[2, 6], y1=[b1, b1], y2=[b2, b2], alpha=0.2, color="orange")
    
bokeh.io.show(p)

Select mutants that fall within the bins.

In [13]:
df_list = []
for i, (x,y) in enumerate(bins):
    temp_df = copy.deepcopy(df.loc[[x < E < y for E in df["energy"] ], :])
    temp_df["bin"] = np.ones(len(temp_df), dtype=int) * i
    df_list.append(temp_df)
    
binned_df = pd.concat(df_list, ignore_index=True)
binned_df.head(10)

Unnamed: 0,seq,mutations,energy,bin
0,cAaTGTGAGCGGATAACAATT,2,0.031549,0
1,cATTGTGAGCGGATcACAATT,2,0.035678,0
2,cATTGTGAGCGGATAACAAaT,2,-0.034499,0
3,cATTGTGAGCGGATAACAATa,2,0.030629,0
4,cATTGTGAGCGcATAACAATT,2,0.036601,0
5,cATTGTGAGCGGATAACAATc,2,-0.031722,0
6,AcTTGTGAGCGGATAACAAaT,2,0.047571,0
7,cAaTGTGAGCGGATAACAATT,2,0.031549,0
8,AAaTGTGAGCGGATcACAATT,2,0.002873,0
9,AAaTGTGAGCGGATAACAATa,2,-0.002176,0


Now we need to select mutants from the bins. We want to try to get an equal number of sequences for each number of mutations per bin. However, some bins are sparsely populated by some type of mutants. Therefore we choose a maximal number of sequences per mutation type per bin. If there are more sequences in a bin, we randomly select sequences.

In [14]:
def select_seqs(df, ind_bin, num_seqs):
    seqs_per_mut = np.floor(num_seqs / len(df.mutations.unique()))
    rest = num_seqs - seqs_per_mut * len(df.mutations.unique())
    count_df = df.groupby(["mutations", "bin"]).size().to_frame(name="num_mutants").reset_index()
    count_df = count_df.loc[count_df["bin"] == ind_bin, :]
    
    ret_df = pd.DataFrame(columns=["seq", "mutations", "energy", "bin"])
    
    for i in df.mutations.unique():
        if count_df.loc[count_df["mutations"] == i, "num_mutants"].values < seqs_per_mut:
            ret_df = pd.concat([ret_df, df.loc[(df["mutations"] == i) & (df["bin"] == ind_bin), :]], ignore_index=True)
        else:
            indices = df.loc[(df["mutations"] == i) & (df["bin"] == ind_bin), :].index.to_numpy(dtype=int)
            selected_indices = np.random.choice(indices, size=int(seqs_per_mut), replace=False)
            ret_df = pd.concat([ret_df, df.iloc[selected_indices]])
    return ret_df

Now we only need to apply the function to each bin and collect sequences. 

In [15]:
O1_mutants_df_list = []
for Bin in range(len(bins)):
    O1_mutants_df_list.append(select_seqs(binned_df, 0, 300))

O1_mutants_df = pd.concat(O1_mutants_df_list)
O1_mutants_df.head()

Unnamed: 0,seq,mutations,energy,bin
0,AtaTGTGAGCGGATAACAATT,2,0.039082,0
1,AATTGTGAGCGtATAACAATc,2,-0.030315,0
2,AATTGTGAGCGcATAACAATa,2,0.002876,0
3,AATTGTGAGCGGATcACAATg,2,0.022444,0
4,AATTGTGAGCGGATAACAtTg,2,0.047976,0


Let's see how many sequences we have.

In [16]:
len(O1_mutants_df)

633

Finally, we add all single mutants back to the oligo pool.

In [17]:
df_1["energy"] = df_1['seq'].apply(energy_from_sequence, args= (O1_matrix, ))
df_1["bin"] = "x"
O1_mutants_df = pd.concat([O1_mutants_df, df_1])

Now we only need to add the lacUV5 sequence to each mutant to get the final constructs.

In [18]:
oligos = copy.deepcopy(O1_mutants_df)

oligos.seq = [lacUV5 + seq for seq in oligos.seq]
oligos.head()

Unnamed: 0,seq,mutations,energy,bin
0,TCGAGTTTACACTTTATGCTTCCGGCTCGTATAATGTGTGGAtaTG...,2,0.039082,0
1,TCGAGTTTACACTTTATGCTTCCGGCTCGTATAATGTGTGGAATTG...,2,-0.030315,0
2,TCGAGTTTACACTTTATGCTTCCGGCTCGTATAATGTGTGGAATTG...,2,0.002876,0
3,TCGAGTTTACACTTTATGCTTCCGGCTCGTATAATGTGTGGAATTG...,2,0.022444,0
4,TCGAGTTTACACTTTATGCTTCCGGCTCGTATAATGTGTGGAATTG...,2,0.047976,0


## Computational environment

In [19]:
%load_ext watermark
%watermark -v -p numpy,pandas,wgregseq,bokeh

CPython 3.8.5
IPython 7.10.0

numpy 1.18.1
pandas 1.0.3
wgregseq 0.0.1
bokeh 2.0.2
