In [1]:
import pandas as pd
import numpy as np

In [2]:
# import dataset into a pandas data frame

df = pd.read_csv('../41467_2022_32829_MOESM5_ESM.csv')
df.head()

Unnamed: 0,ID,DNA1 [counts],DNA2 [counts],DNA3 [counts],RNA1 [counts],RNA2 [counts],RNA3 [counts],TX1 [au],TX2 [au],TX3 [au],...,high quality,Observed log(TX/Txref),Predicted log(TX/Txref),dG10,dG35,dGDisc,dGITR,dGEXT10,dGSPAC,dGUP
0,0,8263,7261,5173,16341,10320,13506,2.258071,1.523795,1.545541,...,Yes,-3.386326,-3.844827,-1.781524,-1.477218,-0.106428,-0.021112,0.191352,-0.0924,0.400862
1,1,5600,4886,3264,10986,7250,10800,2.240001,1.590845,1.958709,...,Yes,-3.50314,-3.905283,-1.781524,-1.477218,-0.166884,-0.021112,0.191352,-0.0924,0.400862
2,2,7898,6790,4752,19572,32204,30585,2.829533,5.084911,3.810029,...,Yes,-4.207206,-3.905283,-1.781524,-1.477218,-0.166884,-0.021112,0.191352,-0.0924,0.400862
3,3,10651,9875,6466,15734,16246,18908,1.686729,1.763814,1.731036,...,Yes,-3.392439,-3.877808,-1.781524,-1.477218,-0.139409,-0.021112,0.191352,-0.0924,0.400862
4,4,12188,10793,6965,28609,21796,26803,2.680198,2.1651,2.278025,...,Yes,-3.698903,-3.672384,-1.781524,-1.477218,0.066015,-0.021112,0.191352,-0.0924,0.400862


In [3]:
# All input and output data

X = df[['UP', 'h35', 'spacs', 'h10', 'disc', 'ITR']]
y = df['Observed log(TX/Txref)']

X.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR
0,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CCCCGCGG,CTCTACCTTAGTTTGTACGTT
1,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGGCGG,CTCTACCTTAGTTTGTACGTT
2,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGCCCG,CTCTACCTTAGTTTGTACGTT
3,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,GCGGCGGC,CTCTACCTTAGTTTGTACGTT
4,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGGGGGGC,CTCTACCTTAGTTTGTACGTT


In [37]:
# remove all rows with spacer sequences that are not 16-18 nucleotides long


_df = df[(df['spacs'].str.len() >= 15) & (df['spacs'].str.len() <= 19)]

_df = _df[['UP', 'h35', 'spacs', 'h10', 'disc', 'ITR', 'Observed log(TX/Txref)', 'Predicted log(TX/Txref)']]


X = _df[['UP', 'h35', 'spacs', 'h10', 'disc', 'ITR']]
y = _df['Observed log(TX/Txref)']

print(f'Removed {df.shape[0] - _df.shape[0]} rows')


Removed 140 rows


In [5]:
# Function to one-hot encode DNA sequences, including padding 0's

def padded_one_hot_encode(sequence):
    mapping = {'A': [1,0,0,0], 'C': [0,1,0,0], 'G': [0,0,1,0], 'T': [0,0,0,1], '0': [0,0,0,0]}
    encoding = []
    for nucleotide in sequence:
         encoding += [mapping[nucleotide]]
    return encoding

In [6]:
upstream_padding = {}

for col in X.columns:
    max_len = X[col].apply(len).max()
    upstream_padding[col] = np.array([padded_one_hot_encode('0' * (max_len - len(seq)) + seq) for seq in X[col]])

# Concatenate the one-hot encoded, upstream-padded sequences
X = np.concatenate([upstream_padding[col] for col in X.columns], axis=1)

In [7]:
X[0]

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0,

In [8]:
from keras.saving import load_model

model = load_model('Hyperparameter_tuned.keras')

In [9]:
print(model.summary())

None


In [10]:
def format(str):
    return str.replace(" ", "").upper()

In [11]:
print(format("A G A G A C C A G C G G C A T G C A T T T A C G T T G A C A C C A T C G A A T G G T G C A A A A C C T T T C G C G G T A T"))

AGAGACCAGCGGCATGCATTTACGTTGACACCATCGAATGGTGCAAAACCTTTCGCGGTAT


In [12]:
def encode_sequence(dataframe):
    encoded_seq = []

    lengths = {
        'UP': 22,
        'h35': 6,
        'spacs': 19,
        'h10': 6,
        'disc': 8,
        'ITR': 21
    }

    for col in dataframe.columns:
        max_len = lengths[col]
        encoded_seq += [padded_one_hot_encode('0' * (max_len - len(seq)) + seq) for seq in dataframe[col]]

    encoded_dataframe = np.array([np.concatenate([each for each in encoded_seq])])

    return encoded_dataframe

## Making Predictions

#### Prediction for promoter lac

In [13]:
UP = 'AATACTAGAGGTCTTCCGAC'
h35 = 'TTTACA'
spacs = 'CTTTATGCTTCCGGCTCG'
h10 = 'TATGTT'
disc = 'GTGTGG'
ITR = 'GCGGGAAGACAACTAGGGGC'

pClone_promoter_lac = pd.DataFrame({'UP': [format(UP)], 'h35': [format(h35)], 'spacs': [format(spacs)], 'h10': [format(h10)], 'disc': [format(disc)], 'ITR': [format(ITR)]})
pClone_promoter_lac.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR
0,AATACTAGAGGTCTTCCGAC,TTTACA,CTTTATGCTTCCGGCTCG,TATGTT,GTGTGG,GCGGGAAGACAACTAGGGGC


In [14]:
encoded_pClone_promoter_lac = encode_sequence(pClone_promoter_lac)

In [15]:
print(encoded_pClone_promoter_lac)

[[[0 0 0 0]
  [0 0 0 0]
  [1 0 0 0]
  [1 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [0 1 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [0 0 1 0]
  [1 0 0 0]
  [0 0 1 0]
  [0 0 1 0]
  [0 0 0 1]
  [0 1 0 0]
  [0 0 0 1]
  [0 0 0 1]
  [0 1 0 0]
  [0 1 0 0]
  [0 0 1 0]
  [1 0 0 0]
  [0 1 0 0]
  [0 0 0 1]
  [0 0 0 1]
  [0 0 0 1]
  [1 0 0 0]
  [0 1 0 0]
  [1 0 0 0]
  [0 0 0 0]
  [0 1 0 0]
  [0 0 0 1]
  [0 0 0 1]
  [0 0 0 1]
  [1 0 0 0]
  [0 0 0 1]
  [0 0 1 0]
  [0 1 0 0]
  [0 0 0 1]
  [0 0 0 1]
  [0 1 0 0]
  [0 1 0 0]
  [0 0 1 0]
  [0 0 1 0]
  [0 1 0 0]
  [0 0 0 1]
  [0 1 0 0]
  [0 0 1 0]
  [0 0 0 1]
  [1 0 0 0]
  [0 0 0 1]
  [0 0 1 0]
  [0 0 0 1]
  [0 0 0 1]
  [0 0 0 0]
  [0 0 0 0]
  [0 0 1 0]
  [0 0 0 1]
  [0 0 1 0]
  [0 0 0 1]
  [0 0 1 0]
  [0 0 1 0]
  [0 0 0 0]
  [0 0 1 0]
  [0 1 0 0]
  [0 0 1 0]
  [0 0 1 0]
  [0 0 1 0]
  [1 0 0 0]
  [1 0 0 0]
  [0 0 1 0]
  [1 0 0 0]
  [0 1 0 0]
  [1 0 0 0]
  [1 0 0 0]
  [0 1 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [0 0 1 0]
  [0 0 1 0]
  [0 0 1 0]
  [0 0 1 0]
  [0 1 0 0]]]


In [16]:
prediction = model.predict(encoded_pClone_promoter_lac)

print(f'Prediction: {prediction[0][0]}') 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398ms/step
Prediction: -3.702486515045166


#### Prediction for P5

In [17]:
UP = 'AATACTAGAGGTCTTCCGAC'
h35 = 'TTGACA'
spacs = 'ATTAATCATCCGGCTCG'
h10 = 'TAATTT'
disc = 'ATGTGG'
ITR = 'GCGGGAAGACAACTAGGGGC'

pClone_red_p5 = pd.DataFrame({'UP': [format(UP)], 'h35': [format(h35)], 'spacs': [format(spacs)], 'h10': [format(h10)], 'disc': [format(disc)], 'ITR': [format(ITR)]})
pClone_red_p5.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR
0,AATACTAGAGGTCTTCCGAC,TTGACA,ATTAATCATCCGGCTCG,TAATTT,ATGTGG,GCGGGAAGACAACTAGGGGC


In [18]:
encoded_pClone_red_p5 = encode_sequence(pClone_red_p5)

In [19]:
prediction = model.predict(encoded_pClone_red_p5)

print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Prediction: -3.9654335975646973


#### Prediction for combination: promoter lac h10, h35; P5 for the rest

In [20]:
UP = 'AATACTAGAGGTCTTCCGAC'
h35 = 'TTTACA'
spacs = 'ATTAATCATCCGGCTCG'
h10 = 'TATGTT'
disc = 'ATGTGG'
ITR = 'GCGGGAAGACAACTAGGGGC'

combination = pd.DataFrame({'UP': [format(UP)], 'h35': [format(h35)], 'spacs': [format(spacs)], 'h10': [format(h10)], 'disc': [format(disc)], 'ITR': [format(ITR)]})
combination.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR
0,AATACTAGAGGTCTTCCGAC,TTTACA,ATTAATCATCCGGCTCG,TATGTT,ATGTGG,GCGGGAAGACAACTAGGGGC


In [21]:
encoded_combination = encode_sequence(combination)

In [22]:
prediction = model.predict(encoded_combination)

print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Prediction: -3.996711492538452


## LaFleurs Data 

#### This is the sequence closest to the promoter lac in the Lafleur Data. We are testing what the value is in our ANN.

In [23]:
UP = 'TTTTCTATCTACGTAC'
h35 = 'TTGACA'
spacs = 'CTATTTCCTATTTCTCT'
h10 = 'TATGTT'
disc = 'TTGTGTT'
ITR = 'CTCTACCTTAGTTTGTACGTT'

pClone_red_p5 = pd.DataFrame({'UP': [format(UP)], 'h35': [format(h35)], 'spacs': [format(spacs)], 'h10': [format(h10)], 'disc': [format(disc)], 'ITR': [format(ITR)]})
pClone_red_p5.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR
0,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATGTT,TTGTGTT,CTCTACCTTAGTTTGTACGTT


In [24]:
encoded_pClone_red_p5 = encode_sequence(pClone_red_p5)

In [25]:
prediction = model.predict(encoded_pClone_red_p5)

print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction: -3.5791800022125244


#### This is the sequence closest to p5 in the Lafleur Data. We are testing what the value is in our ANN.

In [26]:
UP = 'TTTTCTATCTACGTAC'
h35 = 'TTGACA'
spacs = 'CTATTTCCTATTTCTCT'
h10 = 'TAATTT'
disc = 'TTGTGTT'
ITR = 'CTCTACCTTAGTTTGTACGTT'

pClone_red_p5 = pd.DataFrame({'UP': [format(UP)], 'h35': [format(h35)], 'spacs': [format(spacs)], 'h10': [format(h10)], 'disc': [format(disc)], 'ITR': [format(ITR)]})
pClone_red_p5.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR
0,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TAATTT,TTGTGTT,CTCTACCTTAGTTTGTACGTT


In [27]:
encoded_pClone_red_p5 = encode_sequence(pClone_red_p5)

In [28]:
prediction = model.predict(encoded_pClone_red_p5)

print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Prediction: -3.2705254554748535


## Finding Goldilocks sequences
Using the same UP and ITR elements; 36 bp promoter sequence; where TX is less than -4.0, around -3.3, and greater than -2.9 for Observed_LaFleur, Prediction_RSG, and Prediction_LaFleur.

In [233]:
goldilocks_df = pd.DataFrame(columns=['Name', 'Sequence', 'Length', 'Observed_LaFleur', 'Prediction_RSG', 'Prediction_LaFleur'])

In [234]:
goldilocks_df.loc[0] = ['P5', 'TTGACAATTAATCATCCGGCTCGTAATTTATGTGGA', 36, -2.859671, -4.20, -2.966256]
goldilocks_df.loc[1] = ['plac', 'TTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG', 36, -3.809935, -3.89, -3.147286]

goldilocks_df.head()

Unnamed: 0,Name,Sequence,Length,Observed_LaFleur,Prediction_RSG,Prediction_LaFleur
0,P5,TTGACAATTAATCATCCGGCTCGTAATTTATGTGGA,36,-2.859671,-4.2,-2.966256
1,plac,TTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,36,-3.809935,-3.89,-3.147286


### Lower end < -4.0

In [235]:
lower_end_df = _df[(_df['Observed log(TX/Txref)'] < -4.4) & (_df['Predicted log(TX/Txref)'] < -4)]


lower_end_df.head(None)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref)
10881,TTTTCTATCTACGTAC,TTGACA,TTTTTTTTTTTTTTTTT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-4.440532,-4.091806
11087,TATACTTTTTAATAATAATA,TTGACA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-4.439033,-4.037517
11091,ATTTAATATATTTAATTATA,TTGACA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-4.40556,-4.068421


In [236]:
# Lower end of TX, row 10881

UP = 'TATACTTTTTAATAATAATA'
h35 = 'TTGACA'
spacs = 'CTATTTCCTATTTCTCT'
h10 = 'TATAAT'
disc = 'TTGTGTT'
ITR = 'CTCTACCTTAGTTTGTACGTT'

prediction = model.predict(encode_sequence(pd.DataFrame({'UP': [format(UP)],
                                                         'h35': [format(h35)],
                                                         'spacs': [format(spacs)],
                                                         'h10': [format(h10)],
                                                         'disc': [format(disc)],
                                                         'ITR': [format(ITR)]})))
print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Prediction: -3.820117950439453


In [237]:
# Lower end of TX, row 10881 (h35, spacs, h10, disc), ligated (UP, ITR)

UP = 'AATACTAGAGGTCTTCCGAC'
h35 = 'TTGACA'
spacs = 'CTATTTCCTATTTCTCT'
h10 = 'TATAAT'
disc = 'TTGTGTT'
ITR = 'GCGGGAAGACAACTAGGGGC'

prediction = model.predict(encode_sequence(pd.DataFrame({'UP': [format(UP)],
                                                         'h35': [format(h35)],
                                                         'spacs': [format(spacs)],
                                                         'h10': [format(h10)],
                                                         'disc': [format(disc)],
                                                         'ITR': [format(ITR)]})))
print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Prediction: -4.111618995666504


In [238]:
goldilocks_df.loc[3] = ['Lowest TX',
                        'TTGACACTATTTCCTATTTCTCTTATAATTTGTGTT',
                        len('TTGACACTATTTCCTATTTCTCTTATAATTTGTGTT'),
                        _df.loc[10881, 'Observed log(TX/Txref)'],
                        prediction[0][0],
                        _df.loc[10881, 'Predicted log(TX/Txref)']]

goldilocks_df.head()

Unnamed: 0,Name,Sequence,Length,Observed_LaFleur,Prediction_RSG,Prediction_LaFleur
0,P5,TTGACAATTAATCATCCGGCTCGTAATTTATGTGGA,36,-2.859671,-4.2,-2.966256
1,plac,TTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,36,-3.809935,-3.89,-3.147286
3,Lowest TX,TTGACACTATTTCCTATTTCTCTTATAATTTGTGTT,36,-4.440532,-4.111619,-4.091806


### Middle Value ~ -2.3

In [239]:
# import dataset into a pandas data frame

predictions_df = pd.read_csv('../predictions.csv')
predictions_df.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref),Our Prediction
0,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CCCCGCGG,CTCTACCTTAGTTTGTACGTT,-3.386326,-3.844827,-4.049288
1,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGGCGG,CTCTACCTTAGTTTGTACGTT,-3.50314,-3.905283,-4.041058
2,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGCCCG,CTCTACCTTAGTTTGTACGTT,-4.207206,-3.905283,-4.135313
3,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,GCGGCGGC,CTCTACCTTAGTTTGTACGTT,-3.392439,-3.877808,-4.030675
4,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGGGGGGC,CTCTACCTTAGTTTGTACGTT,-3.698903,-3.672384,-3.941935


In [252]:
goal = -2.3
tol = 0.02

mid_end_df = predictions_df[(predictions_df['Observed log(TX/Txref)'] < goal+tol) &
                         (predictions_df['Observed log(TX/Txref)'] > goal-tol) &
                         (predictions_df['Predicted log(TX/Txref)'] < goal+tol) &
                         (predictions_df['Predicted log(TX/Txref)'] > goal-tol) &
                         (predictions_df['Our Prediction'] < goal+tol) &
                         (predictions_df['Our Prediction'] > goal-tol)]


mid_end_df.head(None)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref),Our Prediction
6830,TTTTCTATCTACGTAC,AATAGT,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.311855,-2.291479,-2.289752
6905,TTTTCTATCTACGTAC,AAGTTA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.28303,-2.291479,-2.284124
6910,TTTTCTATCTACGTAC,AAGTGT,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.309459,-2.316903,-2.287786
6968,TTTTCTATCTACGTAC,AACTTA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.293051,-2.311727,-2.280465


In [241]:
# Middle value for TX, row 13344 (h35, spacs, h10, disc), ligated (UP, ITR)

UP = 'AATACTAGAGGTCTTCCGAC'
h35 = 'CGGCTA'
spacs = 'TTTCCTATTTCTGTAA'
h10 = 'GAGCGC'
disc = 'TCGTGTT'
ITR = 'GCGGGAAGACAACTAGGGGC'

prediction = model.predict(encode_sequence(pd.DataFrame({'UP': ['AATACTAGAGGTCTTCCGAC'],
                                                         'h35':  predictions_df.loc[6830, 'h35'],
                                                         'spacs': predictions_df.loc[6830, 'spacs'],
                                                         'h10':predictions_df.loc[6830, 'h10'],
                                                         'disc': predictions_df.loc[6830, 'disc'],
                                                         'ITR': ['GCGGGAAGACAACTAGGGGC']
                                                         })))
print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction: -4.165507793426514


In [242]:
len('AATAGTCTATTTCCTATTTCTCTTATAATTTGTGTT')

36

In [243]:
goldilocks_df.loc[4] = ['Middle TX',
                        'CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT',
                        len('CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT'),
                        predictions_df.loc[6830, 'Observed log(TX/Txref)'],
                        prediction[0][0],
                        predictions_df.loc[6830, 'Predicted log(TX/Txref)']]

goldilocks_df.head()

Unnamed: 0,Name,Sequence,Length,Observed_LaFleur,Prediction_RSG,Prediction_LaFleur
0,P5,TTGACAATTAATCATCCGGCTCGTAATTTATGTGGA,36,-2.859671,-4.2,-2.966256
1,plac,TTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,36,-3.809935,-3.89,-3.147286
3,Lowest TX,TTGACACTATTTCCTATTTCTCTTATAATTTGTGTT,36,-4.440532,-4.111619,-4.091806
4,Middle TX,CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT,35,-2.311855,-4.165508,-2.291479


### Upper end > -2.9

In [244]:
upper_end_df = _df[(_df['Observed log(TX/Txref)'] > -1.5) & (_df['Predicted log(TX/Txref)'] > -1.5)]


upper_end_df.head(None)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref)
13344,TTTTCTATCTACGTAC,CGGCTA,TTTCCTATTTCTGTAA,GAGCGC,TCGTGTT,CTCTACCTTAGTTTGTACGTT,-1.327835,-1.405075


In [245]:
# upper end of TX, row 13344

UP = 'TTTTCTATCTACGTAC'
h35 = 'CGGCTA'
spacs = 'TTTCCTATTTCTGTAA'
h10 = 'GAGCGC'
disc = 'TCGTGTT'
ITR = 'CTCTACCTTAGTTTGTACGTT'

prediction = model.predict(encode_sequence(pd.DataFrame({'UP': [format(UP)],
                                                         'h35': [format(h35)],
                                                         'spacs': [format(spacs)],
                                                         'h10': [format(h10)],
                                                         'disc': [format(disc)],
                                                         'ITR': [format(ITR)]})))
print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction: -1.9923683404922485


In [246]:
# Lower end of TX, row 13344 (h35, spacs, h10, disc), ligated (UP, ITR)

UP = 'AATACTAGAGGTCTTCCGAC'
h35 = 'CGGCTA'
spacs = 'TTTCCTATTTCTGTAA'
h10 = 'GAGCGC'
disc = 'TCGTGTT'
ITR = 'GCGGGAAGACAACTAGGGGC'

prediction = model.predict(encode_sequence(pd.DataFrame({'UP': [format(UP)],
                                                         'h35': [format(h35)],
                                                         'spacs': [format(spacs)],
                                                         'h10': [format(h10)],
                                                         'disc': [format(disc)],
                                                         'ITR': [format(ITR)]})))
print(f'Prediction: {prediction[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Prediction: -3.086618423461914


In [247]:
len('CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT')

35

In [248]:
goldilocks_df.loc[5] = ['Upper TX',
                        'CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT',
                        len('CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT'),
                        _df.loc[13344, 'Observed log(TX/Txref)'],
                        prediction[0][0],
                        _df.loc[13344, 'Predicted log(TX/Txref)']]

goldilocks_df.head()

Unnamed: 0,Name,Sequence,Length,Observed_LaFleur,Prediction_RSG,Prediction_LaFleur
0,P5,TTGACAATTAATCATCCGGCTCGTAATTTATGTGGA,36,-2.859671,-4.2,-2.966256
1,plac,TTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG,36,-3.809935,-3.89,-3.147286
3,Lowest TX,TTGACACTATTTCCTATTTCTCTTATAATTTGTGTT,36,-4.440532,-4.111619,-4.091806
4,Middle TX,CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT,35,-2.311855,-4.165508,-2.291479
5,Upper TX,CGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT,35,-1.327835,-3.086618,-1.405075


In [249]:
coding = []

for row in goldilocks_df.iterrows():
    coding.append('CGAC' + row[1]['Sequence'])

print(coding)

['CGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA', 'CGACTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG', 'CGACTTGACACTATTTCCTATTTCTCTTATAATTTGTGTT', 'CGACCGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT', 'CGACCGGCTATTTCCTATTTCTGTAAGAGCGCTCGTGTT']


In [250]:
def get_complement(str):
    toReturn = ''
    mapping = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    for char in str:
        toReturn += mapping[char]
    return toReturn

In [251]:
complements = []

for row in goldilocks_df.iterrows():
    complements.append('CCGC' + get_complement(row[1]['Sequence'][::-1]))

print(complements)

['CCGCTCCACATAAATTACGAGCCGGATGATTAATTGTCAA', 'CCGCCCACACAACATACGAGCCGGAAGCATAAAGTGTAAA', 'CCGCAACACAAATTATAAGAGAAATAGGAAATAGTGTCAA', 'CCGCAACACGAGCGCTCTTACAGAAATAGGAAATAGCCG', 'CCGCAACACGAGCGCTCTTACAGAAATAGGAAATAGCCG']


## Automate

In [287]:
def find_sequences(goal, tol):
    temp_df = predictions_df[(predictions_df['Observed log(TX/Txref)'] < goal+tol) &
                             (predictions_df['Observed log(TX/Txref)'] > goal-tol) &
                             (predictions_df['Predicted log(TX/Txref)'] < goal+tol) &
                             (predictions_df['Predicted log(TX/Txref)'] > goal-tol)]
    return temp_df

In [288]:
def add_to_df(name, index, df):
    template = predictions_df.loc[index, 'h35'] + predictions_df.loc[index, 'spacs'] + predictions_df.loc[index, 'h10'] + predictions_df.loc[index, 'disc']
    df.loc[len(df)] = [name + '_template', 'CGAC' + template]
    df.loc[len(df)] = [name + '_template', 'CCGC' + template[::-1]]

In [312]:
# Low TX

find_sequences(-4, 0.03).head(None)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref),Our Prediction
10945,TTTAAAATATAATAGATTTT,TTGACA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-4.001459,-4.0213,-3.04403
11064,CTTTCCTATAAATAAAACTAGC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-3.978866,-4.011386,-2.258561


In [313]:
# Medium TX

find_sequences(-2.3, 0.01).head(None)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref),Our Prediction
6899,TTTTCTATCTACGTAC,AAGACG,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.300707,-2.290499,-2.24713
9034,TTTTCTATCTACGTAC,GACCCG,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.304443,-2.290576,-2.211866
9538,TTTTCTATCTACGTAC,GGCCGT,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.309971,-2.309973,-2.241688
9651,TTTTCTATCTACGTAC,GCTGGG,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.304958,-2.306101,-2.239044
10337,TTTTCTATCTACGTAC,CGATGT,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.290339,-2.303591,-2.239111
10386,TTTTCTATCTACGTAC,CGTAGG,CTATTTCCTATTTCTCT,TATAAT,TTGTGTT,CTCTACCTTAGTTTGTACGTT,-2.291985,-2.306241,-2.253165


In [314]:
# High TX

find_sequences(-1.7, 0.07).head(None)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Predicted log(TX/Txref),Our Prediction
12954,TTTTCTATCTACGTAC,CCCCTA,TTTCCTATTTCTGTA,GTCATA,GGGTGTT,CTCTACCTTAGTTTGTACGTT,-1.676955,-1.634668,-2.287028
13305,TTTTCTATCTACGTAC,TGAACC,TATTTCCTATTTCTG,TAAAGA,ACGTGTT,CTCTACCTTAGTTTGTACGTT,-1.643107,-1.766673,-3.857825


In [321]:
to_order_df = pd.DataFrame({'Name' : ['p5_coding', 'p5_template', 'plac_coding', 'plac_template'],
                           'Sequence' : ['CGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA', 'CCGCTCCACATAAATTACGAGCCGGATGATTAATTGTCAA', 'CGACTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG', 'CCGCCCACACAACATACGAGCCGGAAGCATAAAGTGTAAA']})

In [322]:
add_to_df('low', 10945, to_order_df)
to_order_df.head(None)

Unnamed: 0,Name,Sequence
0,p5_coding,CGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA
1,p5_template,CCGCTCCACATAAATTACGAGCCGGATGATTAATTGTCAA
2,plac_coding,CGACTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG
3,plac_template,CCGCCCACACAACATACGAGCCGGAAGCATAAAGTGTAAA
4,low_template,CGACTTGACACTATTTCCTATTTCTCTTATAATTTGTGTT


In [323]:
add_to_df('medium', 6899, to_order_df)
to_order_df.head(None)

Unnamed: 0,Name,Sequence
0,p5_coding,CGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA
1,p5_template,CCGCTCCACATAAATTACGAGCCGGATGATTAATTGTCAA
2,plac_coding,CGACTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG
3,plac_template,CCGCCCACACAACATACGAGCCGGAAGCATAAAGTGTAAA
4,low_template,CGACTTGACACTATTTCCTATTTCTCTTATAATTTGTGTT
5,low_template,CCGCTTGTGTTTAATATTCTCTTTATCCTTTATCACAGTT
6,medium_template,CGACAAGACGCTATTTCCTATTTCTCTTATAATTTGTGTT
7,medium_template,CCGCTTGTGTTTAATATTCTCTTTATCCTTTATCGCAGAA


In [324]:
add_to_df('high', 12954, to_order_df)
to_order_df.head(None)

Unnamed: 0,Name,Sequence
0,p5_coding,CGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA
1,p5_template,CCGCTCCACATAAATTACGAGCCGGATGATTAATTGTCAA
2,plac_coding,CGACTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG
3,plac_template,CCGCCCACACAACATACGAGCCGGAAGCATAAAGTGTAAA
4,low_template,CGACTTGACACTATTTCCTATTTCTCTTATAATTTGTGTT
5,low_template,CCGCTTGTGTTTAATATTCTCTTTATCCTTTATCACAGTT
6,medium_template,CGACAAGACGCTATTTCCTATTTCTCTTATAATTTGTGTT
7,medium_template,CCGCTTGTGTTTAATATTCTCTTTATCCTTTATCGCAGAA
8,high_template,CGACCCCCTATTTCCTATTTCTGTAGTCATAGGGTGTT
9,high_template,CCGCTTGTGGGATACTGATGTCTTTATCCTTTATCCCC


In [325]:
to_order_df.to_csv('sequence_order_06-17-2024')