In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras

## Making Predictions for pLac, Low, Med, High, Neg, P5, P533A, OsmY, and OsmY_C13A

Notes:
* Discriminator overlaps with ITR
* Need ITR

Lengths:
* UP = 16bp
* -35 = 6bp
* spacer = 17bp (or as noted in pdf)
* -10 = 6bp
* Disc = 8bp
* ITR = 20bp

In [2]:
def format(seq):
    """Normalize a nucleotide sequence string.

    Removes every whitespace character (spaces, tabs, newlines) and
    upper-cases the result, so sequences pasted with line breaks or
    column spacing collapse to one contiguous string.

    Note: the name shadows the built-in ``format``; it is kept so that
    existing callers continue to work.

    Args:
        seq: Sequence text, possibly containing whitespace and mixed case.

    Returns:
        The sequence as a single upper-case string with no whitespace.
    """
    # str.split() with no argument splits on any run of whitespace.
    return ''.join(seq.split()).upper()

In [3]:
# Flat-string version of the promoter constructs: promoter-specific sequence + shared tail.
# GCGGGAAGACAACTAGGGG is the last 19 bp; the ITR is 20 bp, and the first bp of the ITR
# depends on the promoter (it is before the sticky end).
# 20bp up sequence
# NOTE: `seqs` is rebound by the segment-list definition in the next cell (In [4]), so the
# values below are not the ones that get encoded and fed to the model.
# NOTE(review): the 'P5' and 'P533A' strings below are identical, whereas the segment-list
# version gives P533A a different -35 element (TTAACA vs TTGACA) - confirm which is intended.

seqs = {
         'pLac' : '' + '' + 'GCGGGAAGACAACTAGGGG',
          'Low' : 'AATACTAGAGGTCTTCCGACTTGACACTATTTCCTATTTCTCTTATAATTTGTGTT' + 'GCGGGAAGACAACTAGGGG',
          'Med' : 'AATACTAGAGGTCTTCCGACAAGACGCTATTTCCTATTTCTCTTATAATTTGTGTT' + 'GCGGGAAGACAACTAGGGG',
         'High' : 'AATACTAGAGGTCTTCCGACCCCCTATTTCCTATTTCTGTAGTCATAGGGTGTT' + 'GCGGGAAGACAACTAGGGG',
        #   'Neg' : '' + '' + 'GCGGGAAGACAACTAGGGG',
           'P5' : 'AATACTAGAGGTCTTCCGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA' + 'GCGGGAAGACAACTAGGGG',
        'P533A' : 'AATACTAGAGGTCTTCCGACTTGACAATTAATCATCCGGCTCGTAATTTATGTGGA' + 'GCGGGAAGACAACTAGGGG',
         'OsmY' : 'ATACTAGAGGTCTTCCGACCCCGAGCGGTTTCAAAATTGTGATCTATATTTAACAAA' + 'GCGGGAAGACAACTAGGGG',
    'OsmY_C13A' : 'TACTAGAGGTCTTCCGACCCCGAGCGGTTTCAAAATTGTGATATATATTTAACAAA' + 'GCGGGAAGACAACTAGGGG',
}

In [4]:
# Promoter constructs split into segments. Each value is a 6-item list in the order
# [UP, -35, spacer, -10, discriminator, ITR], following the lengths listed in the notes
# (16, 6, 17, 6, 8, 20 bp).
# The spacer is 17 bp except for pLac (18 bp) and High (15 bp).
# 'Neg' is commented out, so the dict holds 8 promoters; insertion order here determines
# the row order of everything derived from `seqs`.
seqs = {
         'pLac' : ['CTAGAGGTCTTCCGAC', 'TTTACA', 'CTTTATGCTTCCGGCTCG', 'TATGTT', 'GTGTGGGC', 'GGGAAGACAACTAGGGGCCC'],
          'Low' : ['CTAGAGGTCTTCCGAC', 'TTGACA', 'CTATTTCCTATTTCTCT', 'TATAAT', 'TTGTGTTG', 'CGGGAAGACAACTAGGGGCC'],
          'Med' : ['CTAGAGGTCTTCCGAC', 'AAGACG', 'CTATTTCCTATTTCTCT', 'TATAAT', 'TTGTGTTG', 'CGGGAAGACAACTAGGGGCC'],
         'High' : ['CTAGAGGTCTTCCGAC', 'CCCCTA', 'TTTCCTATTTCTGTA', 'GTCATA', 'GGGTGTTG', 'CGGGAAGACAACTAGGGGCC'],
        #   'Neg' : ['', '', '', '', '', ''],
           'P5' : ['CTAGAGGTCTTCCGAC', 'TTGACA', 'ATTAATCATCCGGCTCG', 'TAATTT', 'ATGTGGAG', 'CGGGAAGACAACTAGGGGCC'],
        'P533A' : ['CTAGAGGTCTTCCGAC', 'TTAACA', 'ATTAATCATCCGGCTCG', 'TAATTT', 'ATGTGGAG', 'CGGGAAGACAACTAGGGGCC'],
         'OsmY' : ['AGAGGTCTTCCGACCC', 'CGAGCG', 'GTTTCAAAATTGTGATC', 'TATATT', 'TAACAAAG', 'CGGGAAGACAACTAGGGGCC'],
    'OsmY_C13A' : ['AGAGGTCTTCCGACCC', 'CGAGCG', 'GTTTCAAAATTGTGATA', 'TATATT', 'TAACAAAG', 'CGGGAAGACAACTAGGGGCC'],
}

In [5]:
# Tabulate the segments: one row per promoter, one integer-labelled column per segment.
segments_by_column = pd.DataFrame(seqs)
X = segments_by_column.transpose()
# head(None) displays every row rather than only the first five.
X.head(None)

Unnamed: 0,0,1,2,3,4,5
pLac,CTAGAGGTCTTCCGAC,TTTACA,CTTTATGCTTCCGGCTCG,TATGTT,GTGTGGGC,GGGAAGACAACTAGGGGCCC
Low,CTAGAGGTCTTCCGAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,TTGTGTTG,CGGGAAGACAACTAGGGGCC
Med,CTAGAGGTCTTCCGAC,AAGACG,CTATTTCCTATTTCTCT,TATAAT,TTGTGTTG,CGGGAAGACAACTAGGGGCC
High,CTAGAGGTCTTCCGAC,CCCCTA,TTTCCTATTTCTGTA,GTCATA,GGGTGTTG,CGGGAAGACAACTAGGGGCC
P5,CTAGAGGTCTTCCGAC,TTGACA,ATTAATCATCCGGCTCG,TAATTT,ATGTGGAG,CGGGAAGACAACTAGGGGCC
P533A,CTAGAGGTCTTCCGAC,TTAACA,ATTAATCATCCGGCTCG,TAATTT,ATGTGGAG,CGGGAAGACAACTAGGGGCC
OsmY,AGAGGTCTTCCGACCC,CGAGCG,GTTTCAAAATTGTGATC,TATATT,TAACAAAG,CGGGAAGACAACTAGGGGCC
OsmY_C13A,AGAGGTCTTCCGACCC,CGAGCG,GTTTCAAAATTGTGATA,TATATT,TAACAAAG,CGGGAAGACAACTAGGGGCC


In [None]:
def padded_one_hot_encode(sequence):
    """One-hot encode a nucleotide sequence that may contain '0' padding.

    Each character becomes a 4-element row in A, C, G, T order; the padding
    character '0' becomes an all-zero row. Bases are matched
    case-insensitively, so 'acgt' encodes the same as 'ACGT'.

    This function does not add padding itself; callers pad the string with
    '0' characters before passing it in.

    Args:
        sequence: Iterable of single-character strings drawn from
            A, C, G, T (any case) and '0'.

    Returns:
        A list with one 4-element list of ints per input character. Every
        row is an independent list, so mutating one row does not affect
        any other.

    Raises:
        KeyError: If a character is not A, C, G, T, or '0'.
    """
    mapping = {'A': [1,0,0,0], 'C': [0,1,0,0], 'G': [0,0,1,0], 'T': [0,0,0,1], '0': [0,0,0,0]}
    # list(...) copies the row so repeated bases do not alias one shared list object.
    return [list(mapping[nucleotide.upper()]) for nucleotide in sequence]


# Fixed width of each segment after left-padding with '0', in column order:
# UP, -35, spacer, -10, discriminator, ITR.
# NOTE(review): presumably these are the per-segment widths the model was trained on - confirm.
lengths = [22, 6, 19, 6, 8, 21]

# Rebuild the segment table from `seqs` instead of reading `X`: `X` is rebound to the
# encoded array at the end of this cell, so reading `X.columns` here would fail whenever
# the cell is run a second time.
segment_table = pd.DataFrame(seqs).T
if len(segment_table.columns) != len(lengths):
    raise ValueError(
        f'expected {len(lengths)} segments per promoter, got {len(segment_table.columns)}'
    )

upstream_padding = {}
for col, width in zip(segment_table.columns, lengths):
    # A segment longer than its slot cannot be padded and would yield ragged rows.
    too_long = [name for name, seq in segment_table[col].items() if len(seq) > width]
    if too_long:
        raise ValueError(f'segment {col} exceeds {width} bp for: {too_long}')
    # rjust left-pads with '0', which padded_one_hot_encode maps to an all-zero row.
    upstream_padding[col] = np.array(
        [padded_one_hot_encode(seq.rjust(width, '0')) for seq in segment_table[col]]
    )

# Join the encoded segments along the position axis: (n_promoters, sum(lengths), 4).
X = np.concatenate([upstream_padding[col] for col in segment_table.columns], axis=1)

In [None]:
# Sanity check on the encoded input: axis 0 is one row per promoter, axis 1 is the
# concatenated padded segments (22 + 6 + 19 + 6 + 8 + 21 = 82 positions), and the last
# axis holds the 4 one-hot channels.
X.shape

In [None]:
# # Old model (should be worse, but isn't)

# model = keras.saving.load_model('../../v1/models/Hyperparameter_tuned.keras')

In [None]:
# Load the newer CNN model from disk.
# (It was expected to outperform the old model, but so far it does not.)

model = keras.saving.load_model(filepath='../Testing/CNN_kernal_4.keras')

In [None]:
# Labels for the rows of X, in the insertion order of `seqs`.
# NOTE(review): 'Medium' differs from the `seqs` key 'Med'; presumably it matches the
# 'Promoter' values in the fluorescence CSV - confirm, because the later merge joins on it.
promoters = ['pLac', 'Low', 'Medium', 'High', 'P5', 'P533A', 'OsmY', 'OsmY_C13A']
if len(promoters) != len(X):
    raise ValueError(f'{len(promoters)} promoter labels but {len(X)} encoded sequences')

# Score every promoter in one batched call instead of one predict() call per sequence.
# Column 0 is the same element the per-sample `[0][0]` lookup selected.
# NOTE(review): assumes the model output is 2-D, (n_samples, n_outputs) - confirm.
scaled_predictions = model.predict(X)[:, 0]
# Placeholder list of zeros; not read by this cell.
RFP_expression = [0] * len(X)

prediction_df = pd.DataFrame({'Promoter': promoters, 'Prediction': scaled_predictions})
prediction_df.head(None)


In [None]:
# # Not necessary with old model

# from sklearn.preprocessing import MinMaxScaler

# observed_df = pd.read_csv('../predictions.csv')

# scaler = MinMaxScaler()
# scaled_data = scaler.fit_transform(observed_df[['Observed log(TX/Txref)']])
# predicted_values = scaler.inverse_transform(np.array(scaled_predictions).reshape(-1, 1))
# prediction_df['Prediction'] = predicted_values

# prediction_df.head(None)

## Getting observed RFP

In [None]:
# Read the formatted fluorescence measurements (path is relative to the working directory).
RFP_df = pd.read_csv(filepath_or_buffer='2024_07_03_Fluorescence_Data_formatted.csv')
RFP_df.head()

In [None]:
# Keep only rows whose 'Promoter' is not 'Blank', then renumber the index from 0.
is_not_blank = RFP_df['Promoter'].ne('Blank')
RFP_df = RFP_df.loc[is_not_blank].reset_index(drop=True)
RFP_df.head()

In [None]:
# # drop P5 outlier

# RFP_df = RFP_df[RFP_df['Promoter'] != 'P5'].reset_index(drop=True)
# RFP_df.head()

In [None]:
# Attach each promoter's prediction to its fluorescence rows. A left join keeps every
# fluorescence row; rows with no matching promoter label get NaN in 'Prediction'.
comparison_df = pd.merge(
    RFP_df,
    prediction_df[['Promoter', 'Prediction']],
    on='Promoter',
    how='left',
)
comparison_df.head()

In [None]:
# Discard every row that has a missing value in any column
# (this includes rows left without a 'Prediction' by the left join).
comparison_df = comparison_df.dropna()
comparison_df.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Distinct growth media and promoters, in order of first appearance.
unique_media = comparison_df['Media'].unique()
unique_promoters = comparison_df['Promoter'].unique()

# Give every promoter one fixed Set2 colour so it looks the same in every figure.
colors = plt.cm.Set2(np.linspace(0, 1, len(unique_promoters)))
color_map = dict(zip(unique_promoters, colors))

# One figure per medium: predicted value (x) against measured relative fluorescence (y).
for media in unique_media:
    subset_df_media = comparison_df.loc[comparison_df['Media'] == media]

    fig, ax = plt.subplots()

    for promoter in unique_promoters:
        promoter_subset_df = subset_df_media.loc[subset_df_media['Promoter'] == promoter]
        color = color_map[promoter]
        ax.scatter(
            promoter_subset_df['Prediction'],
            promoter_subset_df['Relative Fluorescence'],
            c=[color],
            label=promoter,
        )

    ax.set_xlabel('Predicted log(TX/Txref)')
    ax.set_ylabel('Relative Fluorescence')
    ax.set_title(f'Scatter plot for Media: {media}')
    ax.legend()

# Render all figures created in the loop.
plt.show()
