## Description

This version is trained on predictions.csv, which contains LaFleur et al.'s base data. For the X data, it one-hot encodes the base pairs in each element and appends them to a 2-dimensional array. Each element is padded so that all elements have the same length. This was a design decision that we changed later. The y data is the 'Observed log(TX/Txref)' column, which is normalized using sklearn's MinMaxScaler (from 0, high expression, to 1, low expression).

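The model uses 1D convolutions with a kernel size of 3 (originally described as a 3x3 kernel). Note that, although this version was described as having no early stopping, the training cell below does pass an EarlyStopping callback (patience of 10 epochs on the validation loss).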

In [3]:
import pandas as pd
import numpy as np

In [None]:
# Read predictions.csv (one directory above this notebook) into `df`.

df = pd.read_csv(filepath_or_buffer='../predictions.csv')

In [4]:
# Min-max scale the observed log(TX/Txref) values and store the result in a
# new 'Normalized Observed' column of `df`.

from sklearn.preprocessing import MinMaxScaler

observed_scaler = MinMaxScaler()
# Double brackets select a one-column DataFrame (2-D), which is what the
# scaler is given to fit and transform.
observed_column = df[['Observed log(TX/Txref)']]
df['Normalized Observed'] = observed_scaler.fit_transform(observed_column)

In [5]:
# Define the function to onehot encode the input (x) sequences

def padded_one_hot_encode(sequence):
    """One-hot encode a sequence, producing one 4-element row per character.

    'A', 'C', 'G' and 'T' map to unit vectors in that order, and the padding
    character '0' maps to an all-zero row. The function does not add padding
    itself; it only knows how to encode '0' characters already present.

    Args:
        sequence: Iterable of single characters drawn from 'ACGT0'.

    Returns:
        A list with one ``[A, C, G, T]`` indicator list per input character,
        in input order (empty list for an empty sequence).

    Raises:
        KeyError: If a character outside 'ACGT0' is encountered.
    """
    one_hot_rows = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1],
        '0': [0, 0, 0, 0],
    }
    return [one_hot_rows[symbol] for symbol in sequence]


# Assemble the model inputs. Every sequence column is left-padded with '0'
# up to the longest entry in that column, one-hot encoded, and the resulting
# per-column arrays are joined along axis 1 in the column order listed here.
sequence_columns = ['UP', 'h35', 'spacs', 'h10', 'disc', 'ITR']
sequences = df[sequence_columns]
y = df['Normalized Observed']

upstream_padding = {}
for name in sequence_columns:
    column = sequences[name]
    width = column.map(len).max()
    # rjust pads on the left, so the original characters stay right-aligned.
    encoded = [padded_one_hot_encode(entry.rjust(width, '0')) for entry in column]
    upstream_padding[name] = np.array(encoded)

X = np.concatenate([upstream_padding[name] for name in sequence_columns], axis=1)

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition reproducible between runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Empty registries, each filled in by later cells.
X_dict = dict()         # the various input approaches
train_test = dict()     # split training/testing data
results = dict()        # evaluation results
models = dict()         # the models themselves
model_history = dict()  # the training history of each model

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Define CNN model architecture: two Conv1D (64 filters, kernel size 3) +
# max-pooling stages over the encoded sequence, followed by a 64-unit dense
# layer and a single linear output for the normalized expression value.
models['CNN'] = Sequential()
models['CNN'].add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=X.shape[1:]))
models['CNN'].add(MaxPooling1D(pool_size=2))
models['CNN'].add(Conv1D(filters=64, kernel_size=3, activation='relu'))
models['CNN'].add(MaxPooling1D(pool_size=2))
models['CNN'].add(Flatten())
models['CNN'].add(Dense(64, activation='relu'))
models['CNN'].add(Dense(1, activation='linear'))

# Compile the model
optimizer = Adam(learning_rate=0.001)
models['CNN'].compile(optimizer=optimizer, loss='mean_squared_error')

# Early stopping to prevent overfitting: stop after 10 epochs without a
# val_loss improvement and roll back to the best-scoring weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model.
# The validation data that drives early stopping is carved out of the
# training split (validation_split) rather than taken from (X_test, y_test).
# Because restore_best_weights selects the weights by val_loss, validating on
# the test split would let the test data choose the final model and make the
# evaluate() result below optimistic.
# Keras takes the validation samples from the end of the arrays it is given;
# X_train/y_train come out of train_test_split, which shuffles by default.
# y_train is converted to a NumPy array so the split operates on plain arrays.
history = models['CNN'].fit(X_train,
                            np.asarray(y_train),
                            epochs=150,
                            batch_size=32,
                            validation_split=0.2,
                            callbacks=[early_stopping])

# Evaluate the model on the test split, which was not used during training
# or early stopping.
loss = models['CNN'].evaluate(X_test, y_test)

# Record the outcome in the shared registries and persist the trained model.
results['CNN'] = loss
model_history['CNN'] = history
models['CNN'].save('CNN.keras')


In [None]:
# Display the value returned by models['CNN'].evaluate(X_test, y_test),
# i.e. the mean squared error loss the model was compiled with.
print(loss)

In [None]:
# Make predictions for every encoded sequence in X (training and test rows
# alike) to visualize the results. Keys are row positions in X, so the
# predictions line up with the rows of df.
#
# A single batched predict() call replaces one predict() call per row, which
# repeats the per-call overhead for every sample.
predictions = models['CNN'].predict(X)

# The model has one output unit, so column 0 holds the prediction per row.
our_prediction_dict = dict(enumerate(predictions[:, 0]))
    

In [None]:
# Collect the observed (normalized) values and the predictions for plotting.
# The predictions become a one-column DataFrame ('Value') indexed by the
# dictionary keys.

observed = df['Normalized Observed']
our_prediction = pd.DataFrame.from_dict(
    our_prediction_dict,
    orient='index',
    columns=['Value'],
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the kernel density estimates of the observed and predicted values.
# Both kdeplot calls draw on the current axes, which the first call returns.
ax = sns.kdeplot(observed, fill=True, color='blue', label='Observed')
sns.kdeplot(our_prediction, fill=True, color='green', label='Our Prediction')

ax.set_title('Kernel Density Plot')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.legend()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draws the same observed-vs-predicted density comparison as the previous
# cell: one filled KDE curve per (data, color, label) entry.
density_series = (
    (observed, 'blue', 'Observed'),
    (our_prediction, 'green', 'Our Prediction'),
)
for values, curve_color, curve_label in density_series:
    sns.kdeplot(values, fill=True, color=curve_color, label=curve_label)

plt.title('Kernel Density Plot')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scatter the observed values against the model's predictions, with a y = x
# reference line on which a perfect prediction would fall.
# `observed` is the MinMax-normalized column, so the axis label and title say
# "Normalized" rather than presenting the values as raw log(TX/Txref).

# Flatten the predictions to 1-D so they pair element-wise with `observed`.
our_prediction = np.ravel(our_prediction)

plt.figure(figsize=(10, 6))
plt.scatter(observed, our_prediction, color='blue', alpha=0.5, label='Data points')

# Span the reference line over the combined range of both series. NumPy
# reductions replace Python-level min()/max() loops over every element.
min_val = min(np.min(observed), np.min(our_prediction))
max_val = max(np.max(observed), np.max(our_prediction))
plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='y = x line')

plt.title('Normalized Observed log(TX/Txref) vs. Our Prediction')
plt.xlabel('Normalized Observed log(TX/Txref)')
plt.ylabel('Our Prediction')
plt.legend()
plt.grid(True)
plt.show()
