# This is an example file to show how to train the model using the Rg_data.npy dataset.


### Import all required packages

In [None]:
# Import required packages
import numpy as np
import tensorflow as tf
import random
from numpy import sqrt
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import utils

# Load the data and create arrays for Rg and sequences

In [None]:
# Load the pickled dataset; .item() pulls the single stored container out of the loaded array.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load a trusted Rg_data.npy.
seq = np.load("Rg_data.npy", allow_pickle=True).item()

# Entries are addressed by integer position 0 .. len(seq)-1.
# entry[0] is the sequence; entry[1][1] is the record whose field 1 is stored as Rg
# and whose field 3 is stored as nu.
data_seq, Rg, nu = [], [], []
for idx in range(len(seq)):
    entry = seq[idx]
    data_seq.append(entry[0])
    record = entry[1][1]
    Rg.append(record[1])
    nu.append(record[3])

# Ordinal encoding of the sequences (fed to utils.get_CL_from_OE when the split is defined)
OE = utils.seqs_to_ordinal_encoding(data_seq)

In [None]:
# Turn the per-sequence target lists into (n, 1) column vectors
Rg = np.reshape(Rg, (-1, 1))
nu = np.reshape(nu, (-1, 1))

# Input features: bag-of-AAs representation built by utils from the raw sequences
X = utils.seqs_to_bag_of_AAs(data_seq)

# Target matrix: column 0 holds Rg, column 1 holds nu
Y = np.concatenate((Rg, nu), axis=1)

# Define the dataset split random seed, six-fold cross validation and learning curve split

In [None]:
# Split configuration, passed positionally to utils.LC_split_CL below
fold, split, seed = 6, 8, 10

# Derive c and CL from the ordinal encoding; only c is used for the split in this cell
c, CL = utils.get_CL_from_OE(OE)

# Train / test indices produced by the utils split helper
Train_indices, Test_indices = utils.LC_split_CL(fold, split, seed, c)


## Using the train indices and test indices to separate the data

In [None]:
# Gather the rows of X and Y selected by the train / test indices and stack them into 2-D arrays

X_train_unscaled = np.vstack([X[idx] for idx in Train_indices])
Y_train_unscaled = np.vstack([Y[idx] for idx in Train_indices])
X_test_unscaled = np.vstack([X[idx] for idx in Test_indices])
Y_test_unscaled = np.vstack([Y[idx] for idx in Test_indices])

# Min-max normalisation of the inputs: the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = MinMaxScaler().fit(X_train_unscaled)
X_train = scaler.transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Same treatment for the targets; scalerY is kept so predictions can be inverse-transformed later
scalerY = MinMaxScaler().fit(Y_train_unscaled)
Y_train = scalerY.transform(Y_train_unscaled)
Y_test = scalerY.transform(Y_test_unscaled)


# Training a feedforward neural network using Temperature Data

In [None]:

# Hyperparameters for the standard architecture
n_hidden_layers = 2
n_hidden_nodes = 100
n_batch = 32
n_epoch = 10000
patience = 25         # epochs without val_loss improvement tolerated by early stopping
learning_rate = 1e-4  # Adam learning rate

# Seed TensorFlow before the optimizer and the network are created
tf.random.set_seed(1)

# Early stopping on the validation loss; the best weights seen are restored when training stops
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=patience,
    restore_best_weights=True,
)

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Network dimensions: one input per feature column, two outputs (the two columns of Y)
input_dim = X_train.shape[1]
output_dim = 2

# Build the network via the utils helper and compile it with an MSE loss
DNN = utils.create_DNN(input_dim, output_dim, n_hidden_nodes, n_hidden_layers)
DNN.compile(optimizer=optimizer, loss='mean_squared_error')

# Visual separator before the training log
print('------------------------------------------------------------------------')

# Train; validation_split=0.25 reserves a quarter of the training rows for val_loss
history = DNN.fit(
    X_train,
    Y_train,
    epochs=n_epoch,
    batch_size=n_batch,
    shuffle=True,
    callbacks=[es],
    validation_split=0.25,
)

# Per-epoch training and validation loss curves
train_loss = np.asarray(history.history['loss'])
val_loss = np.asarray(history.history['val_loss'])

DNN.summary()

In [None]:
# Predict on the scaled test inputs, then undo the target scaling so the
# predictions are comparable with Y_test_unscaled
Y_test_pred = scalerY.inverse_transform(DNN.predict(X_test))

In [None]:
# Print each utils test metric on the unscaled targets, one result per line, in this order:
# coeff_determination, percent_error, MAE, MSE, RMSE
for metric in (utils.coeff_determination, utils.percent_error, utils.MAE, utils.MSE, utils.RMSE):
    print(metric(Y_test_unscaled, Y_test_pred))

# Extrapolation Test
The extrapolation test is implemented as follows:

1) Define a variable that contains the model parameters
2) Give the function X, Y, Train_indices, Test_indices, the model, and the boolean variable forward (default is True).

The results will give a list containing training size, test loss and test score

In [None]:
# Hyperparameters for the standard architecture
n_hidden_layers = 2
n_hidden_nodes = 100
n_batch = 32
n_epoch = 10000
patience = 25         # epochs without val_loss improvement tolerated by early stopping
learning_rate = 1e-4  # Adam learning rate

# Seed TensorFlow before the optimizer and the network are created
tf.random.set_seed(1)

# Early-stopping callback
# NOTE(review): es, n_batch and n_epoch are defined here but not passed to
# utils.extrapolation_test_DNN below -- confirm which training settings the helper uses.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=patience,
    restore_best_weights=True,
)

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Network dimensions: one input per feature column, two outputs (the two columns of Y)
input_dim = X_train.shape[1]
output_dim = 2

# Build a fresh network and compile it with an MSE loss
DNN = utils.create_DNN(input_dim, output_dim, n_hidden_nodes, n_hidden_layers)
DNN.compile(optimizer=optimizer, loss='mean_squared_error')

# Run the extrapolation test on the full X / Y with the existing train / test indices
results = utils.extrapolation_test_DNN(
    X, Y, Train_indices, Test_indices, DNN, forward=True
)

# Training on temperature data

In this section, the temperature is incorporated into the features used to train the model.
The data preparation is slightly different and is written as follows:

In [None]:
# Data preparation for training with temperature as an additional input feature

# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load a trusted Rg_data.npy.
seq = np.load("Rg_data.npy",allow_pickle=True)
seq = seq.item()
data_seq = []
Rg = []
nu = []
Temp = []
records_per_seq = set()  # distinct record counts seen across sequences
for i in range(len(seq)):
    records = seq[i][1]
    data_seq.append(seq[i][0])
    records_per_seq.add(len(records))
    # Every record of a sequence contributes one flat entry: field 0 -> Temp, 1 -> Rg, 3 -> nu
    for j in range(len(records)):
        Temp.append(records[j][0])
        Rg.append(records[j][1])
        nu.append(records[j][3])

# The train/test assembly addresses these flat lists as i*N_of_Temp+j, which is only
# valid when every sequence has the same number of records. Check that explicitly
# instead of silently taking the count of the last sequence.
if len(records_per_seq) != 1:
    raise ValueError(
        "Expected every sequence to have the same number of temperature records, "
        "got counts: %s" % sorted(records_per_seq)
    )
N_of_Temp = records_per_seq.pop()
OE = utils.seqs_to_ordinal_encoding(data_seq)

# Flat targets as (n, 1) column vectors, one row per (sequence, record) pair
Rg = np.array(Rg).reshape(-1,1)
nu = np.array(nu).reshape(-1,1)
# Per-sequence features from utils (one entry per sequence, not per record)
X = utils.seqs_to_bag_of_AAs(data_seq)

# Target matrix: column 0 holds Rg, column 1 holds nu
Y = np.hstack((Rg,nu))
# Flatten the nested train indices into one flat list of sequence indices.
# np.ravel() treats a bare index like a one-element group, so re-running this cell on the
# already-flat list leaves it unchanged instead of failing because an int is not iterable.
Train_indices = [item for sublist in Train_indices for item in np.ravel(sublist).tolist()]

def _stack_with_temperature(indices):
    """Build the stacked feature and target matrices for the given sequence indices.

    For every index i and every j in range(N_of_Temp), the feature row is X[i] with
    Temp[i*N_of_Temp+j] appended as the last column, and the target row is
    Y[i*N_of_Temp+j]. Returns the pair (features, targets), each vertically stacked.
    """
    feature_rows = []
    target_rows = []
    for i in indices:
        for j in range(N_of_Temp):
            flat = i*N_of_Temp+j
            feature_rows.append(np.hstack((X[i], Temp[flat])))
            target_rows.append(Y[flat])
    return np.vstack(feature_rows), np.vstack(target_rows)


X_train_unscaled, Y_train_unscaled = _stack_with_temperature(Train_indices)
X_test_unscaled, Y_test_unscaled = _stack_with_temperature(Test_indices)

# Y is built as hstack((Rg, nu)), so column 0 is Rg and column 1 is nu.
# The nu targets therefore come from column 1 (column 0 would return Rg).
Y_train_nu = Y_train_unscaled[:,1]
Y_test_nu = Y_test_unscaled[:,1]

# Min-max scale the temperature-augmented inputs; the scaler is fitted on training rows only
scaler = MinMaxScaler().fit(X_train_unscaled)
X_train = scaler.transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Target scaler, likewise fitted on the training targets only
scalerY = MinMaxScaler().fit(Y_train_unscaled)
Y_train = scalerY.transform(Y_train_unscaled)