## DeepSynergy

Author: Kristina Preuer

This Keras script shows how DeepSynergy was evaluated in one cross-validation run (executed 5 times, looping over the test folds). In this example, fold 0 is used for testing. The script uses 60% of the data for training (folds 2, 3, 4) and 20% for validation (fold 1). The hyperparameters are loaded from a separate text file (`hyperparameters`). The validation loss was used to determine the early-stopping parameter. After hyperparameter selection, the training and validation data were combined (80% = folds 1, 2, 3, 4) and the remaining 20% (fold 0) of the data was used for testing.

The original work was done analogously with binet (https://github.com/bioinf-jku/binet/tree/master/binet).

In [None]:
import os, sys

import pandas as pd
import numpy as np
import pickle
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from tensorflow.keras.models import load_model

import matplotlib.pyplot as plt

os.environ["CUDA_VISIBLE_DEVICES"]="3" #specify GPU 
from tensorflow import keras as K
import tensorflow as tf
from tensorflow.keras import backend
from tensorflow.compat.v1.keras.backend import set_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

#### Define parameters for this cross-validation run

In [None]:
# Text file that contains the hyperparameters of the model.
hyperparameter_file = "hyperparameters"
# Gzipped pickle file that contains the data (produced with normalize.ipynb).
data_file = "/home/nidhi/Documents/freelancing/DeepSynergy/data/data_test_fold0_tanh.p.gz"

#### Define smoothing functions for early stopping parameter

In [None]:
def moving_average(a, n=3):
    """Return the simple moving average of `a` over a window of `n` entries.

    The result holds one value per complete window, i.e. it is `n - 1`
    entries shorter than `a` (and empty when `a` has fewer than `n` entries).
    """
    # Running totals: totals[i] == a[0] + ... + a[i].
    totals = np.cumsum(a, dtype=float)
    # From index n onwards, subtracting the total n steps back leaves the sum
    # of the most recent n entries only.
    window_sums = totals.copy()
    window_sums[n:] = totals[n:] - totals[:-n]
    # The first n - 1 entries belong to incomplete windows and are dropped.
    return window_sums[n - 1:] / n

#### Load parameters defining the model

In [None]:
# Run the hyperparameter file as Python code so that the names it assigns
# become notebook-level variables (presumably `layers`, `act_func`, `eta`,
# `dropout` and `input_dropout`, which are used below without being defined
# anywhere else in this notebook).
# NOTE(security): exec() executes arbitrary code -- only point
# `hyperparameter_file` at a file you trust.
# The `with` block closes the file handle, which the bare open() never did.
with open(hyperparameter_file) as hyperparameter_fh:
    exec(hyperparameter_fh.read())

#### Load data 
tr = 60% of data for training during hyperparameter selection <br>
val = 20% of data for validation during hyperparameter selection

train = tr + val = 80% of data for training during final testing <br>
test = remaining left out 20% of data for unbiased testing 

Splitting and normalization were done with normalize.ipynb.

In [None]:
# Load the eight pre-split arrays from the gzipped pickle.
# The context manager closes the file even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code -- only load data
# files that you produced yourself (e.g. with normalize.ipynb).
with gzip.open(data_file, 'rb') as file:
    X_tr, X_val, X_train, X_test, y_tr, y_val, y_train, y_test = pickle.load(file)

#### run set

In [None]:
# TF1-compat session configuration with soft device placement enabled.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)

# Switch on the `allow_growth` GPU option on the config object itself.
config.gpu_options.allow_growth = True

In [None]:

# Start a TF1-compat interactive session that uses `config`.
# (A set_session(...)-based variant was tried earlier and is left disabled.)

from tensorflow.compat.v1 import InteractiveSession
session = InteractiveSession(config=config)



In [None]:
# Build the feed-forward regression network described by the hyperparameters.
# `layers` lists the width of every Dense layer:
#   * first layer: `act_func` activation, followed by input dropout
#   * last layer:  linear activation (regression output), no dropout
#   * layers in between: `act_func` activation, followed by regular dropout
# NOTE(review): when `layers` has a single entry the first-layer branch wins,
# so that layer is not linear -- confirm this never happens.
model = Sequential()
metrics = [
    tf.keras.metrics.MeanSquaredError(),
    tf.keras.metrics.RootMeanSquaredError(),
    tf.keras.metrics.MeanAbsoluteError(),
]
last_layer = len(layers) - 1
for i, units in enumerate(layers):
    if i == 0:
        model.add(Dense(units, input_shape=(X_tr.shape[1],), activation=act_func,
                        kernel_initializer='he_normal'))
        model.add(Dropout(float(input_dropout)))
    elif i == last_layer:
        model.add(Dense(units, activation='linear', kernel_initializer="he_normal"))
    else:
        model.add(Dense(units, activation=act_func, kernel_initializer="he_normal"))
        model.add(Dropout(float(dropout)))

# Compile once, after the whole stack is assembled (it used to be recompiled
# after every layer). `learning_rate` replaces the deprecated `lr` keyword.
model.compile(loss='mean_squared_error', metrics=metrics,
              optimizer=K.optimizers.SGD(learning_rate=float(eta), momentum=0.5))

Train model

In [None]:
# Sanity check: show the dimensions of the training inputs, then the targets.
for split in (X_tr, y_tr):
    print(split.shape)

In [None]:
# Train on the training split for 15 epochs and evaluate on the validation
# split after every epoch; the per-epoch validation loss is kept for the
# early-stopping analysis further down.
hist = model.fit(
    X_tr,
    y_tr,
    validation_data=(X_val, y_val),
    epochs=15,
    batch_size=64,
    shuffle=True,
)
val_loss = hist.history['val_loss']

In [None]:
# Write the trained model to an HDF5 file, load it back, and let the reloaded
# copy predict the test fold.
checkpoint_path = "model_15_epochs.h5"
model.save(checkpoint_path)
model_15epochs = load_model(checkpoint_path)
predictions_15epochs = model_15epochs.predict(X_test)


In [None]:
# Notebook display: test-fold predictions of the reloaded 15-epoch model.
predictions_15epochs

In [None]:
# Notebook display: the values recorded by the most recent model.fit call
# (includes the 'val_loss' series used above).
hist.history

Evaluate model on test data

In [None]:
# Score the trained model on the held-out test fold and report each metric.
print("Regression metrics from the model")
results = model.evaluate(X_test, y_test, batch_size=128)
print(results)

# Entry 0 of `results` is skipped here; entries 1-3 are reported in the order
# in which the metrics were passed to compile().
metric_labels = (
    "Mean squared Error=",
    "Root Mean squared Error=",
    "Mean Absolute Error=",
)
for position, label in enumerate(metric_labels, start=1):
    print(label, results[position])


Synergy Score Prediction

In [None]:
# Predict synergy scores for the test fold with the model trained above and
# print the shape of the output followed by the raw values.
print("Synergy Score Prediction")
predictions = model.predict(X_test)
print("predictions shape:", predictions.shape)

print(predictions)

Classification metrics for model

In [None]:
# Generate binary classification labels from the regression targets and from
# the model predictions.
# Classification threshold = 30 as used in paper.
synergy_threshold = 30

classification_df = pd.DataFrame(y_test, columns=["y_test"])

# Every row gets a label: 1.0 above the threshold, 0.0 otherwise. The former
# pair of `> 30` / `< 30` assignments left rows lying exactly on the threshold
# as NaN, which makes the sklearn metrics below fail.
classification_df["y_label"] = (
    classification_df.y_test > synergy_threshold
).astype(float)

# Add predictions to classification_df and binarize them the same way.
classification_df["predictions"] = predictions
classification_df["predicted_label"] = (
    classification_df.predictions > synergy_threshold
).astype(float)

classification_df.head()


In [None]:
# Compare true and predicted class labels on the test fold.
true_labels = np.array(classification_df.y_label)
predicted_labels = np.array(classification_df.predicted_label)

# Plain accuracy: share of rows whose predicted label equals the true label.
label_matches = classification_df.y_label == classification_df.predicted_label
correct_preds = classification_df.index[label_matches].tolist()
n_rows = classification_df.shape[0]
accuracy = 100 * len(correct_preds)/n_rows

# Balanced accuracy as computed by scikit-learn, expressed in percent.
balanced_accuracy = 100 * balanced_accuracy_score(true_labels, predicted_labels)

print("accuracy = {}%".format(accuracy))
print("balanced accuracy = {0}%".format(balanced_accuracy))
print("Confusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels))



Accuracy per epoch

In [None]:
# Build a fresh network with the same architecture rule as before:
#   * first layer: `act_func` activation, followed by input dropout
#   * last layer:  linear activation (regression output), no dropout
#   * layers in between: `act_func` activation, followed by regular dropout
model = Sequential()
metrics = [
    tf.keras.metrics.MeanSquaredError(),
    tf.keras.metrics.RootMeanSquaredError(),
    tf.keras.metrics.MeanAbsoluteError(),
]
last_layer = len(layers) - 1
for i, units in enumerate(layers):
    if i == 0:
        model.add(Dense(units, input_shape=(X_tr.shape[1],), activation=act_func,
                        kernel_initializer='he_normal'))
        model.add(Dropout(float(input_dropout)))
    elif i == last_layer:
        model.add(Dense(units, activation='linear', kernel_initializer="he_normal"))
    else:
        model.add(Dense(units, activation=act_func, kernel_initializer="he_normal"))
        model.add(Dropout(float(dropout)))

# Compile once, after the whole stack is assembled (it used to be recompiled
# after every layer). `learning_rate` replaces the deprecated `lr` keyword.
model.compile(loss='mean_squared_error', metrics=metrics,
              optimizer=K.optimizers.SGD(learning_rate=float(eta), momentum=0.5))

# Save the freshly compiled (still untrained) model to disk.
model.save("model.h5")

# Train the final model on the combined train+validation data one epoch at a
# time and, after every epoch, record regression and classification metrics
# on the test fold in `metrics_df` (one row per epoch).
synergy_threshold = 30  # classification threshold, as used in the paper
n_epochs = 50           # matches the "50 epochs" in the result file names;
                        # the former range(1, 50) only ran 49 epochs

columns = ["MSE", "RMSE", "MAE", "acc", "balanced_acc"]
metrics_df = pd.DataFrame(columns=columns)

# The true labels do not change between epochs, so derive them once.
# Rows exactly at the threshold are labelled 0.0 rather than left as NaN
# (NaN labels make the sklearn metrics fail).
labelled_targets = pd.DataFrame(y_test, columns=["y_test"])
labelled_targets["y_label"] = (
    labelled_targets.y_test > synergy_threshold
).astype(float)

# `model` is the network compiled just above this loop. It is trained in
# place, so it does not have to be reloaded from disk between epochs; a
# checkpoint is still written after every epoch.
for index in range(n_epochs):
    hist = model.fit(X_train, y_train, epochs=1, shuffle=True, batch_size=64,
                     validation_data=(X_test, y_test))

    print("Regression metrics from the model")
    results = model.evaluate(X_test, y_test, batch_size=128)
    predictions2 = model.predict(X_test)

    # Add this epoch's predictions and their binarized labels.
    classification_df = labelled_targets.copy()
    classification_df["predictions"] = predictions2
    classification_df["predicted_label"] = (
        classification_df.predictions > synergy_threshold
    ).astype(float)

    correct_preds = classification_df.index[
        classification_df.y_label == classification_df.predicted_label
    ].tolist()
    accuracy = 100 * len(correct_preds) / classification_df.shape[0]
    balanced_accuracy = 100 * balanced_accuracy_score(
        np.array(classification_df.y_label),
        np.array(classification_df.predicted_label),
    )

    # Metrics per epoch; entry 0 of `results` is skipped, entries 1-3 follow
    # the order of the metrics passed to compile().
    epoch_metrics = {
        "MSE": results[1],
        "RMSE": results[2],
        "MAE": results[3],
        "acc": accuracy,
        "balanced_acc": balanced_accuracy,
    }
    for column, value in epoch_metrics.items():
        metrics_df.at[index, column] = value

    print(metrics_df)
    model.save("model.h5")
    print("Saved model to disk")




        

Saving results to disk for plotting.

In [None]:
# #save predictions to disk for plotting results.
# predictions_df = pd.DataFrame(predictions)
# predictions_df.to_csv("/home/nidhi/Documents/freelancing/DeepSynergy/data/Results/predictions.csv")

In [None]:
# Notebook display of the test-fold predictions from the last training epoch.
# The frame is built here because `predictions2_df` is otherwise only defined
# in a later cell, so running the notebook top to bottom raised a NameError.
predictions2_df = pd.DataFrame(predictions2)
predictions2_df

In [None]:
# Save results of model training version 2 (with accuracy metrics per epoch).
# `predictions2` holds the test-fold predictions of the last epoch that ran.
predictions2_df = pd.DataFrame(predictions2)
predictions2_csv = "/home/nidhi/Documents/freelancing/DeepSynergy/data/Results/predictions_per_epoch_acc_50_epochs.csv"
predictions2_df.to_csv(predictions2_csv)

In [None]:
# Write the per-epoch metrics table to disk as CSV.
metrics_csv = "/home/nidhi/Documents/freelancing/DeepSynergy/data/Results/metrics_per_epoch_50epochs.csv"
metrics_df.to_csv(metrics_csv)

#### smooth validation loss for early stopping parameter determination

In [None]:
# Smooth the validation-loss curve with a moving average and pick the epoch
# (0-based index) with the lowest smoothed loss.
# The window was 15 in the original script. It must not exceed the number of
# recorded epochs: a larger window makes `mov_av` empty and np.pad then fails.
average_over = 2
mov_av = moving_average(np.array(val_loss), average_over)
print(mov_av)

# The moving average is `average_over - 1` entries shorter than `val_loss`.
# Pad exactly that many entries in total so both curves have the same length
# and share an x-axis. For odd windows this equals the former symmetric
# padding of int(average_over/2) per side; for even windows the symmetric
# padding produced one entry too many and shifted the argmin.
pad_before = (average_over - 1) // 2
pad_after = average_over - 1 - pad_before
smooth_val_loss = np.pad(mov_av, (pad_before, pad_after), mode='edge')
epo = np.argmin(smooth_val_loss)


#### determine model performance for methods comparison 

In [None]:
# Fit `model` on the combined train+validation data for 2 epochs, passing the
# test fold as validation data so that 'val_loss' of this run is the test loss.
# NOTE(review): `epo` (the epoch chosen from the smoothed validation loss) is
# not used here, and `model` is whichever model is currently bound rather than
# a freshly built one -- confirm that both are intended.
hist = model.fit(X_train, y_train, epochs=2, shuffle=True, batch_size=64, validation_data=(X_test, y_test))
test_loss = hist.history['val_loss']

#### plot performance 

In [None]:
# Overlay the raw validation loss, its smoothed version and the test loss in
# one figure.
fig, ax = plt.subplots(figsize=(16, 8))
curves = (
    (val_loss, 'validation loss'),
    (smooth_val_loss, 'smooth validation loss'),
    (test_loss, 'test loss'),
)
for values, curve_label in curves:
    ax.plot(values, label=curve_label)
ax.legend()
plt.show()