In [None]:
import sys
from sklearn import preprocessing, model_selection
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import argparse
from argparse import RawTextHelpFormatter, RawDescriptionHelpFormatter
import os
from scipy import stats
from tqdm import tqdm



In [None]:
# Colab-only step: mount Google Drive at /content/drive. The dataset folder
# referenced by `path` further down lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def rmse(y_true, y_pred):
    """Root-mean-square error between two NumPy arrays.

    Both arrays are flattened before subtraction; the summed squared error is
    divided by the length of the first axis of ``y_true``.

    Args:
        y_true: ndarray of reference values.
        y_pred: ndarray of predictions with the same number of elements.

    Returns:
        The RMSE as a NumPy float.
    """
    residual = y_true.ravel() - y_pred.ravel()
    n_samples = y_true.shape[0]
    return np.sqrt(np.square(residual).sum() / n_samples)


def pcc(y_true, y_pred):
    """Pearson correlation coefficient of two 1-D sequences.

    Thin wrapper around ``scipy.stats.pearsonr`` that discards the p-value.

    Args:
        y_true: 1-D array-like of reference values.
        y_pred: 1-D array-like of predictions, same length as ``y_true``.

    Returns:
        The correlation coefficient (first element of the pearsonr result).
    """
    coefficient, _p_value = stats.pearsonr(y_true, y_pred)
    return coefficient


def pcc_rmse(y_true, y_pred):
    """NumPy version of the combined correlation/RMSE score.

    Computes ``(1 - pearson_r) * alpha + rmse * (1 - alpha)`` where ``alpha``
    is read from the module-level variable of that name at call time.

    Args:
        y_true: 1-D ndarray of reference values.
        y_pred: 1-D ndarray of predictions, same length as ``y_true``.

    Returns:
        The weighted score as a NumPy float (lower is better).
    """
    squared_error = np.square(y_true.ravel() - y_pred.ravel())
    rmse_term = np.sqrt(np.sum(squared_error) / y_true.shape[0])

    pearson_r = stats.pearsonr(y_true, y_pred)[0]
    correlation_penalty = 1 - pearson_r

    # `alpha` is only read here, so no `global` declaration is required.
    return correlation_penalty * alpha + rmse_term * (1 - alpha)


def PCC_RMSE(y_true, y_pred):
    """Keras loss: ``alpha * (1 - pearson) + (1 - alpha) * error``.

    The Pearson term is computed over every element of the batch; the error
    term is ``sqrt(mean(square(y_pred - y_true), axis=-1))``. ``alpha`` is the
    module-level weight, read when the function is called.

    NOTE(review): when the tensors are shaped (batch, 1) -- as the Dense(1)
    head and the ``reshape(-1, 1)`` targets in this notebook suggest -- the
    mean over axis -1 covers a single element, so the error term is the
    per-sample absolute error rather than a batch RMSE. Confirm this is
    intended.
    NOTE(review): the denominator is zero when either tensor is constant
    within the batch, which would make the Pearson term NaN.

    Args:
        y_true: tensor of reference values.
        y_pred: tensor of predictions, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced.
    """
    K = tf.keras.backend

    centered_pred = y_pred - K.mean(y_pred)
    centered_true = y_true - K.mean(y_true)

    spread_pred = K.std(y_pred)
    spread_true = K.std(y_true)

    error_term = K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

    pearson_term = 1.0 - K.mean(centered_pred * centered_true) / (spread_pred * spread_true)

    return alpha * pearson_term + (1 - alpha) * error_term


def RMSE(y_true, y_pred):
    """Keras metric/loss: square root of the mean squared error over the last axis.

    Args:
        y_true: tensor of reference values.
        y_pred: tensor of predictions, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced.
    """
    K = tf.keras.backend
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error, axis=-1))


def PCC(y_true, y_pred):
    """Keras metric: Pearson correlation computed over all elements of the batch.

    Evaluated as ``mean(centered_pred * centered_true) / (std_pred * std_true)``.

    Args:
        y_true: tensor of reference values.
        y_pred: tensor of predictions, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the correlation coefficient.
    """
    K = tf.keras.backend

    centered_pred = y_pred - K.mean(y_pred)
    centered_true = y_true - K.mean(y_true)

    spread_pred = K.std(y_pred)
    spread_true = K.std(y_true)

    covariance = K.mean(centered_pred * centered_true)
    return covariance / (spread_pred * spread_true)

In [None]:
def create_model(input_size, lr=0.0001, maxpool=True, dropout=0.1):
    """Build and compile the CNN regressor.

    Architecture, in order:
      * three Conv2D stages (32, 64, 128 filters; 4x4 kernel, stride 1,
        "valid" padding), each followed by ReLU and, when ``maxpool`` is
        true, a 2x2 stride-2 "same"-padded max-pool;
      * Flatten;
      * three Dense stages (400, 200, 100 units, L2(0.01) kernel
        regulariser), each followed by ReLU, BatchNormalization and Dropout;
      * Dense(1) with L2(0.01), followed by ReLU.

    The model is compiled with SGD (momentum 0.9, decay 1e-6), the
    ``PCC_RMSE`` loss and the ``'mse'`` metric.

    NOTE(review): the ``lr`` / ``decay`` keyword names passed to SGD are
    version-sensitive in tf.keras -- confirm against the installed TensorFlow.

    Args:
        input_size: shape of one input sample, passed as ``input_shape`` to
            the first convolution.
        lr: SGD learning rate.
        maxpool: whether to insert a max-pool after each convolution stage.
        dropout: dropout rate used after each hidden dense stage.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    l2 = tf.keras.regularizers.l2

    model = tf.keras.Sequential()

    # Convolutional feature extractor; only the first layer declares the input shape.
    for stage, filters in enumerate((32, 64, 128)):
        if stage == 0:
            conv = layers.Conv2D(filters, kernel_size=4, strides=1,
                                 padding="valid", input_shape=input_size)
        else:
            conv = layers.Conv2D(filters, 4, 1, padding="valid")
        model.add(conv)
        model.add(layers.Activation("relu"))
        if maxpool:
            model.add(layers.MaxPooling2D(pool_size=2, strides=2, padding='same'))

    model.add(layers.Flatten())

    # Fully connected head.
    for units in (400, 200, 100):
        model.add(layers.Dense(units, kernel_regularizer=l2(0.01)))
        model.add(layers.Activation("relu"))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout))

    # Single-value regression output, passed through ReLU.
    model.add(layers.Dense(1, kernel_regularizer=l2(0.01)))
    model.add(layers.Activation("relu"))

    sgd = tf.keras.optimizers.SGD(lr=lr, momentum=0.9, decay=1e-6)
    model.compile(optimizer=sgd, loss=PCC_RMSE, metrics=['mse'])

    return model

In [None]:
# Per-sample tensor shape fed to the CNN: each scaled flat feature row is
# reshaped to (reshape[0], reshape[1], reshape[2]), so a row must contain
# 60 * 169 * 1 feature values.
reshape = [60,169,1]

In [None]:
# CSV locations of the three data splits, relative to `path`.
train_file = "train_test_validate_set/train.csv"
val_file = "train_test_validate_set/validate.csv"
test_file = "train_test_validate_set/test.csv"
# Project folder on the mounted Drive; file names are appended to it by plain
# string concatenation, so the trailing slash is required.
path = "/content/drive/Shared drives/Bioinformatics/Term Project/"

In [None]:
# Load the training split.
#
# The previous row-by-row loop had two problems:
#   * it used the DataFrame *index label* (`index == 0`) to detect the first
#     row, which crashes when the first label is not 0 and silently resets
#     Xtrain if label 0 appears later;
#   * it re-allocated Xtrain with np.concatenate on every row (quadratic).
# Column slicing gives the same feature matrix and target list in one step.
MAX_TRAIN_ROWS = 3000  # cap on the number of training rows kept after dropna()

df = pd.read_csv(path + train_file, index_col=0, header=0, names=None).dropna()
df = df[0:MAX_TRAIN_ROWS]

# Features: every column except the first one and the last two.
Xtrain = df.iloc[:, 1:-2].values
# Target: the LAST column.
# NOTE(review): the validation/test cell reads the second-to-last column
# (row.values[-2]) as its target -- confirm which column holds the label.
ytrain = df.iloc[:, -1].tolist()


1599it [08:24,  1.61it/s]

In [None]:
def _load_split(csv_path, target_position=-2):
    """Read one split CSV and return ``(frame, features, targets)``.

    Rows containing NaN are dropped. Features are every column except the
    first one and the last two; targets come from ``target_position``.
    A vectorised slice replaces the former per-row np.concatenate loop,
    which was quadratic in the number of rows and was duplicated for each
    split.
    """
    frame = pd.read_csv(csv_path, index_col=0, header=0, names=None).dropna()
    features = frame.iloc[:, 1:-2].values
    targets = frame.iloc[:, target_position].tolist()
    return frame, features, targets


# NOTE(review): both splits take the SECOND-TO-LAST column as the target,
# whereas the training cell uses the last column (row.values[-1]) --
# confirm which column holds the label.
df, Xval, yval = _load_split(path + val_file)
df, Xtest, ytest = _load_split(path + test_file)

In [None]:
# Fit the feature standardiser on the training and validation rows stacked
# together (the test rows are not used for fitting).
# NOTE(review): validation rows therefore contribute to the scaling statistics.
X_train_val = np.vstack((Xtrain, Xval))
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_val)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Standardise every split with the fitted scaler, then fold each flat feature
# row into the (height, width, channels) layout expected by the CNN.
cnn_shape = (-1, reshape[0], reshape[1], reshape[2])
Xtrain, Xval, Xtest = (
    scaler.transform(split).reshape(cnn_shape)
    for split in (Xtrain, Xval, Xtest)
)

# Targets become column vectors of shape (n_samples, 1).
ytrain, yval, ytest = (
    np.array(labels).reshape(-1, 1)
    for labels in (ytrain, yval, ytest)
)

In [None]:
# --- Hyper-parameters -------------------------------------------------------
alpha = 0.8          # weight of the (1 - Pearson) term in PCC_RMSE / pcc_rmse
lr = 0.001           # SGD learning rate
dropout = 0.1        # dropout rate after each hidden dense stage
maxpool = False      # no max-pooling between convolution stages
batch_size = 128
epochs = 100         # upper bound on training epochs

# --- Early stopping / checkpointing ----------------------------------------
patience = 5         # epochs without sufficient improvement before stopping
delta_loss = 0.001   # minimum drop in validation loss that counts as improvement
model_name = "model3.h5"

# --- Run state --------------------------------------------------------------
log = []                 # one metrics row per finished epoch
stop = [[0, 999.9], ]    # [epoch, validation loss] of each improvement; seeded with a sentinel

model = create_model(tuple(reshape), lr=lr, dropout=dropout, maxpool=maxpool)

In [None]:
def _score(y_ref, y_hat):
    """Return (pcc_rmse, pcc, rmse) of flattened ``y_ref`` against ``y_hat``."""
    flat_ref = y_ref.ravel()
    return pcc_rmse(flat_ref, y_hat), pcc(flat_ref, y_hat), rmse(flat_ref, y_hat)


LOG_COLUMNS = ['epoch', 'loss', 'pcc_train', 'rmse_train',
               'loss_val', 'pcc_val', 'rmse_val',
               'loss_test', 'pcc_test', 'rmse_test']

# Train one epoch at a time so that every split can be scored after each
# epoch and early stopping can be driven by the validation loss.
for e in range(1, epochs + 1):
    model.fit(Xtrain, ytrain, validation_data=(Xval, yval),
              batch_size=batch_size, epochs=1, verbose=1)

    ytrain_pred = model.predict(Xtrain).ravel()
    loss, pcc_train, rmse_train = _score(ytrain, ytrain_pred)

    yval_pred = model.predict(Xval).ravel()
    loss_val, pcc_val, rmse_val = _score(yval, yval_pred)

    ytest_pred = model.predict(Xtest).ravel()
    loss_test, pcc_test, rmse_test = _score(ytest, ytest_pred)

    log.append([e, loss, pcc_train, rmse_train,
                loss_val, pcc_val, rmse_val,
                loss_test, pcc_test, rmse_test])
    # Rebuilt every epoch so `logs` is current even when the loop breaks early.
    logs = pd.DataFrame(log, columns=LOG_COLUMNS)

    print("EPOCH:%d Loss:%.3f RMSE:%.3f PCC:%.3f LOSS_VAL:%.3f RMSE:%.3f PCC:%.3f LOSS_TEST:%.3f RMSE_TEST:%.3f PCC_TEST:%.3f"%
          (e, loss, rmse_train, pcc_train, loss_val, rmse_val, pcc_val, loss_test, rmse_test, pcc_test ))

    best_epoch, best_loss_val = stop[-1]
    if best_loss_val - loss_val >= delta_loss:
        # Validation loss dropped by at least delta_loss: checkpoint and record it.
        checkpoint = path + model_name
        print("Model improve from %.3f to %.3f. Save model to %s."% (best_loss_val, loss_val, checkpoint))
        model.save(checkpoint)
        stop.append([e, loss_val])
    elif e - best_epoch >= patience:
        # No sufficient improvement for `patience` epochs: stop training.
        print("Get best model at epoch = %d." % best_epoch)
        break

EPOCH:1 Loss:6.534 RMSE:6.589 PCC:-0.059 LOSS_VAL:6.661 RMSE:6.718 PCC:-0.104 LOSS_TEST:6.723 RMSE_TEST:6.779 PCC_TEST:-0.143
Model improve from 999.900 to 6.661. Save model to /content/drive/Shared drives/Bioinformatics/Term Project/train_test_validate_set/model2.h5.
EPOCH:2 Loss:6.137 RMSE:6.190 PCC:0.042 LOSS_VAL:6.304 RMSE:6.358 PCC:0.036 LOSS_TEST:6.402 RMSE_TEST:6.456 PCC_TEST:0.033
Model improve from 6.661 to 6.304. Save model to /content/drive/Shared drives/Bioinformatics/Term Project/train_test_validate_set/model2.h5.
EPOCH:3 Loss:5.517 RMSE:5.563 PCC:0.078 LOSS_VAL:5.751 RMSE:5.800 PCC:0.092 LOSS_TEST:5.856 RMSE_TEST:5.907 PCC_TEST:0.111
Model improve from 6.304 to 5.751. Save model to /content/drive/Shared drives/Bioinformatics/Term Project/train_test_validate_set/model2.h5.
EPOCH:4 Loss:4.872 RMSE:4.913 PCC:0.115 LOSS_VAL:5.140 RMSE:5.183 PCC:0.140 LOSS_TEST:5.277 RMSE_TEST:5.322 PCC_TEST:0.171
Model improve from 5.751 to 5.140. Save model to /content/drive/Shared drives/Bi

In [None]:
# Print the per-epoch metrics table and write it as CSV into the project
# folder given by `path`.
print(logs)
logs.to_csv(path + 'logs3.csv')

    epoch      loss  pcc_train  ...  loss_test  pcc_test  rmse_test
0       1  6.533945  -0.059304  ...   6.722670 -0.142783   6.779032
1       2  6.137287   0.041513  ...   6.401539  0.032514   6.456428
2       3  5.516870   0.077695  ...   5.856421  0.111386   5.906601
3       4  4.872295   0.114948  ...   5.277350  0.170964   5.322283
4       5  4.195430   0.127352  ...   4.649406  0.189461   4.688183
5       6  3.581473   0.116952  ...   3.999823  0.196667   4.032111
6       7  3.099241   0.134812  ...   3.422646  0.213167   3.449271
7       8  2.892144   0.138792  ...   3.193854  0.215618   3.218192
8       9  2.561418   0.155774  ...   2.811932  0.193601   2.832190
9      10  2.528632   0.163073  ...   2.772333  0.192377   2.792179
10     11  2.335027   0.156463  ...   2.506166  0.142278   2.522816
11     12  2.491372   0.180521  ...   2.685074  0.196940   2.704084
12     13  2.514297   0.203218  ...   2.688580  0.247268   2.708134
13     14  2.402722   0.233883  ...   2.447474  