In [80]:
import os
os.getcwd()

'/home/woosung/tensorflow/temp/SIW/Binding_Motif'

In [21]:
%cd /home/woosung/tensorflow/temp/SIW/Binding_Motif

/home/woosung/tensorflow/temp/SIW/Binding_Motif


In [22]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization, Conv1D, Conv2D, Dense, \
Dropout, Embedding, Flatten, Input, MaxPool1D, MaxPool2D, Reshape, ReLU, LeakyReLU, Softmax
from tensorflow.keras import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam, SGD
from sklearn.model_selection import train_test_split

In [3]:
# CHANGE THE PATH

df = pd.read_csv('/home/woosung/tensorflow/temp/SIW/Binding_Motif/data_set', index_col=0)
df.columns = ['sequence', 'class']
df

Unnamed: 0,sequence,class
0,AGAGAA,B
1,AGAAGA,B
2,CAGAGA,B
3,AGGAAG,B
4,AGGCAG,B
...,...,...
4149,NTGTGC,N
4150,GCTGCN,N
4151,CTGCNN,N
4152,GCNNNN,N


In [84]:
df.shape

(4154, 2)

## A. Configurations

### constants

In [23]:
# define some preliminaries

SEQCHAR = 'ACGNT' # N for not known or just padding
CLS = 'NB'
MAX_LEN = 6

### Model configurations (dictionary)

In [53]:
# model configurations (hyperparameters)

model_config = dict()
model_config['conv2d_dim'] = 32
model_config['kernel_height'] = 5
model_config['kernel_width'] = len(SEQCHAR) #kernel_width는 한 방향으로 진행하기 위해 nucleotide 종류로 통일
model_config['num_cls'] = len(CLS)
model_config['output_dim'] = 2


### Training hyperparameters

In [25]:
# training hyperparameters

EPOCHS = 100
LR = 0.001

## B. Functions for preprocessing

In [7]:
# function for padding sequences (not directly used in the main function)

def _pad_sequence(sequence, max_length=MAX_LEN):

    dummy_seq = 'N'*(max_length - len(sequence))
    padded_seq = sequence + dummy_seq
    
    return padded_seq

In [8]:
# function for sequence preprocessing

def preproc_sequence(sequence_data, max_length, onehot=False, SEQCHAR=SEQCHAR): # sequence_data = df['sequence'] or df['sequence'].tolist()
    
    if isinstance(sequence_data, pd.core.series.Series):
        X = sequence_data.tolist()
    elif isinstance(sequence_data, list):
        pass
    else:
        raise TypeError("pandas series or list are only allowed types for sequence_data")
        
    # 'N' padding
    X = [_pad_sequence(seq, max_length) for seq in X]
    X_int = [[SEQCHAR.index(s) for s in seq] for seq in X]
    
    if onehot:
        X_onehot = np.eye(len(SEQCHAR))[X_int]
        return X_onehot # (num_data, MAX_LEN, len(SEQCHAR))
    else:
        return np.array(X_int) # (num_data, MAX_LEN)

In [9]:
# function for label preprocessing

def preproc_class(class_data, CLS=CLS): # class_data = df['class']
    
    y_cls = [CLS.index(y) for y in class_data]
    y_cls = np.array(y_cls, dtype=np.float64)
    
    return y_cls

## C. Model classes

In [54]:
# Example model class: for classification task using Embedding Layer

class ModelEmbedCls(Model):
    
    def __init__(self, input_dim, output_dim, input_length, 
                 conv2d_dim, kernel_height, kernel_width=len(SEQCHAR), max_length=MAX_LEN, num_cls=len(CLS), 
                 **kwargs):
        super(ModelEmbedCls, self).__init__()
        
        self.embed = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length)
        self.conv2d = Conv2D(32, (5, 5), padding='same', strides=1, input_shape=(None, 6, 5))
        self.bnrm = BatchNormalization()
        self.drop = Dropout(0.2)
        self.pool = MaxPool2D(pool_size = (2, 2), padding = 'same')
        self.lrelu = LeakyReLU()
        self.flatten = Flatten()
        self.dense = Dense(5)
        self.softmax = Softmax()
        
    def call(self, inputs):
        
        x = self.embed(inputs)
        x = tf.expand_dims(x, axis=-1)

        x = self.conv2d(x)
        x = self.bnrm(x)
        x = self.lrelu(x)
        x = self.pool(x)
        x = self.drop(x)
       
        #MLP
        x = self.flatten(x)
        x = self.dense(x)
        
        output = self.softmax(x)
        
        return output

## D. Train & Evaluation Functions

In [11]:
# function for saving the model

def save_model(path, model):
    model.save(path)

In [12]:
# function for evaluating the model

def evaluate_model(model, test_X, test_y):
    loss, acc = model.evaluate(test_X, test_y)
    print("loss, acc: ", loss, "{:.2f}".format(acc))

In [55]:
# function for model training

def train_classifier(df, model_config, learning_rate, epochs, use_onehot=True, save=False):
    input_data = df['sequence']
    output_data = df['class']
    
    X = preproc_sequence(input_data, max_length=MAX_LEN, onehot=use_onehot)
    y = preproc_class(output_data)
    
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
    
    optimizer = SGD(learning_rate=learning_rate)
    
    conv2d_dim = model_config['conv2d_dim']
    kernel_height = model_config['kernel_height']
    kernel_width = model_config['kernel_width']
    num_cls = model_config['num_cls']
    
    output_dim = model_config['output_dim']
    
    if use_onehot:
        model = ModelOneHotCls(conv2d_dim=conv2d_dim, kernel_height=kernel_height, 
                               kernel_width=kernel_width, max_length=MAX_LEN, num_cls=num_cls)
    else:
        model = ModelEmbedCls(input_dim=len(SEQCHAR), output_dim=output_dim, 
                              input_length=MAX_LEN, conv2d_dim=conv2d_dim, 
                              kernel_height=kernel_height, kernel_width=kernel_width, num_cls=num_cls)
        #raise NotImplementedError("ModelEmbedCls is not implemented yet")
    
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    callbacks = tf.keras.callbacks.EarlyStopping(monitor='loss', patience = 5)
    history = model.fit(train_X, train_y, epochs=epochs, callbacks = [callbacks])
    
    path = "/home/woosung/tensorflow/temp/SIW/Binding_Motif/CNN_model"
    if save:
        save_model(path, model)
        
    evaluate_model(model, test_X, test_y)
    
    plt.plot(history.history['accuracy'])
    plt.title('Model accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

## E. Run


In [56]:
# run the train_classifier (using Embedding layer)

train_classifier(df=df, model_config=model_config,
                 learning_rate=LR,
                epochs=EPOCHS,
                use_onehot=False, save=False)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
loss, acc:  0.5562781691551208 0.69


NameError: name 'plt' is not defined