DATE: 2017, 3, 17
1. 说明：用one-hot编码，利用MXNet设计卷积神经网络（MLP，LeNET，CIFAR 10 NET），对RNA序列实现二分类、多分类，评价指标为mse，准确率，F1。

2. Tricks：
 * 解决序列长短不一问题。程序先从所有样本中选出最长的序列，以它的长度depth作为共同特征矩阵的长和宽，也就是形成${\rm{depth}} \times {\rm{depth}}$的矩阵，长度不够的位置补全零数组，类似于图片的稀疏像素。若depth为4，序列AUC可编码为：
$$ \left[ {\begin{array}{*{20}{c}}
0&1&0&0\\
0&0&1&0\\
0&0&0&1\\
0&0&0&0
\end{array}} \right] $$

 * 但是对于过长的序列会产生过高维度的特征，可以采用香农编码或哈夫曼编码压缩编码解决。

DATE: 2017, 3, 24
1. 用哈夫曼编码暂时不考虑，使用最简单的A=1，,U=2，C=3，G=4来编码，每条样本形成${\rm{100}} \times {\rm{100}}$的矩阵，该种效果与直接使用pse-in-one提取出的特征分类相似；
2. 考虑使用4个字母的连续256种组合编码，与图像的3通道像素相似。暂时无实验价值；
3. 考虑直接将卷积出的抽象特征输出，连接SVM做分类。

In [91]:
# 设计卷积网络
# coding=utf-8

import time

import mxnet as mx
import numpy as np

import logging

# Basic Conv + BN + ReLU factory
def ConvFactory(data, num_filter, kernel, stride=(1, 1), pad=(0, 0), act_type="relu"):
    # there is an optional parameter ```wrokshpace``` may influece convolution performance
    # default, the workspace is set to 256(MB)
    # you may set larger value, but convolution layer only requires its needed but not exactly
    # MXNet will handle reuse of workspace without parallelism conflict
    conv = mx.symbol.Convolution(data=data, workspace=256,
                                 num_filter=num_filter, kernel=kernel, stride=stride, pad=pad)
    bn = mx.symbol.BatchNorm(data=conv)
    act = mx.symbol.Activation(data=bn, act_type=act_type)
    return act


# A Simple Downsampling Factory
def DownsampleFactory(data, ch_3x3):
    # conv 3x3
    conv = ConvFactory(data=data, kernel=(3, 3), stride=(2, 2), num_filter=ch_3x3, pad=(1, 1))
    # pool
    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max')
    # concat
    concat = mx.symbol.Concat(*[conv, pool])
    return concat


# A Simple module
def SimpleFactory(data, ch_1x1, ch_3x3):
    # 1x1
    conv1x1 = ConvFactory(data=data, kernel=(1, 1), pad=(0, 0), num_filter=ch_1x1)
    # 3x3
    conv3x3 = ConvFactory(data=data, kernel=(3, 3), pad=(1, 1), num_filter=ch_3x3)
    # concat
    concat = mx.symbol.Concat(*[conv1x1, conv3x3])
    return concat


def set_cifar10():
    data = mx.symbol.Variable(name="data")
    conv1 = ConvFactory(data=data, kernel=(3, 3), pad=(1, 1), num_filter=96, act_type="relu")
    in3a = SimpleFactory(conv1, 32, 32)
    in3b = SimpleFactory(in3a, 32, 48)
    in3c = DownsampleFactory(in3b, 80)
    in4a = SimpleFactory(in3c, 112, 48)
    in4b = SimpleFactory(in4a, 96, 64)
    in4c = SimpleFactory(in4b, 80, 80)
    in4d = SimpleFactory(in4c, 48, 96)
    in4e = DownsampleFactory(in4d, 96)
    in5a = SimpleFactory(in4e, 176, 160)
    in5b = SimpleFactory(in5a, 176, 160)
    pool = mx.symbol.Pooling(data=in5b, pool_type="avg", kernel=(7, 7), name="global_avg")
    flatten = mx.symbol.Flatten(data=pool)
    fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10)
    softmax = mx.symbol.SoftmaxOutput(name='softmax', data=fc)
    return softmax


# 卷积神经网络
def set_con():
    data = mx.symbol.Variable('data')
    # first conv layer
    mx.sym.Activation
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
    tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
    pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
    pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # first fullc layer
    flatten = mx.sym.Flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
    # second fullc
    fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=42)
    # softmax loss
    lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') 
    return lenet

  chunks = self.iterencode(o, _one_shot=True)


In [104]:
# 载入数据，并进行one-hot编码

import random

def load_data(encode='common'):
    data_list = []
    start = time.clock()
    print '>> loading datasets ... ',
    for line in open('/home01/shixiangwan/deep_learning/protein_location/RNA-all-long-CD-HIT.fasta'):
        if line[0] != '>':
            line = line.strip().replace('A', '1').replace('U', '2').replace('C', '3').replace('G', '4')
            data_list.append(map(int, list(line)))
    depth = len(max(data_list, key=len))
    print 'finished.', time.clock() - start, 'ms'

    print '>> encoding ... ',
    train_set = []
    if encode == 'onehot':
        for elem in range(len(data_list)):
            tmp_mx = mx.nd.one_hot(mx.nd.array(data_list[elem], dtype=np.int32), depth=depth) \
                .asnumpy().reshape(len(data_list[elem]) * depth)
            tmp_mx = np.append(tmp_mx, np.zeros(len(data_list[elem]) * depth))
            train_set.append(list(tmp_mx))
    else:
        for elem in range(len(data_list)):
            tmp_list = data_list[elem]
            tmp_mx = np.append(tmp_list, np.zeros(10000 - len(tmp_list)))
            train_set.append(list(tmp_mx))

    label_set = []
    for line in open("labels_1.txt"):
#         label_set.append(map(float, line.strip().split(',')))   # multi-label
        label_set.append(float(line.strip()))                   # single-label
    print 'finished.', time.clock() - start, 'ms'
    random.seed(100)
    random.shuffle(train_set)
    random.seed(100)
    random.shuffle(label_set)
    return train_set, label_set

train_set, label_set = load_data()


>> loading datasets ...  finished. 15.593175 ms
>> encoding ...  finished. 24.803945 ms


  chunks = self.iterencode(o, _one_shot=True)


In [105]:
# 将数据变成类似图像的矩阵形式

def to4d(data):
    print 'data.shape:', data.shape
    return data.reshape(data.shape[0], 1, 100, 100)

  chunks = self.iterencode(o, _one_shot=True)


In [92]:
# 对已经载入的数据进行深度学习

all_number = len(label_set)
# print all_number
# print np.array(train_set).shape
split_tv = int(0.8 * all_number)
split_tt = int(0.9 * all_number)
train_iter = mx.io.NDArrayIter(mx.nd.array(to4d(np.array(train_set[0:split_tv]))),
                               mx.nd.array(np.array(label_set[0:split_tv])),
                               shuffle=True)
validate_iter = mx.io.NDArrayIter(mx.nd.array(to4d(np.array(train_set[split_tv:split_tt]))),
                                  mx.nd.array(np.array(label_set[split_tv:split_tt])),
                                  shuffle=True)

# train
logging.basicConfig(level=logging.INFO)
model = mx.model.FeedForward(ctx=mx.gpu(1),  # [mx.gpu(i) for i in range(4)]
                             symbol=set_con(),  # set_mlp(), set_con(), set_cifar10()
                             num_epoch=1,
                             learning_rate=0.1,
                             momentum=0.9,
                             wd=0.00001)

batch_size = 100
result = model.fit(X=train_iter,
                   eval_data=validate_iter,
                   batch_end_callback=mx.callback.Speedometer(batch_size, 200)
                   )

data.shape: (11591, 10000)


INFO:root:Start training with [gpu(1)]


data.shape: (1449, 10000)


INFO:root:Epoch[0] Batch [200]	Speed: 17569.56 samples/sec	Train-accuracy=0.938214
INFO:root:Epoch[0] Batch [400]	Speed: 18857.84 samples/sec	Train-accuracy=0.942381
INFO:root:Epoch[0] Batch [600]	Speed: 18664.36 samples/sec	Train-accuracy=0.937857
INFO:root:Epoch[0] Batch [800]	Speed: 18758.87 samples/sec	Train-accuracy=0.940000
INFO:root:Epoch[0] Batch [1000]	Speed: 18666.19 samples/sec	Train-accuracy=0.941310
INFO:root:Epoch[0] Batch [1200]	Speed: 18916.86 samples/sec	Train-accuracy=0.940714
INFO:root:Epoch[0] Batch [1400]	Speed: 18593.39 samples/sec	Train-accuracy=0.939643
INFO:root:Epoch[0] Batch [1600]	Speed: 18838.30 samples/sec	Train-accuracy=0.942619
INFO:root:Epoch[0] Batch [1800]	Speed: 19007.90 samples/sec	Train-accuracy=0.942738
INFO:root:Epoch[0] Batch [2000]	Speed: 18820.30 samples/sec	Train-accuracy=0.945833
INFO:root:Epoch[0] Batch [2200]	Speed: 18596.85 samples/sec	Train-accuracy=0.939762
INFO:root:Epoch[0] Batch [2400]	Speed: 18561.00 samples/sec	Train-accuracy=0.940

In [109]:
# 列出模型中的所有层
internals = model.symbol.get_internals()
internals.list_outputs()

['data',
 'convolution20_weight',
 'convolution20_bias',
 'convolution20_output',
 'activation30_output',
 'pooling20_output',
 'convolution21_weight',
 'convolution21_bias',
 'convolution21_output',
 'activation31_output',
 'pooling21_output',
 'flatten10_output',
 'fullyconnected20_weight',
 'fullyconnected20_bias',
 'fullyconnected20_output',
 'activation32_output',
 'fullyconnected21_weight',
 'fullyconnected21_bias',
 'fullyconnected21_output',
 'softmax_label',
 'softmax_output']

  chunks = self.iterencode(o, _one_shot=True)


In [112]:
# 确定新层
fea_symbol = internals['fullyconnected20_output']
new_model = mx.model.FeedForward(ctx=mx.gpu(1),  # [mx.gpu(i) for i in range(4)]
                             symbol=fea_symbol,
                             numpy_batch_size=1,
                             arg_params=model.arg_params,
                             aux_params=model.aux_params,
                             allow_extra_params=True)

# 提取结果到arff文件
test_set = train_set      # train_set[split_tt:], train_set
test_lbl = label_set      # label_set[split_tt:], label_set

results = []
for i in range(len(test_set)):
    test_val = np.array(test_set[i]).reshape(1, 1, 100, 100)
    prob = new_model.predict(test_val)
    # print len(prob[0])
    results.append(prob[0])

print ">> results' number", len(results)
print ">> test_lbl' number", len(test_lbl)
arff_results = open('lenet.arff', 'w')
arff_results.write('@relation lenet\n')
for i in range(len(results[0])):
    arff_results.write('@attribute Feature' + str(i+1) + ' real\n')
# for i in range(len(test_lbl[0])):
#     arff_results.write('@attribute class' + str(i+1) + ' {0.0,1.0}\n')
arff_results.write('@attribute class {0.0,1.0}\n')
arff_results.write('@data\n')
for i in range(len(results)):
    # line =  ','.join(str(j) for j in results[i]) + ',' + ','.join(str(j) for j in test_lbl[i])
    line =  ','.join(str(j) for j in results[i]) + ',' + str(test_lbl[i])
    arff_results.write(line + '\n')
arff_results.close()



>> results' number 14489
>> test_lbl' number 14489


  chunks = self.iterencode(o, _one_shot=True)


In [114]:
#  随机分割不平衡数据集，生成arff文件
import random

def split_unbalance(file_name, pos_lab, neg_lab):
    pos_list = []
    neg_list = []
    suffix = ''
    for line in open(file_name):
        if line[0] == '@':
            suffix += line
        else:
            if line.strip().split(',')[-1] == pos_lab:
                pos_list.append(line.strip())
            elif line.strip().split(',')[-1] == neg_lab:
                neg_list.append(line.strip())
    pos_len = len(pos_list)
    neg_len = len(neg_list)
    print '>> 正例数（标签1.0）：', pos_len
    print '>> 反例数（标签0.0）：', neg_len
    if pos_len > neg_len:
        new_list = random.sample(pos_list, neg_len)
        return suffix, new_list, neg_list
    elif pos_len < neg_len:
        new_list = random.sample(neg_list, pos_len)
        return suffix, pos_list, new_list
    else:
        return None, None, None

input_file = "lenet.arff"
output_file = "lenet-b.arff"
pos_lab = "1.0"
neg_lab = "0.0"
suffix, one_list, two_list = split_unbalance(input_file, pos_lab, neg_lab)

if None is suffix:
    print 'Blanced Dataset !'
    exit()

one_list.extend(two_list)
random.shuffle(one_list)
results = open(output_file, "w")
results.write(suffix)
for i in range(len(one_list)):
    results.write(one_list[i] + '\n')
results.close()

print 'Finished.'

>> 正例数（标签1.0）： 13271
>> 反例数（标签0.0）： 1218
Finished.


  chunks = self.iterencode(o, _one_shot=True)
