In [4]:
# %load protein.py

import logging

import mxnet as mx
import numpy as np

import random

# 载入数据
def load_data():
    train_set = []
    label_set = []
    for line in open("RNA-all-SC-PseDNC-General.csv"):
        a_line = ''
        for i in range(34):
            a_line += line + ','
        train_set.append(map(float, a_line.strip(',').split(',')))
    for line in open("labels_42.txt"):
        label_set.append(map(float, line.strip().split(',')))
    return train_set, label_set


# 数据标准化处理
def norm_stat(d):
    return mx.nd.norm(d) / np.sqrt(d.size)


# 多层感知机
def set_mlp():
    data = mx.symbol.Variable('data')
    data = mx.sym.Flatten(data=data)
    fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128)
    act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu")
    fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64)
    act2 = mx.symbol.Activation(data=fc2, name='relu2', act_type="relu")
    fc3 = mx.symbol.FullyConnected(data=act2, name='fc3', num_hidden=3)
    mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
    return mlp


# 卷积神经网络
def set_con():
    data = mx.symbol.Variable('data')
    # first conv layer
    mx.sym.Activation
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
    tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
    pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
    pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # first fullc layer
    flatten = mx.sym.Flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
    # second fullc
    fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=42)
    # softmax loss
    lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
    return lenet


def to4d(data):
    print data.shape
    return data.reshape(data.shape[0], 1, 34, 34)


if __name__ == '__main__':
    # data
    train_set, label_set = load_data()
    all_number = len(label_set)
    split_tv = int(0.8*all_number)
    split_tt = int(0.9*all_number)
    train_iter = mx.io.NDArrayIter(mx.nd.array(to4d(np.array(train_set[0:split_tv]))),
                                  mx.nd.array(np.array(label_set[0:split_tv])),
                                  shuffle=True)
    validate_iter = mx.io.NDArrayIter(mx.nd.array(to4d(np.array(train_set[split_tv:split_tt]))),
                                   mx.nd.array(np.array(label_set[split_tv:split_tt])),
                                   shuffle=True)

    # train
    logging.basicConfig(level=logging.INFO)
    model = mx.model.FeedForward(ctx=mx.gpu(1),
                                 symbol=set_con(), # set_mlp(), set_con()
                                 num_epoch=2,
                                 learning_rate=0.1,
                                 momentum=0.9,
                                 wd=0.00001)

    batch_size = 100
    result = model.fit(X = train_iter,
                       eval_data = validate_iter,
                       batch_end_callback = mx.callback.Speedometer(batch_size, 200)
    )
    

(11591, 1156)


  self.initializer(k, v)
INFO:root:Start training with [gpu(1)]


(1449, 1156)


INFO:root:Epoch[0] Batch [200]	Speed: 54671.71 samples/sec	Train-accuracy=0.939881
INFO:root:Epoch[0] Batch [400]	Speed: 64235.14 samples/sec	Train-accuracy=0.941786
INFO:root:Epoch[0] Batch [600]	Speed: 30336.91 samples/sec	Train-accuracy=0.941905
INFO:root:Epoch[0] Batch [800]	Speed: 69649.97 samples/sec	Train-accuracy=0.938095
INFO:root:Epoch[0] Batch [1000]	Speed: 71426.57 samples/sec	Train-accuracy=0.943095
INFO:root:Epoch[0] Batch [1200]	Speed: 70154.76 samples/sec	Train-accuracy=0.942381
INFO:root:Epoch[0] Batch [1400]	Speed: 70809.24 samples/sec	Train-accuracy=0.940000
INFO:root:Epoch[0] Batch [1600]	Speed: 71220.27 samples/sec	Train-accuracy=0.941071
INFO:root:Epoch[0] Batch [1800]	Speed: 71190.83 samples/sec	Train-accuracy=0.941548
INFO:root:Epoch[0] Batch [2000]	Speed: 70773.16 samples/sec	Train-accuracy=0.940833
INFO:root:Epoch[0] Batch [2200]	Speed: 70800.70 samples/sec	Train-accuracy=0.937857
INFO:root:Epoch[0] Batch [2400]	Speed: 71015.93 samples/sec	Train-accuracy=0.941

In [5]:
# 分析结果

test_set = train_set[split_tt:]
test_lbl = label_set[split_tt:]

count = 0
r_num = 0
results = open('results.csv', 'w')
for i in range(len(test_set)):
    test_val = np.array(test_set[i]).reshape(1, 1, 34, 34)
    prob = model.predict(test_val)
    predict =  ','.join(str(j) for j in prob[0])
    origin = ','.join(str(j) for j in test_lbl[i])
    for j in range(len(prob[0])):
        p_val = 0
        if prob[0][j] >= 0.5:
            p_val = 1
        if p_val == test_lbl[i][j]:
            r_num += 1
    results.write(predict + '\n' + origin + '\n\n')
    count += 1
#     print predict
#     print origin
#     print
#     if count == 20:
#         break
results.close()

# performance matrics
test_iter = mx.io.NDArrayIter(mx.nd.array(to4d(np.array(test_set))),
                              mx.nd.array(np.array(test_lbl)),
                              shuffle=True)
f1, mse, acc = model.score(test_iter, ['f1', 'mse', 'acc'])
recall = (acc * f1) / (2*acc - f1)
print '----------------------------------------------------------------------'
print 'f1 rate\t\t' + 'precision\t\t' + 'recall\t\t' + 'mse'
print f1, '\t\t', acc, '\t\t', recall, '\t\t', mse

print '\nright/all/rate:', r_num, count*len(label_set[0]), float(r_num)/(count*len(label_set[0]))

(1449, 1156)
----------------------------------------------------------------------
f1 rate		precision		recall		mse
0.0 		0.964918334484 		0.0 		0.0350816661738

right/all/rate: 58723 60858 0.964918334484


  chunks = self.iterencode(o, _one_shot=True)


推测：

在更复杂的蛋白质二级结构预测方面，卷积神经网络应该有着更好的效果。