In [77]:
import copy;
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;

In [78]:
# activation and loss function;
def func(f,x,y=None):
    if f == 'relu':
        return relu(x);
    if f == 'softmax':
        x = x.T - np.max(x,axis=1).T;
        x = x.T
        # softmax 自变量减去常量等同于分数上下同除相同常量
        # 使得自变量尽量是负数，防止溢出
        return softmax(x);
    if f == 'sigmoid':
        return sigmoid(x);
    if f == 'x':
        return x;
    if f == 'MSE':
        return 1/x.shape[0] * 1/2* np.sum((x - y)**2);
    if f == 'Cross_Entropy':
        res = - np.sum(y*np.log(x),axis=1);
        return 1/x.shape[0] * np.sum(res);
    if f == 'Real_Cross_Entropy':
        res = - np.sum(y*np.log(x)+(1-y)*np.log(1-x),axis=1);
        return 1/x.shape[0] * np.sum(res);
def gra_func(f,x,y=None):
    if f == 'relu':
        return gra_relu(x);
    if f =='softmax':
        return gra_softmax(x);
    if f == 'sigmoid':
        return gra_sigmoid(x);
    if f == 'x':
        return 1;
    if f == 'MSE':
        return (x-y);
    if f == 'Cross_Entropy':
        return -y/x;
    if f == 'Real_Cross_Entropy':
        return -(y/x-(1-y)/(1-x));
def relu(x):
    res = np.abs(x);
    res = (res + x)/2;
    return res;
def softmax(x):
    x = x.T - np.max(x,axis=1).T;
    x = x.T
    out = np.exp(x).T/np.sum(np.exp(x),axis=1).T
    out = out.T
    return out;
def sigmoid(x):
    r1 = 1/(1+np.exp(-relu(x)));
    r2 = np.exp(-relu(-x))/(1+np.exp(-relu(-x)));
    return (r1+r2-1/2);
def gra_relu(x):
    res = np.abs(x);
    res = np.sign(res + x);
    return res;
def gra_softmax(x):
    x = x.T - np.max(x,axis=1).T;
    x = x.T
    out = np.exp(x).T/np.sum(np.exp(x),axis=1).T
    out = out.T
    return (out - out**2);
def gra_sigmoid(x):
    r1 = np.exp(-relu(x))/(1+np.exp(-relu(x)))**2;
    r2 = np.exp(-relu(-x))/(1+np.exp(-relu(-x)))**2;
    return (r1+r2-1/4);
# 训练集随机化及从训练集产生测试集
def random_division(x,y=None,test_rate = 0):
    x = np.array(x);
    # 数据总长度
    n = np.shape(x)[0];
    # 生成下表序列并随即打乱
    sequence = np.array(range(n));
    np.random.shuffle(sequence);
    # 测试集长度
    n_test = int(np.ceil(n * test_rate));
    # 测试集序列与训练集序列
    se_test = sequence[:n_test];
    se_train = sequence[n_test:];
    # 训练集
    train_data = x[se_train];
    # 测试集
    test_data = x[se_test];
    # 有监督学习的情况
    if not type(y) == type(None):
        y = np.array(y);
        train_label = y[se_train];
        test_label = y[se_test];
        if not test_rate == 0:
            return (train_data,train_label,test_data,test_label);
        else:
            return (train_data,train_label);
    # 无监督情况
    if not test_rate == 0:
        return (train_data,test_data);
    else:
        return (train_data);
# label to one-hot
def onehot_label(label):
    max_num = np.max(label);
    res = [];
    for k in range(len(label)):
        resk = np.zeros((max_num+1,));
        resk[label[k]] = 1;
        res.append(resk);
    return np.array(res);
def arg_max(y):
    tar = np.argmax(y,axis=1);
    res = np.zeros_like(y);
    for i in range(y.shape[0]):
        res[i,tar[i]] = 1;
    return res;
def evaluate(y,label,metrics='Accuracy'):
    confusion_matrix = np.dot(label.T,y);
    cm = confusion_matrix;
    # 对角线元素+1 防止溢出
    cm = cm + np.eye(cm.shape[0]);
    N = y.shape[0] + cm.shape[0];
    n = y.shape[1];
    T = np.diagonal(cm);
    A = np.trace(cm)/N;
    Pk = T/np.sum(cm,axis=0);
    P = np.sum(Pk)/n;
    Rk = T/np.sum(cm,axis=1).T;
    R = np.sum(Rk)/n;
    F1 = 2*P*R/(P+R);
    if metrics == 'Accuracy':
        return A;
    if metrics == 'Precision':
        return P;
    if metrics == 'Recall':
        return R;
    if metrics == 'F1':
        return F1;

In [79]:
# 载入数据
names_to_label = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2};
label_to_names = {value: key for key, value in names_to_label.items()}
df = pd.read_csv('iris.data', header=None)
xs = df.iloc[:, :4].values
ts = np.array([names_to_label[name] for name in df.iloc[:, -1]])
ys = np.zeros((ts.shape[0], 3));
ts = ts.reshape(-1,1);
# 标签one hot
ts = onehot_label(ts);
# 样本及标签顺序随机化
(train_data,train_label) = random_division(xs,ts);
# 对数据的归一化处理
max_x = np.max(np.abs(train_data),axis=0);
train_data = train_data / max_x;
flag_loading = 0;

In [80]:
# 网络结构定义
if not flag_loading == 1:
    layers = 5;
    nets = [16,64,256,32,3];
    in_shape = [4] + nets[:-1];
    out_shape = nets;
    act_fun = ['relu','relu','relu','sigmoid','softmax'];
    loss_func = 'Real_Cross_Entropy';
    metrics = 'Accuracy';
    x0 = train_data;
    n = x0.shape[0];
    m = x0.shape[1];
    # 模型中间变量初始化
    w = [];
    b = [];
    x = [];
    y = [];
    delta = [];
    for i in range(layers):
        wk = np.random.randn(in_shape[i],out_shape[i]);
        bk = np.random.randn(1,nets[i]);
        w.append(wk);
        b.append(bk);
        x.append([]);
        y.append([]);
        delta.append([]);
# 保存初始化参数
if flag_loading == 0:
    flag_loading = 1;
    w_init = copy.deepcopy(w);
    b_init = copy.deepcopy(b);
    x_init = copy.deepcopy(x);
    y_init = copy.deepcopy(y);
    delta_init = copy.deepcopy(delta);
# 加载初始化参数
w = copy.deepcopy(w_init);
b = copy.deepcopy(b_init);
x = copy.deepcopy(x_init);
y = copy.deepcopy(y_init);
delta = copy.deepcopy(delta_init);

In [81]:
def adaptive_grandient(rate,dv,v):
    r1 = np.max(np.abs(dv));
    r2 = np.max(np.abs(v));
    res = rate*r2/r1*dv;
    return res;

In [90]:
epoches = 100;
rate = 5e-4;
drate = 2;
res_w = [];res_b = [];res_delta = [];res_dw = [];res_db = [];
loss = float('inf');
min_dloss = 0;
for p in range(epoches):
    x0 = train_data;
    # 正向传播
    # 输入层情况
    x[0] = np.dot(x0,w[0]) + b[0];
    y[0] = func(act_fun[0],x[0]);
    # 隐藏层
    for k in range(1,layers):
        x[k] = np.dot(y[k-1],w[k]) + b[k];
        y[k] = func(act_fun[k],x[k]);
    
    if not type(metrics) == type(None):
        y_pre = arg_max(y[k]);
        evalue = evaluate(y_pre,train_label,metrics=metrics);
    # 反向传播
    # res_w.append([]);res_b.append([]);res_dw.append([]);res_db.append([]);res_delta.append([]);
    # 输出层情况
    delta[k] = gra_func(loss_func,y[k],train_label)*gra_func(act_fun[k],x[k]);
    # delta量的归一化处理
    # delta[k] = delta[k] / np.max(np.abs(delta[k]));
    db = np.sum(delta[k],axis=0).reshape(1,-1);
    dw = np.dot(y[k-1].T,delta[k]);
    # res_w[p].append(copy.deepcopy(w[k]));res_b[p].append(copy.deepcopy(b[k]));res_delta[p].append(copy.deepcopy(delta[k]));res_dw[p].append(copy.deepcopy(dw));res_db[p].append(copy.deepcopy(db));
    
    # b[k] -= rate*db;
    # w[k] -= rate*dw;
    b[k] -= adaptive_grandient(rate,db,b[k]);
    w[k] -= adaptive_grandient(rate,dw,w[k]);
    k -= 1;
    # 隐藏层
    while(k>0):
        delta[k] = np.dot(delta[k+1],w[k+1].T) * gra_func(act_fun[k],x[k]);
        # delta的归一化处理
        # delta[k] = delta[k] / np.max(np.abs(delta[k]));
        db = np.sum(delta[k],axis = 0).reshape(1,-1);
        dw = np.dot(y[k-1].T,delta[k]);
        # res_w[p].append(copy.deepcopy(w[k]));res_b[p].append(copy.deepcopy(b[k]));res_delta[p].append(copy.deepcopy(delta[k]));res_dw[p].append(copy.deepcopy(dw));res_db[p].append(copy.deepcopy(db));

        b[k] -= adaptive_grandient(rate,db,b[k]);
        w[k] -= adaptive_grandient(rate,dw,w[k]);
        # b[k] -= rate*db;
        # w[k] -= rate*dw;
        k -= 1;
    
    res = func(loss_func,y[-1],train_label);
    dloss = loss - res;
    loss = res;
    if dloss < min_dloss:
        rate /= drate;
    print('ep {}: loss: {:.2f}'.format(p,loss),end='');
    if not type(metrics) == type(None):
        print(' {}:{:.2f} '.format(metrics,evalue));
    else:
        print('');

ep 0: loss: 0.53 Accuracy:0.95 
ep 1: loss: 0.58 Accuracy:0.93 
ep 2: loss: 0.54 Accuracy:0.95 
ep 3: loss: 0.52 Accuracy:0.95 
ep 4: loss: 0.54 Accuracy:0.95 
ep 5: loss: 0.53 Accuracy:0.95 
ep 6: loss: 0.52 Accuracy:0.95 
ep 7: loss: 0.52 Accuracy:0.95 
ep 8: loss: 0.52 Accuracy:0.95 
ep 9: loss: 0.52 Accuracy:0.95 
ep 10: loss: 0.52 Accuracy:0.95 
ep 11: loss: 0.52 Accuracy:0.95 
ep 12: loss: 0.52 Accuracy:0.95 
ep 13: loss: 0.52 Accuracy:0.95 
ep 14: loss: 0.52 Accuracy:0.95 
ep 15: loss: 0.52 Accuracy:0.95 
ep 16: loss: 0.52 Accuracy:0.95 
ep 17: loss: 0.52 Accuracy:0.95 
ep 18: loss: 0.52 Accuracy:0.95 
ep 19: loss: 0.52 Accuracy:0.95 
ep 20: loss: 0.52 Accuracy:0.95 
ep 21: loss: 0.52 Accuracy:0.95 
ep 22: loss: 0.52 Accuracy:0.95 
ep 23: loss: 0.52 Accuracy:0.95 
ep 24: loss: 0.52 Accuracy:0.95 
ep 25: loss: 0.52 Accuracy:0.95 
ep 26: loss: 0.52 Accuracy:0.95 
ep 27: loss: 0.52 Accuracy:0.95 
ep 28: loss: 0.52 Accuracy:0.95 
ep 29: loss: 0.52 Accuracy:0.95 
ep 30: loss: 0.52 Ac

In [72]:
data = res_dw;
abs_max = [];
mean_data = [];
med_data = [];
abs_min = [];
for i in range(len(data)):
    abs_max.append([]);
    mean_data.append([]);
    med_data.append([]);
    abs_min.append([]);

    for k in range(len(data[i])-1,-1,-1):
        abs_max[i].append(np.max(np.abs(data[i][k])));
        mean_data[i].append(np.mean(data[i][k]));
        med_data[i].append(np.median(data[i][k]));
        abs_min[i].append(np.min(np.abs(data[i][k])));

In [None]:
data_pd = pd.DataFrame(abs_max);
writer = pd.ExcelWriter('abs_max.xlsx');
data_pd.to_excel(writer,'page_1',float_format='%.6f');
writer.save();
writer.close();

data_pd = pd.DataFrame(mean_data);
writer = pd.ExcelWriter('mean_data.xlsx');
data_pd.to_excel(writer,'page_1',float_format='%.6f');
writer.save();
writer.close();

data_pd = pd.DataFrame(med_data);
writer = pd.ExcelWriter('med_data.xlsx');
data_pd.to_excel(writer,'page_1',float_format='%.6f');
writer.save();
writer.close();

data_pd = pd.DataFrame(abs_min);
writer = pd.ExcelWriter('abs_min.xlsx');
data_pd.to_excel(writer,'page_1',float_format='%.6f');
writer.save();
writer.close();