# BinaryClassification_f1score.ipynb
Description   : Single-layer neural network to find f1 score for binary classification

## 이진 분류를 위한 신경망 설계

In [21]:
import numpy as np
import csv
import os

np.random.seed(1024)
# 경로 초기화
os.chdir(r"C:\Users\TitusChoi\Desktop\Library\CodeLion\AI")

In [22]:
# Hyperparameter
RND_MEAN = 0
RND_STD = 0.003
LEARNING_RATE = 0.001

In [23]:
# Main function
def binary_classification_exec(epochs = 10, mb_size = 10, report = 1, tr = 0.8, adjust_ratio = False): # 학습 횟수, 미니 배치 사이즈, 학습 리포트, 훈련 비율(데이터 전부 훈련시키지 않고 그 중 훈련 비율만큼 훈련, 나머지는 시험 비율이 된다.), Pulsar 데이터 증폭 T/F
    binary_load_dataset(adjust_ratio) # 데이터 셋 여는 함수
    init_model() # 가중치와 편향 초기화 함수
    train_and_test(epochs, mb_size, report, tr) # 학습 및 신경망 성능 테스트 함수

In [24]:
# Data load
def binary_load_dataset(adjust_ratio):
    # pulsars, stars의 adjust_ratio
    pulsars, stars = [], []

    # Loading datasets
    with open('./datasets/pulsar_stars.csv') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None) # 첫 번째 index 건너뛰고 none으로 반환
        rows = []
        for row in csvreader:
            if row[8] == '1' : pulsars.append(row)
            else:
                stars.append(row)

    # Global Variable : 함수 구축 시 주요하게 사용되는 변수를 확인하기 위한 장점때문에 사용
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 8, 1 # 독립변수의 크기와 종속변수의 크기
    star_cnt, pulsar_cnt = len(stars), len(pulsars)
    
    if adjust_ratio: # pulsar 데이터 증폭 flow
        data = np.zeros([2 * star_cnt, 9])
        data[0:star_cnt,:] = np.asarray(stars, dtype = 'float32')

        for n in range(star_cnt):
            data[star_cnt + n] = np.asarray(pulsars[n % pulsar_cnt], dtype = 'float32')

    else:
        data = np.zeros([star_cnt + pulsar_cnt, 9])
        data[0:star_cnt,:] = np.asarray(stars, dtype = 'float32')
        data[star_cnt:,:] = np.asarray(pulsars, dtype = 'float32')

In [25]:
# Parameter Initialization
def init_model():
    global weight, bias, input_cnt, output_cnt
    weight = np.random.normal(RND_MEAN, RND_STD, [input_cnt, output_cnt])
    bias = np.random.normal(RND_MEAN, RND_STD, [output_cnt])

In [26]:
# Train and Test
def train_and_test(epochs, mb_size, report, tr):
    steps = arrange_data(mb_size, tr)           # 반환하는 값은 미니배치가 몇 덩어리(스텝)으로 쪼개지는지?
    test_x, test_y = get_test_data()               # 테스트 데이터에 대한 독립변수와 종속변수를 얻어내는 함수

    # epochs는 외부 for문으로 돌림
    # 미니배치는 내부 for문으로 돌리기 때문에 시간복잡도 증가
    for epoch in range(epochs):
        losses, accuracies = [], [] # epochs당 손실함수와 정확도 for 평균
        for n in range(steps):
            train_x, train_y = get_train_data(mb_size, n) # 미니배치 사이즈와 스텝의 수만큼 개별값을 받아 학습데이터의 독립, 종속변수 반환
            loss, _ = run_train(train_x, train_y)
            losses.append(loss)
            # accuracies.append(accuracy) 여기서는 accuracy 사용하지 않는다!
        
        # 몇 번의 간격에 맞춰서 출력할 것인지 report에 따라서 다름
        if report > 0 and (epoch + 1) % report == 0:
            result = run_test(test_x, test_y)
            print('Epoch {} : loss = {:5.3f}, accuracy = {:5.3f}, precision={:5.3f}, recall={:5.3f}, f1={:5.3f}'\
                .format(epoch + 1, np.mean(losses), result[0], result[1], result[2], result[3]))
    
    final_result = run_test(test_x, test_y)
    print('\nFinal Test : accuracy = {:5.3f}, precision={:5.3f}, recall={:5.3f}, f1={:5.3f}'\
                .format(final_result[0], final_result[1], final_result[2], final_result[3]))

In [27]:
# Arrange data
def arrange_data(mb_size, tr):
    global data, shuffle_map, test_begin_index
    shuffle_map = np.arange(data.shape[0])
    np.random.shuffle(shuffle_map)
    # 미니 배치 스텝 수를 구하는 과정
    steps = int(data.shape[0] * tr) // mb_size

    # Search boundary line
    test_begin_index = steps * mb_size
    return steps

In [28]:
# Test data
def get_test_data():
    global data, shuffle_map, test_begin_index, output_cnt
    test_data = data[shuffle_map[test_begin_index:]]
    return test_data[:, :-output_cnt], test_data[:, -output_cnt:]

In [29]:
# Getting training data
def get_train_data(mb_size, nth):
    global data, shuffle_map, test_begin_index, output_cnt
    if nth == 0:
        np.random.shuffle(shuffle_map[:test_begin_index]) # 무작위로 특정 인덱스까지 무작위로 섞기
    train_data = data[shuffle_map[mb_size * nth : mb_size * (nth + 1)]]
    return train_data[:, :-output_cnt], train_data[:, -output_cnt:]

In [30]:
# Training data
def run_train(x, y):
    output, aux_nn = forward_neuralnet(x)
    loss, aux_pp = forward_postproc(output, y)
    # 학습 1단계
    accuracy = eval_accuracy(output, y)

    # 학습 2단계
    G_loss = 1.0
    G_output = backprop_postproc(G_loss, aux_pp)
    backprop_neuralnet(G_output, aux_nn)

    return loss, accuracy

In [31]:
# running test
def run_test(x, y):
    output, _ = forward_neuralnet(x)
    accuracy = eval_accuracy(output, y)
    return accuracy

In [32]:
# forward neural network
def forward_neuralnet(x):
    global weight, bias
    output = np.matmul(x, weight) + bias
    # 두 번째 반환 값인 x는 aux_nn으로 반환처리, 역전파 수행시 활용하기 위해 정의
    return output, x

In [33]:
# backpropagation Neural Network : 신경망 역전파 연산 -> 가중치, 편향 값 변화
def backprop_neuralnet(G_output, x):
    global weight, bias
    g_output_w = x.transpose() # 행렬 곱을 위한 transpose

    G_w = np.matmul(g_output_w, G_output)
    G_b = np.sum(G_output, axis = 0) # axis = 0, x축 row 연산

    weight -= LEARNING_RATE * G_w
    bias -= LEARNING_RATE * G_b

In [34]:
# forward propagation postprocessing
def forward_postproc(output, y):
    CEE = sigmoid_cross_entropy_with_logits(y, output)
    loss = np.mean(CEE)

    return loss, [y, output, CEE]

In [35]:
# Activation Functions

# 1. sigmoid function to protect overflow
def sigmoid(x):
    return np.exp(-relu(-x)) / (1.0 + np.exp(-np.abs(x)))

# 2. ReLU function
def relu(x):
    return np.maximum(x, 0)

# 3. sigmoid function to use cross entropy with logits
def sigmoid_cross_entropy_with_logits(z, x):
    return relu(x) - x * z + np.log(1 + np.exp(-np.abs(x))) # 여기서 z 값은 신경망에서 실제 값 y를 의마한다.


# 4. sigmoid derivative function to use cross entropy with logits -> 신경망 구축에서는 연산 과정 자체를 이미 하나로 통합해서 진행했기 때문에 사용하지 않음
def sigmoid_cross_entropy_with_logits_derv(z, x):
    return -z + sigmoid(x)

In [36]:
# Backpropagation postprocessing : 손실함수에 대한 과정을 역전파 하는 단계
def backprop_postproc(G_loss, aux):
    y, output, CEE = aux
    G_loss = 1.0

    g_loss_entropy = 1.0 / np.prod(CEE.shape)
    g_entropy_output = sigmoid_cross_entropy_with_logits_derv(y, output)

    G_entropy = g_loss_entropy * G_loss
    G_output = g_entropy_output * G_entropy

    return G_output

![python image2](https://datascienceplus.com/wp-content/uploads/2018/01/ClassificationMatrix.png)

In [41]:
def eval_accuracy(output, y):
    est_yes = np.greater(output, 0)
    ans_yes = np.greater(y, 0.5)

    # not logical 연산
    est_no = np.logical_not(est_yes)
    ans_no = np.logical_not(ans_yes)

    tp = np.sum(np.logical_and(est_yes, ans_yes))
    fp = np.sum(np.logical_and(est_yes, ans_no))
    tn = np.sum(np.logical_and(est_no, ans_yes))
    fn = np.sum(np.logical_and(est_no, ans_no))

    accuracy    = safe_div(tp + tn, tp + fp + fn + tn)
    precision   = safe_div(tp, tp + fp)
    recall      = safe_div(tp, tp + fn)

    # f1 score 결과
    f1 = 2 * safe_div(recall * precision, recall + precision)
    return [accuracy, precision, recall, f1]

In [38]:
# division 함수 정의
def safe_div(p, q):
    p, q = float(p), float(q)
    if np.abs(q) < 1.0e-20:
        return np.sign(p)
    return p / q

In [44]:
# main 함수 동작 구간
if __name__ == "__main__":
    print('증폭 미적용')
    binary_classification_exec(epochs = 1000, report = 100, mb_size = 10, adjust_ratio=False)

    print('\n증폭 적용')
    binary_classification_exec(epochs = 1000, report = 100, mb_size = 10, adjust_ratio=True)

증폭 미적용
Epoch 100 : loss = 0.122, accuracy = 0.094, precision=0.892, recall=0.082, f1=0.151
Epoch 200 : loss = 0.130, accuracy = 0.094, precision=0.973, recall=0.073, f1=0.136
Epoch 300 : loss = 0.122, accuracy = 0.094, precision=0.881, recall=0.085, f1=0.154
Epoch 400 : loss = 0.116, accuracy = 0.094, precision=0.878, recall=0.084, f1=0.153
Epoch 500 : loss = 0.119, accuracy = 0.094, precision=0.943, recall=0.080, f1=0.147
Epoch 600 : loss = 0.113, accuracy = 0.094, precision=0.958, recall=0.078, f1=0.145
Epoch 700 : loss = 0.114, accuracy = 0.094, precision=0.947, recall=0.081, f1=0.149
Epoch 800 : loss = 0.115, accuracy = 0.094, precision=0.885, recall=0.081, f1=0.149
Epoch 900 : loss = 0.113, accuracy = 0.094, precision=0.970, recall=0.074, f1=0.138
Epoch 1000 : loss = 0.113, accuracy = 0.094, precision=0.976, recall=0.070, f1=0.131

Final Test : accuracy = 0.094, precision=0.976, recall=0.070, f1=0.131

증폭 적용
Epoch 100 : loss = 0.342, accuracy = 0.496, precision=0.971, recall=0.476