In [32]:
import pandas as pd
import numpy as np
def summary_dataframe(dataframe, key_ls=[], target_ls=[], digits=2):
    '''
    sumary_dataframe
    * code by shbaek
    * input
      - dataframe: 입력데이터 | pd.DataFrame
      - key_ls: 키 컬럼명 리스트 | list
      - target_ls: 타겟 컬럼명 리스트 | list
      - digits: 소수점 자리수 | int
    * output
      - summary_df: 요약 데이터 프레임 | pd.dataFrame
    * info itmes
      - column_name: 컬럼명
      - is_key: 키, 타겟 컬럼 여부(key, target)
      - sample: 샘플값
      - column_dtype: 데이터 형태(int, float, object)
      - column_type: 명목/수치형 변수 여부(cat, num)
      - null_perc: 결측치 백분위
      - n_uniques: number of unique values
      - min: 최솟값
      - median: 중위수
      - mean: 평균
      - max: 최댓값
      - mode: 가장 빈번하게 발생하는 값
      - mode_perc: 가장 빈번하게 발생하는 값의 백분위
    '''
    column_name_ls = dataframe.columns  # get column name list
    info_item_ls = ['column_name', 'is_key', 'sample', 'column_dtype', 'column_type', 'null_perc', 'n_uniques', 'min', 'median', 'mean', 'max', 'mode', 'mode_perc']  # info items
    column_info_dict = {info_item: [] for info_item in info_item_ls} # init info dictionary
    n_rows = dataframe.shape[0]  # n_rows
    # key_ls, target_ls validationS
    if (len(key_ls) != 0) & (len(set(key_ls) - set(column_name_ls)) != 0):
        print(f'error: input key name {set(key_ls) - set(column_name_ls)} is not in dataframe')
    if (len(target_ls) != 0) & (len(set(target_ls) - set(column_name_ls)) != 0):
        print(f'error: input target name {set(target_ls) - set(column_name_ls)} is not in dataframe')
    for column_selected in column_name_ls:
        column_name = column_selected
        is_key = 'key' if column_name in key_ls else 'target' if column_name in target_ls else ''
        column_dtype = str(dataframe[column_name].dtypes)
        column_type = 'num' if any(x in column_dtype for x in ['float', 'int']) else 'cat'
        samples = dataframe[column_name].unique()[0]
        null_perc = round(dataframe[column_name].isna().mean()*100, digits)
        n_uniques = dataframe[column_name].nunique()
        min_ = dataframe[column_name].min() if column_type == 'num' else ''
        med_ = dataframe[column_name].median() if column_type =='num' else ''
        mean_ = dataframe[column_name].mean() if column_type =='num' else ''
        max_ = dataframe[column_name].max() if column_type == 'num' else ''
        idx, value = dataframe[column_name].fillna('NA_VALUE').factorize()
        mode = value[np.bincount(idx).argmax()]
        mode_count = np.bincount(idx).max()
        mode_rate = round(mode_count/n_rows*100, digits)
        column_info_dict['column_name'].append(column_name)
        column_info_dict['is_key'].append(is_key)
        column_info_dict['sample'].append(samples)
        column_info_dict['column_dtype'].append(column_dtype)
        column_info_dict['column_type'].append(column_type)
        column_info_dict['null_perc'].append(null_perc)
        column_info_dict['n_uniques'].append(n_uniques)
        column_info_dict['min'].append(min_)
        column_info_dict['median'].append(med_)
        column_info_dict['mean'].append(mean_)
        column_info_dict['max'].append(max_)
        column_info_dict['mode'].append(mode)
        # column_info_dict['mode_count'].append(mode_count)
        column_info_dict['mode_perc'].append(mode_rate)
    summary_df = pd.DataFrame.from_dict(column_info_dict).sort_values(by=['is_key', 'column_type'], ascending=False).reset_index(drop=True)
    print(f'number of rows: {n_rows}')
    return summary_df

In [36]:
y_df.columns

Index(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'],
      dtype='object')

In [34]:
path = '../data/faults.csv'
df = pd.read_csv(path)
y_df = df.iloc[:, -7:]
cnt = y_df.sum()
perc = round(y_df.mean()*100, 2)

In [38]:
summary_dataframe(df, key_ls=[], target_ls=y_df.columns.tolist(), digits=1)

number of rows: 1941


Unnamed: 0,column_name,is_key,sample,column_dtype,column_type,null_perc,n_uniques,min,median,mean,max,mode,mode_perc
0,Pastry,target,1.0,int64,num,0.0,2,0.0,0.0,0.08140134,1.0,0.0,91.9
1,Z_Scratch,target,0.0,int64,num,0.0,2,0.0,0.0,0.09788769,1.0,0.0,90.2
2,K_Scatch,target,0.0,int64,num,0.0,2,0.0,0.0,0.2014426,1.0,0.0,79.9
3,Stains,target,0.0,int64,num,0.0,2,0.0,0.0,0.03709428,1.0,0.0,96.3
4,Dirtiness,target,0.0,int64,num,0.0,2,0.0,0.0,0.02833591,1.0,0.0,97.2
5,Bumps,target,0.0,int64,num,0.0,2,0.0,0.0,0.2071097,1.0,0.0,79.3
6,Other_Faults,target,0.0,int64,num,0.0,2,0.0,0.0,0.3467285,1.0,0.0,65.3
7,X_Minimum,,42.0,int64,num,0.0,962,0.0,435.0,571.136,1705.0,41.0,6.5
8,X_Maximum,,50.0,int64,num,0.0,994,4.0,467.0,617.9645,1713.0,212.0,1.2
9,Y_Minimum,,270900.0,int64,num,0.0,1939,6712.0,1204128.0,1650685.0,12987660.0,1803992.0,0.1


---

In [80]:
import numpy as np
import csv
import time

np.random.seed(1234)
def randomize(): np.random.seed(time.time())


RND_MEAN = 0
RND_STD = 0.0030

LEARNING_RATE = 0.001


def abalone_exec(epoch_count=20, mb_size=10, report=1):
    load_abalone_dataset()
    init_model()
    train_and_test(epoch_count, mb_size, report)

def load_abalone_dataset():
    with open('../../data/chap01/abalone.csv') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)
        rows = []
        for row in csvreader:
            rows.append(row)
            
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 10, 1
    data = np.zeros([len(rows), input_cnt+output_cnt])

    for n, row in enumerate(rows):
        if row[0] == 'I': data[n, 0] = 1
        if row[0] == 'M': data[n, 1] = 1
        if row[0] == 'F': data[n, 2] = 1
        data[n, 3:] = row[1:]


def init_model():
    global weight, bias, input_cnt, output_cnt
    weight = np.random.normal(RND_MEAN, RND_STD,[input_cnt, output_cnt])
    bias = np.zeros([output_cnt])


def train_and_test(epoch_count, mb_size, report):
    step_count = arrange_data(mb_size)
    test_x, test_y = get_test_data()
    
    for epoch in range(epoch_count):
        losses, accs = [], []
        
        for n in range(step_count):
            train_x, train_y = get_train_data(mb_size, n)
            loss, acc = run_train(train_x, train_y)
            losses.append(loss)
            accs.append(acc)
            
        if report > 0 and (epoch+1) % report == 0:
            acc = run_test(test_x, test_y)
            print('Epoch {}: loss={:5.3f}, accuracy={:5.3f}/{:5.3f}'. \
                  format(epoch+1, np.mean(losses), np.mean(accs), acc))
            
    final_acc = run_test(test_x, test_y)
    print('\nFinal Test: final accuracy = {:5.3f}'.format(final_acc))


def arrange_data(mb_size):
    global data, shuffle_map, test_begin_idx
    shuffle_map = np.arange(data.shape[0])
    np.random.shuffle(shuffle_map)
    step_count = int(data.shape[0] * 0.8) // mb_size
    test_begin_idx = step_count * mb_size
    return step_count

def get_test_data():
    global data, shuffle_map, test_begin_idx, output_cnt
    test_data = data[shuffle_map[test_begin_idx:]]
    return test_data[:, :-output_cnt], test_data[:, -output_cnt:]

def get_train_data(mb_size, nth):
    global data, shuffle_map, test_begin_idx, output_cnt
    if nth == 0:
        np.random.shuffle(shuffle_map[:test_begin_idx])
    train_data = data[shuffle_map[mb_size*nth:mb_size*(nth+1)]]
    return train_data[:, :-output_cnt], train_data[:, -output_cnt:]


def run_train(x, y):
    output, aux_nn = forward_neuralnet(x)
    loss, aux_pp = forward_postproc(output, y)
    accuracy = eval_accuracy(output, y)
    
    G_loss = 1.0
    G_output = backprop_postproc(G_loss, aux_pp)
    backprop_neuralnet(G_output, aux_nn)
    
    return loss, accuracy

def run_test(x, y):
    output, _ = forward_neuralnet(x)
    accuracy = eval_accuracy(output, y)
    return accuracy


def forward_neuralnet(x):
    global weight, bias
    output = np.matmul(x, weight) + bias
    return output, x

def backprop_neuralnet(G_output, x):
    global weight, bias
    g_output_w = x.transpose()
    
    G_w = np.matmul(g_output_w, G_output)
    G_b = np.sum(G_output, axis=0)

    weight -= LEARNING_RATE * G_w
    bias -= LEARNING_RATE * G_b


def forward_postproc(output, y):
    diff = output - y
    square = np.square(diff)
    loss = np.mean(square)
    return loss, diff

def backprop_postproc(G_loss, diff):
    shape = diff.shape
    
    g_loss_square = np.ones(shape) / np.prod(shape)
    g_square_diff = 2 * diff
    g_diff_output = 1

    G_square = g_loss_square * G_loss
    G_diff = g_square_diff * G_square
    G_output = g_diff_output * G_diff
    
    return G_output


def eval_accuracy(output, y):
    mdiff = np.mean(np.abs((output - y)/y))
    return 1 - mdiff




def backprop_postproc_oneline(G_loss, diff):  # backprop_postproc() 대신 사용 가능
    return 2 * diff / np.prod(diff.shape)



In [112]:
np.random.seed(1234)

def steel_exec(epoch_count=10, mb_size=10, report=1):
    load_steel_dataset()
    init_model()
    train_and_test(epoch_count, mb_size, report)

def load_steel_dataset():
    with open('../data/faults.csv') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)
        rows = []
        for row in csvreader:
            rows.append(row)
            
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 27, 7
    data = np.asarray(rows, dtype='float32')


def forward_postproc(output, y):
    entropy = softmax_cross_entropy_with_logits(y, output)
    loss = np.mean(entropy) 
    return loss, [y, output, entropy]

def backprop_postproc(G_loss, aux):
    y, output, entropy = aux
    
    g_loss_entropy = 1.0 / np.prod(entropy.shape)
    g_entropy_output = softmax_cross_entropy_with_logits_derv(y, output)
    
    G_entropy = g_loss_entropy * G_loss
    G_output = g_entropy_output * G_entropy
    
    return G_output


def eval_accuracy(output, y):
    estimate = np.argmax(output, axis=1)
    answer = np.argmax(y, axis=1)
    correct = np.equal(estimate, answer)
    
    return np.mean(correct)


def softmax(x):
    max_elem = np.max(x, axis=1)
    diff = (x.transpose() - max_elem).transpose()
    exp = np.exp(diff)
    sum_exp = np.sum(exp, axis=1)
    probs = (exp.transpose() / sum_exp).transpose()
    return probs

def softmax_derv(x, y):
    mb_size, nom_size = x.shape
    derv = np.ndarray([mb_size, nom_size, nom_size])
    for n in range(mb_size):
        for i in range(nom_size):
            for j in range(nom_size):
                derv[n, i, j] = -y[n,i] * y[n,j]
            derv[n, i, i] += y[n,i]
    return derv

def softmax_cross_entropy_with_logits(labels, logits):
    probs = softmax(logits)
    ce1 = - labels * np.log(probs+1.0e-10)
    ce2 = - labels * np.log(probs+1.0e-10) - (1-labels) * np.log(1-probs + 1.0e-10)
#     print(np.sum(ce1, axis=1))
#     print(np.sum(ce2, axis=1))
#     print(np.sum((1-labels) * np.log(1-probs + 1.0e-10), axis=1))
#     print("="*100)

    # return np.sum(np.zeros((ce1.shape)), axis =1)
    return np.sum(ce2, axis=1)
    # return np.sum(ce2, axis=1)

def softmax_cross_entropy_with_logits_derv(labels, logits):
    return softmax(logits) - labels

In [111]:
steel_exec()

Epoch 1: loss=15.984, accuracy=0.306/0.320
Epoch 2: loss=15.509, accuracy=0.326/0.197
Epoch 3: loss=15.984, accuracy=0.306/0.348
Epoch 4: loss=15.004, accuracy=0.348/0.197
Epoch 5: loss=15.286, accuracy=0.336/0.202
Epoch 6: loss=15.390, accuracy=0.332/0.440
Epoch 7: loss=15.509, accuracy=0.326/0.442
Epoch 8: loss=15.628, accuracy=0.321/0.455
Epoch 9: loss=15.360, accuracy=0.333/0.322
Epoch 10: loss=15.316, accuracy=0.335/0.455

Final Test: final accuracy = 0.455


In [113]:
steel_exec()

Epoch 1: loss=31.969, accuracy=0.306/0.320
Epoch 2: loss=31.018, accuracy=0.326/0.197
Epoch 3: loss=31.969, accuracy=0.306/0.348
Epoch 4: loss=30.008, accuracy=0.348/0.197
Epoch 5: loss=30.572, accuracy=0.336/0.202
Epoch 6: loss=30.780, accuracy=0.332/0.440
Epoch 7: loss=31.018, accuracy=0.326/0.442
Epoch 8: loss=31.256, accuracy=0.321/0.455
Epoch 9: loss=30.721, accuracy=0.333/0.322
Epoch 10: loss=30.632, accuracy=0.335/0.455

Final Test: final accuracy = 0.455


In [90]:
steel_exec()

Epoch 1: loss=15.984, accuracy=0.306/0.320
Epoch 2: loss=15.509, accuracy=0.326/0.197
Epoch 3: loss=15.984, accuracy=0.306/0.348
Epoch 4: loss=15.004, accuracy=0.348/0.197
Epoch 5: loss=15.286, accuracy=0.336/0.202
Epoch 6: loss=15.390, accuracy=0.332/0.440
Epoch 7: loss=15.509, accuracy=0.326/0.442
Epoch 8: loss=15.628, accuracy=0.321/0.455
Epoch 9: loss=15.360, accuracy=0.333/0.322
Epoch 10: loss=15.316, accuracy=0.335/0.455

Final Test: final accuracy = 0.455


In [92]:
steel_exec()

Epoch 1: loss=15.984, accuracy=0.306/0.320
Epoch 2: loss=15.509, accuracy=0.326/0.197
Epoch 3: loss=15.984, accuracy=0.306/0.348
Epoch 4: loss=15.004, accuracy=0.348/0.197
Epoch 5: loss=15.286, accuracy=0.336/0.202
Epoch 6: loss=15.390, accuracy=0.332/0.440
Epoch 7: loss=15.509, accuracy=0.326/0.442
Epoch 8: loss=15.628, accuracy=0.321/0.455
Epoch 9: loss=15.360, accuracy=0.333/0.322
Epoch 10: loss=15.316, accuracy=0.335/0.455

Final Test: final accuracy = 0.455
