# 데이터셋 불러오기

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm

with open('./dataset.pkl', 'rb') as file: # 데이터셋 파일 불러오기
    dataset = pickle.load(file)

# 데이터셋을 훈련셋과 시험셋으로 나눔
trainset = dataset.iloc[:10000].reset_index(drop=True)
testset = dataset.iloc[10000:].reset_index(drop=True)

print('훈련셋')
display(trainset)

print('시험셋')
display(testset)

훈련셋


Unnamed: 0,data,labels
0,"[76, 161, 58, 138, 123, 75, 3, 93, 59, 60, 81,...",1
1,"[7, 180, 8, 95, 191, 40, 46, 56, 88, 1, 120, 5...",1
2,"[36, 145, 66, 88, 170, 109, 85, 86, 12, 78, 30...",1
3,"[194, 69, 135, 18, 109, 132, 171, 162, 171, 17...",0
4,"[174, 68, 152, 104, 18, 122, 120, 142, 112, 10...",0
...,...,...
9995,"[75, 130, 38, 135, 144, 10, 93, 46, 35, 105, 1...",1
9996,"[47, 173, 43, 87, 128, 30, 71, 7, 36, 79, 112,...",1
9997,"[66, 195, 78, 199, 137, 16, 29, 66, 16, 72, 68...",1
9998,"[113, 121, 14, 188, 169, 11, 42, 47, 80, 39, 1...",1


시험셋


Unnamed: 0,data,labels
0,"[95, 13, 122, 52, 72, 91, 167, 148, 191, 84, 1...",0
1,"[73, 111, 7, 173, 84, 53, 65, 47, 22, 30, 55, ...",1
2,"[109, 110, 169, 18, 79, 85, 128, 118, 183, 91,...",0
3,"[122, 54, 169, 76, 71, 104, 103, 136, 98, 173,...",0
4,"[6, 143, 59, 121, 138, 87, 51, 3, 84, 114, 98,...",1
...,...,...
9995,"[190, 117, 126, 104, 102, 164, 152, 81, 110, 1...",0
9996,"[174, 111, 175, 47, 90, 140, 105, 158, 170, 12...",0
9997,"[156, 1, 86, 31, 40, 140, 195, 151, 185, 95, 1...",0
9998,"[92, 96, 136, 29, 32, 118, 161, 122, 115, 99, ...",0


# 해시함수 정의 (랜덤 프로젝션)

In [2]:
# 해시함수 정의 (랜덤프로젝션)
projector = np.random.randn(100, 20)

train_hashed = trainset['data'].to_list()
train_hashed = np.stack(train_hashed)
train_hashed = np.dot(train_hashed, projector)
train_hashed = (train_hashed >= 0).astype(np.int64)
display(train_hashed)

train_labels = trainset['labels'].to_numpy()
display(train_labels)

test_hashed = testset['data'].apply(lambda x: (np.dot(x, projector) >= 0).astype(np.int64))

array([[1, 1, 0, ..., 0, 1, 0],
       [1, 1, 0, ..., 0, 1, 0],
       [1, 1, 0, ..., 1, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 1, 0],
       [1, 1, 0, ..., 1, 1, 0]])

array([1, 1, 1, ..., 1, 1, 1])

# 시험셋 10000개에 대하여 예측을 수행

In [3]:
predicts = [0 for _ in range(len(test_hashed))]
for idx in tqdm(range(len(test_hashed))):
    input = test_hashed.iloc[idx]
    distances = np.sum(((train_hashed==1) & (input==1)), axis=1)     # 모든 해시된 trainset 포인트들에 대하여 주어진 포인트와 Jaccard 거리 계산
    indices = np.argpartition(distances, -10)[-10:]                # 가장 가까운 10개 포인트의 index들을 구함

    neighbors = train_labels[indices].sum()                        # 가장 가까운 10개 포인트의 레이블을 확인하여 합을 구함 (1이 몇개인지)
    predict = 1 if neighbors >= 5 else 0                           # 1이 5개 이상이면 1, 미만이면 0으로 예측
    predicts[idx] = predict                                        # 예측 레이블 목록에 이번 포인트의 예측 결과를 집어넣음

predicts = np.array(predicts)

  0%|          | 0/10000 [00:00<?, ?it/s]

# Classification 성능 측정

In [4]:
GT = testset['labels'].to_numpy()           # Ground Truth
TP = ((predicts == 1) & (GT == 1)).sum()    # True Positive
FP = ((predicts == 1) & (GT == 0)).sum()    # False Positive
TN = ((predicts == 0) & (GT == 0)).sum()    # True Negative
FN = ((predicts == 0) & (GT == 1)).sum()    # False Negative

accuracy = (TP+TN) / (TP+FP+TN+FN)
precision = TP / (TP+FP)
recall = TP / (TP+FN)
f1 = 2 * precision * recall / (precision + recall)

print(f'{"Accuracy":10s}: {accuracy*100:3.2f}%')
print(f'{"Precision":10s}: {precision*100:3.2f}%')
print(f'{"Recall":10s}: {recall*100:3.2f}%')
print(f'{"F1 score":10s}: {f1:3.2f}')

Accuracy  : 95.94%
Precision : 92.48%
Recall    : 100.00%
F1 score  : 0.96
