In [1]:
import ast
import csv
import json

from tqdm import tqdm

In [49]:
def load_tsv_nearest(path):
    """Read a tab-separated status file into a list of label/score dicts.

    The first line is treated as a header and discarded. For every remaining
    non-blank row, column 0 is stored under ``'label'`` and column 1 under
    ``'score'``; both are kept as the raw strings read from the file.

    Args:
        path: Location of the TSV file (decoded as UTF-8).

    Returns:
        list[dict]: One ``{'label': ..., 'score': ...}`` dict per non-blank
        data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' leaves line-ending handling to the csv module, as the csv
    # documentation requires for file objects handed to csv.reader.
    with open(path, encoding='utf-8', newline='') as f:
        rd = csv.reader(f, delimiter='\t')
        next(rd, None)  # Skip header
        # csv.reader yields [] for a blank line (e.g. a trailing empty line);
        # skip those rather than failing on row[0].
        return [{'label': row[0], 'score': row[1]} for row in rd if row]

In [26]:
def convRawToScore(work):
    """Convert one raw row into a ``(label, score)`` pair.

    Args:
        work: Mapping with a ``'label'`` entry and a ``'score'`` entry. The
            score entry is the text form of a dict that has a ``'score'``
            key, e.g. ``"{'pos': 0, 'len': 0, 'end': 0, 'score': 0}"``.

    Returns:
        tuple: ``(label, score)``. The label ``'01'`` is mapped to ``0`` and
        ``'10'`` to ``1``; any other label is returned unchanged. ``score``
        is the value stored under the ``'score'`` key of the parsed dict.

    Raises:
        KeyError: If ``work`` lacks ``'label'``/``'score'`` or the parsed
            dict has no ``'score'`` key.
        ValueError: If the score text can be parsed neither as a Python
            literal nor as JSON (``json.JSONDecodeError`` is a subclass).
    """
    label = work['label']
    raw = work['score']
    if label == '01':
        label = 0
    elif label == '10':
        label = 1
    try:
        # The text is a Python dict repr, so parse it as a Python literal.
        # Unlike quote replacement this copes with None/True/False and with
        # apostrophes inside string values.
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Fall back to the previous behaviour so JSON-only text
        # (true/false/null) keeps working.
        parsed = json.loads(raw.replace("'", '"'))
    return (label, parsed['score'])

In [44]:
def getPNScore(works):
    pos_score = 0
    neg_score = 0
    pos_len = 0
    neg_len = 0

    for work in works:
        label, score = convRawToScore(work)
        if label == 0:
            neg_score += score
            neg_len += 1
        elif label == 1:
            pos_score += score
            pos_len += 1
    return {'pos': pos_score, 'neg':neg_score, 'pos_len': pos_len, 'neg_len': neg_len}

In [63]:
def getAVGScore(TARGET, base_dir='./tsv/nearest-10'):
    """Print the summed and mean pos/neg scores of one target over all splits.

    Loads ``{base_dir}/{TARGET}/{split}.tsv_status.tsv`` for the ``train``,
    ``test`` and ``dev`` splits, aggregates each with ``getPNScore`` and
    prints the combined totals together with the per-row averages.

    Args:
        TARGET: Identifier interpolated into the directory path.
        base_dir: Directory that holds one sub-directory per target.
            Defaults to the location that was previously hard-coded.

    Returns:
        None. The summary is written to stdout.

    Raises:
        OSError: If one of the three split files cannot be opened.
    """
    splits = ('train', 'test', 'dev')
    # Load every split before scoring, in the same order as before.
    rows_by_split = [
        load_tsv_nearest(f'{base_dir}/{TARGET}/{split}.tsv_status.tsv')
        for split in splits
    ]
    scores = [getPNScore(rows) for rows in rows_by_split]

    total_pos = sum(s['pos'] for s in scores)
    total_neg = sum(s['neg'] for s in scores)
    total_pos_len = sum(s['pos_len'] for s in scores)
    total_neg_len = sum(s['neg_len'] for s in scores)

    # A target may have no rows of one class; report nan for that average
    # instead of raising ZeroDivisionError.
    pos_avg = total_pos / total_pos_len if total_pos_len else float('nan')
    neg_avg = total_neg / total_neg_len if total_neg_len else float('nan')

    print(f'TARGET: {TARGET}\n\tpos:{total_pos}\n\t  pos avg:{pos_avg:.2f}\n\tneg:{total_neg}\n\t  neg avg:{neg_avg:.2f}')

In [28]:
# Show one raw row to inspect the format of its 'label' and 'score' fields.
# NOTE(review): `works` is not assigned in any cell visible here, so this cell
# relies on kernel state from elsewhere - confirm it survives Restart & Run All.
print(works[30])

{'label': '01', 'score': "{'pos': 0, 'len': 0, 'end': 0, 'score': 0}"}


In [37]:
# Aggregate the pos/neg scores of `works` and print the resulting dict.
# NOTE(review): `works` is not assigned in any cell visible here. The saved
# output of this cell contains a 'len' key, which the getPNScore defined in
# this notebook does not return, so the output looks stale - re-run to confirm.
print(getPNScore(works))

	pos: 900
	neg: 326
	pos/avg: 2.4390243902439024
	neg/avg: 0.8834688346883469
{'pos': 900, 'neg': 326, 'len': 369}


In [64]:
# Target identifiers to summarise. Each one is interpolated into the
# ./tsv/nearest-10/{TARGET}/ directory path by getAVGScore.
# NOTE(review): what the identifiers denote is not shown in this notebook.
targets = [1, 2, 3, 4, 5, 20, 39, 40, 42, 69, 70, 71, 73, 74, 75, 77, 79, 80, 81, 83, 84, 87, 90, 96, 120, 121, 122, 126, 128, 199, 200, 203, 204, 214, 259, 260, 281, 284, 291]
# Print the pos/neg totals and averages for every target, one block per target.
for target in targets:
    getAVGScore(target)

TARGET: 1
	pos:23182
	  pos avg:7.11
	neg:16000
	  neg avg:4.91
TARGET: 2
	pos:20810
	  pos avg:7.00
	neg:14430
	  neg avg:4.86
TARGET: 3
	pos:3369
	  pos avg:1.83
	neg:759
	  neg avg:0.41
TARGET: 4
	pos:5754
	  pos avg:2.49
	neg:1934
	  neg avg:0.84
TARGET: 5
	pos:19998
	  pos avg:13.48
	neg:6209
	  neg avg:4.19
TARGET: 20
	pos:10429
	  pos avg:3.43
	neg:4464
	  neg avg:1.47
TARGET: 39
	pos:56059
	  pos avg:12.77
	neg:11415
	  neg avg:2.60
TARGET: 40
	pos:29466
	  pos avg:5.44
	neg:10272
	  neg avg:1.90
TARGET: 42
	pos:11146
	  pos avg:6.02
	neg:3378
	  neg avg:1.83
TARGET: 69
	pos:3385
	  pos avg:0.58
	neg:2819
	  neg avg:0.48
TARGET: 70
	pos:84043
	  pos avg:8.95
	neg:25844
	  neg avg:2.75
TARGET: 71
	pos:7577
	  pos avg:1.91
	neg:3551
	  neg avg:0.90
TARGET: 73
	pos:7784
	  pos avg:2.63
	neg:5728
	  neg avg:1.93
TARGET: 74
	pos:1950
	  pos avg:0.78
	neg:1103
	  neg avg:0.44
TARGET: 75
	pos:1642
	  pos avg:0.67
	neg:1294
	  neg avg:0.53
TARGET: 77
	pos:2052
	  pos avg:0.97
	neg:1955