### 1. pandas 및 pickle 불러오기

In [90]:
import pandas as pd
import numpy as np
import os
import sklearn.metrics
import pickle as pickle



num_to_label을 pickle 파일에서 불러오려면 아래 셀의 주석을 해제(uncomment)한 뒤 경로를 지정하세요.

num_to_label dict 직접 만들기

In [2]:
# Class index -> relation label; the index is the position in the sequence below.
num_to_label = dict(enumerate((
    'no_relation',
    'org:top_members/employees',
    'org:members',
    'org:product',
    'per:title',
    'org:alternate_names',
    'per:employee_of',
    'org:place_of_headquarters',
    'per:product',
    'org:number_of_employees/members',
    'per:children',
    'per:place_of_residence',
    'per:alternate_names',
    'per:other_family',
    'per:colleagues',
    'per:origin',
    'per:siblings',
    'per:spouse',
    'org:founded',
    'org:political/religious_affiliation',
    'org:member_of',
    'per:parents',
    'org:dissolved',
    'per:schools_attended',
    'per:date_of_death',
    'per:date_of_birth',
    'per:place_of_birth',
    'per:place_of_death',
    'org:founded_by',
    'per:religion',
)))

In [91]:
def klue_re_micro_f1(preds, labels):
    """Return the KLUE-RE micro-averaged F1 score as a percentage.

    The ``no_relation`` class is left out of the set of labels that the
    micro average is computed over.

    Args:
        preds: Predicted label strings; must expose ``.values``.
        labels: Gold label strings; must expose ``.values``.

    Returns:
        ``sklearn.metrics.f1_score(..., average="micro")`` multiplied by 100.
    """
    label_list = [
        'no_relation', 'org:top_members/employees', 'org:members',
        'org:product', 'per:title', 'org:alternate_names',
        'per:employee_of', 'org:place_of_headquarters', 'per:product',
        'org:number_of_employees/members', 'per:children',
        'per:place_of_residence', 'per:alternate_names',
        'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
        'per:spouse', 'org:founded', 'org:political/religious_affiliation',
        'org:member_of', 'per:parents', 'org:dissolved',
        'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
        'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
        'per:religion',
    ]

    # label_list only supplies index positions here; the string -> id
    # conversion itself is done by label_to_num.
    # NOTE(review): assumes the ids from label_to_num follow the order of
    # label_list -- confirm against the pickled mapping.
    skipped_idx = label_list.index("no_relation")
    scored_indices = [idx for idx in range(len(label_list)) if idx != skipped_idx]

    pred_ids = label_to_num(preds.values)
    gold_ids = label_to_num(labels.values)

    score = sklearn.metrics.f1_score(
        gold_ids, pred_ids, average="micro", labels=scored_indices
    )
    return score * 100.0


def label_to_num(label):
    """Translate an iterable of label strings into their numeric ids.

    The string -> id mapping is unpickled from
    ``../code/dict_label_to_num.pkl`` (relative to the current working
    directory) on every call.

    Args:
        label: Iterable of label strings.

    Returns:
        A list with one id per input label, in input order.

    Raises:
        KeyError: If a label is not a key of the unpickled mapping.
    """
    with open('../code/dict_label_to_num.pkl', 'rb') as mapping_file:
        mapping = pickle.load(mapping_file)

    return [mapping[name] for name in label]

In [94]:
def mixmaster(df1, df2, weight1, weight2):
    """Blend two serialized probability vectors into one normalized vector.

    Despite the parameter names, ``df1`` and ``df2`` are strings, each the
    text form of a list of floats such as ``'[0.1, 0.9]'``.

    Args:
        df1: First vector as a bracketed, comma-separated string.
        df2: Second vector in the same format.
        weight1: Multiplier applied to every entry of ``df1``.
        weight2: Multiplier applied to every entry of ``df2``.

    Returns:
        The element-wise mean of the two weighted vectors, rescaled to sum
        to 1 and serialized as ``'[p0, p1, ...]'``.

    Raises:
        ValueError: If an entry cannot be parsed as a float, or if the
            blended vector sums to zero and so cannot be normalized.
    """
    def _parse(text):
        # float() ignores surrounding whitespace, so splitting on a bare
        # comma accepts both '0.1, 0.2' and '0.1,0.2'.
        return np.array([float(tok) for tok in text.strip()[1:-1].split(',')])

    blended = np.mean([_parse(df1) * weight1, _parse(df2) * weight2], axis=0)

    total = blended.sum()
    if total == 0:
        # Dividing would give NaNs, and argmax over NaNs silently picks
        # index 0 downstream; fail loudly instead.
        raise ValueError(
            'cannot normalize: the weighted probabilities sum to zero'
        )

    return '[' + ', '.join(map(str, blended / total)) + ']'

In [95]:
def change_label(x):
    """Return the label of the highest-scoring class in a serialized vector.

    Args:
        x: String form of a list of floats, e.g. ``'[0.1, 0.9]'``, with
            entries separated by ``', '``.

    Returns:
        ``num_to_label`` entry for the position of the largest value.
    """
    scores = np.array([float(token) for token in x[1:-1].split(', ')])
    best_idx = np.argmax(scores)
    return num_to_label[best_idx]

In [105]:
def ensemble(folder_name, file_name1, file_name2, weight1=1, weight2=1):
    """Blend the ``probs`` columns of two prediction CSVs.

    Args:
        folder_name: Directory containing both CSV files.
        file_name1: File name of the first prediction CSV; its other
            columns are carried over into the result.
        file_name2: File name of the second prediction CSV.
        weight1: Weight given to the first file's probabilities.
        weight2: Weight given to the second file's probabilities.

    Returns:
        A copy of the first file's frame whose ``probs`` column holds the
        blended vectors (see ``mixmaster``) and whose ``pred_label`` column
        holds the label chosen from each blended vector.

    Raises:
        ValueError: If the two files do not have the same number of rows.
    """
    ens1 = pd.read_csv(f'{folder_name}/{file_name1}')
    ens2 = pd.read_csv(f'{folder_name}/{file_name2}')
    if len(ens1) != len(ens2):
        raise ValueError(
            f'row count mismatch: {file_name1} has {len(ens1)} rows, '
            f'{file_name2} has {len(ens2)}'
        )

    result = ens1.copy()

    # Pair rows by position. Looking rows up with the value of the `id`
    # column as a positional index only works when ids are exactly 0..n-1.
    # NOTE(review): assumes both files list the same examples in the same
    # order -- not verified here.
    result['probs'] = [
        mixmaster(probs1, probs2, weight1, weight2)
        for probs1, probs2 in zip(ens1['probs'], ens2['probs'])
    ]

    # Bracket assignment always writes a column; attribute assignment would
    # only set a Python attribute if `pred_label` were not already a column.
    result['pred_label'] = result['probs'].apply(change_label)

    return result
    
    

In [122]:
from itertools import combinations

# Score every unordered pair of prediction files with equal weights.
os.makedirs('./ensemble_result', exist_ok=True)
folder_name = './ensemble'
# Keep only CSV files so stray entries in the folder are not fed to read_csv.
file_list = [name for name in os.listdir(folder_name) if name.endswith('.csv')]
count_file = len(file_list)

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores = {}

for (file_1, name_1), (file_2, name_2) in combinations(enumerate(file_list), 2):
    print(file_1 , file_2)
    # [6:-4] drops the first 6 and last 4 characters of each file name.
    # NOTE(review): presumably the 'train_' prefix and '.csv' suffix -- confirm
    # that every input file follows that naming.
    file_name = f'{name_1[6:-4]}_{name_2[6:-4]}'
    ens_result = ensemble(folder_name, name_1, name_2)
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    # Score against the gold labels directly; no need to concatenate frames.
    f1_scores[file_name] = klue_re_micro_f1(ens_result.pred_label, correct.label)

# Highest F1 first.
f1_scores = dict(sorted(f1_scores.items(), key=lambda x: -x[1]))
f1_scores




0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 14
3 15
3 16
3 17
3 18
4 5
4 6
4 7
4 8
4 9
4 10
4 11
4 12
4 13
4 14
4 15
4 16
4 17
4 18
5 6
5 7
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 18
6 7
6 8
6 9
6 10
6 11
6 12
6 13
6 14
6 15
6 16
6 17
6 18
7 8
7 9
7 10
7 11
7 12
7 13
7 14
7 15
7 16
7 17
7 18
8 9
8 10
8 11
8 12
8 13
8 14
8 15
8 16
8 17
8 18
9 10
9 11
9 12
9 13
9 14
9 15
9 16
9 17
9 18
10 11
10 12
10 13
10 14
10 15
10 16
10 17
10 18
11 12
11 13
11 14
11 15
11 16
11 17
11 18
12 13
12 14
12 15
12 16
12 17
12 18
13 14
13 15
13 16
13 17
13 18
14 15
14 16
14 17
14 18
15 16
15 17
15 18
16 17
16 18
17 18


{'15_roberta-large_sub_42': 71.20900015845348,
 '13_sub_35': 71.14052639679852,
 'sub_29_13': 71.13433753458347,
 '14_sub_35': 71.06145251396647,
 'sub_31_sub_35': 70.95943335479717,
 '15_roberta-large_sub_35': 70.92356687898089,
 'sub_31_sub_42': 70.84067253803042,
 '15_split_2_sub_35': 70.69255465525148,
 '14_sub_42': 70.53001277139208,
 '15_roberta-large_13': 70.50294517444496,
 'sub_31_13': 70.48201989288447,
 'sub_42_sub_29': 70.43842682140554,
 'sub_24_sub_42': 70.42117153461352,
 '14_sub_29': 70.4112658025284,
 '14_sub_31': 70.29089175011921,
 '15_roberta-large_sub_31': 70.25236593059937,
 'sub_31_sub_29': 70.22803380641047,
 '15_roberta-large_14': 70.20305367542892,
 'sub_24_14': 70.17937219730943,
 '15_roberta-large_sub_29': 70.1743264659271,
 'sub_24_sub_31': 70.15187849720223,
 'sub_24_sub_35': 70.14034521697049,
 'sub_24_sub_29': 70.12528107934469,
 '14_snu_sub_35': 70.09463722397477,
 'sub_42_13': 70.07789827401862,
 'sub_24_13': 70.0353248348948,
 '15_roberta-large_robert

In [124]:
folder_name = './ensemble'
file_list = os.listdir(folder_name)
count_file = len(file_list)
os.makedirs('./ensemble_result', exist_ok=True)

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores = {}
file_name1 = 'train_15_roberta-large.csv'
file_name2 = 'train_sub_42.csv'

# Sweep the first model's weight over 1..5 with the second fixed at 5.
for weight1 in range(1, 6):

    print(weight1)

    file_name = f'{file_name1[6:-4]}_{weight1}_{file_name2[6:-4]}_5'
    ens_result = ensemble(folder_name, file_name1, file_name2, weight1, 5)
    # Same 'train_' prefix as the second loop, so all result files share
    # one naming scheme.
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    f1_scores[file_name] = klue_re_micro_f1(ens_result.pred_label, correct.label)

# Sweep the second model's weight over 1..5 with the first fixed at 5.
# The (5, 5) combination repeats the last step of the loop above and simply
# overwrites the same dict key and file.
for weight2 in range(1, 6):

    print(weight2)

    file_name = f'{file_name1[6:-4]}_5_{file_name2[6:-4]}_{weight2}'
    ens_result = ensemble(folder_name, file_name1, file_name2, 5, weight2)
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    f1_scores[file_name] = klue_re_micro_f1(ens_result.pred_label, correct.label)

# Highest F1 first.
f1_scores = dict(sorted(f1_scores.items(), key=lambda x: -x[1]))
f1_scores


1
2
3
4
5
1
2
3
4
5


{'15_roberta-large_4_sub_42_5': 71.49465624501515,
 '15_roberta-large_3_sub_42_5': 71.43314139475369,
 '15_roberta-large_5_sub_42_5': 71.20900015845348,
 '15_roberta-large_2_sub_42_5': 71.13385574924035,
 '15_roberta-large_5_sub_42_4': 70.53994316387748,
 '15_roberta-large_1_sub_42_5': 70.4428754813864,
 '15_roberta-large_5_sub_42_3': 70.29998429401601,
 '15_roberta-large_5_sub_42_2': 69.65732087227414,
 '15_roberta-large_5_sub_42_1': 69.07854050711194}

In [108]:
# Re-order the score table from best to worst F1 (ties keep insertion order).
f1_scores = dict(sorted(f1_scores.items(), key=lambda item: item[1], reverse=True))

In [112]:
# Bare expression: the notebook renders the dict as this cell's output.
f1_scores

{'14_5_14_roberta-large_5': 70.74960127591706,
 '14_4_14_roberta-large_5': 70.69186418962204,
 '14_3_14_roberta-large_5': 70.46947604550552,
 '14_5_14_roberta-large_4': 70.40686586141132,
 '14_2_14_roberta-large_5': 70.0095816033216,
 '14_1_14_roberta-large_5': 69.69407265774379,
 '14_5_14_roberta-large_3': 69.48000632211159,
 '14_5_14_roberta-large_2': 68.75491893593578,
 '14_5_14_roberta-large_1': 68.1023720349563}

In [123]:
# Pair one fixed prediction file with every file in the folder (equal weights).
os.makedirs('./ensemble_result', exist_ok=True)
folder_name = './ensemble'
# Keep only CSV files so stray entries in the folder are not fed to read_csv.
file_list = [name for name in os.listdir(folder_name) if name.endswith('.csv')]
count_file = len(file_list)

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores = {}
file_2 = 'train_15_roberta-large_3.8_sub_42_5.csv'

for file_1, name_1 in enumerate(file_list):
    print(file_1 , file_2)
    # [6:-4] drops the first 6 and last 4 characters of each file name.
    # NOTE(review): presumably the 'train_' prefix and '.csv' suffix -- confirm.
    file_name = f'{name_1[6:-4]}_{file_2[6:-4]}'
    ens_result = ensemble(folder_name, name_1, file_2)
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    # Score against the gold labels directly; no need to concatenate frames.
    f1_scores[file_name] = klue_re_micro_f1(ens_result.pred_label, correct.label)

# Highest F1 first.
f1_scores = dict(sorted(f1_scores.items(), key=lambda x: -x[1]))
f1_scores

0 train_15_roberta-large_sub_42.csv
1 train_15_roberta-large_sub_42.csv
2 train_15_roberta-large_sub_42.csv
3 train_15_roberta-large_sub_42.csv
4 train_15_roberta-large_sub_42.csv
5 train_15_roberta-large_sub_42.csv
6 train_15_roberta-large_sub_42.csv
7 train_15_roberta-large_sub_42.csv
8 train_15_roberta-large_sub_42.csv
9 train_15_roberta-large_sub_42.csv
10 train_15_roberta-large_sub_42.csv
11 train_15_roberta-large_sub_42.csv
12 train_15_roberta-large_sub_42.csv
13 train_15_roberta-large_sub_42.csv
14 train_15_roberta-large_sub_42.csv
15 train_15_roberta-large_sub_42.csv
16 train_15_roberta-large_sub_42.csv
17 train_15_roberta-large_sub_42.csv
18 train_15_roberta-large_sub_42.csv
19 train_15_roberta-large_sub_42.csv
20 train_15_roberta-large_sub_42.csv


{'15_roberta-large_sub_42_15_roberta-large_sub_42': 71.20900015845348,
 'sub_41_15_roberta-large_sub_42': 71.20900015845348,
 'sub_29_15_roberta-large_sub_42': 70.96774193548387,
 'sub_42_15_roberta-large_sub_42': 70.85933749399905,
 'sub_24_15_roberta-large_sub_42': 70.77317933910813,
 'sub_35_15_roberta-large_sub_42': 70.53915275994865,
 '14_15_roberta-large_sub_42': 70.50428163653663,
 'sub_20_15_roberta-large_sub_42': 70.46240956275558,
 '13_15_roberta-large_sub_42': 70.46177138531415,
 'sub_31_15_roberta-large_sub_42': 70.3012912482066,
 '15_roberta-large_15_roberta-large_sub_42': 69.62732919254657,
 'roberta_large_checkpoint-8000_15_roberta-large_sub_42': 69.58714150646075,
 '15_split_2_15_roberta-large_sub_42': 69.51515151515152,
 'sub_8_15_roberta-large_sub_42': 69.24300254452926,
 'sub_26_15_roberta-large_sub_42': 69.07134947002056,
 'sub_40_15_roberta-large_sub_42': 69.05972045743329,
 '14_snu_15_roberta-large_sub_42': 69.03914590747331,
 'sub_6_15_roberta-large_sub_42': 68.8

In [126]:
folder_name = './ensemble'
file_list = os.listdir(folder_name)
count_file = len(file_list)
os.makedirs('./ensemble_result', exist_ok=True)

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores = {}
file_name1 = 'train_15_roberta-large.csv'
file_name2 = 'train_sub_42.csv'

# Fine sweep of the first model's weight: 3.2, 3.4, ..., 4.8, with the second
# model's weight fixed at 5.
for weight in range(1, 10):
    # Round so float noise can never leak into the printed value or the
    # file name built from it.
    weight1 = round(3 + weight * 0.2, 1)
    print(weight1)

    file_name = f'{file_name1[6:-4]}_{weight1}_{file_name2[6:-4]}_5'
    ens_result = ensemble(folder_name, file_name1, file_name2, weight1, 5)
    # 'train_' prefix matches the other sweep cells and the name under which
    # the one-vs-all cell reads this result back.
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    f1_scores[file_name] = klue_re_micro_f1(ens_result.pred_label, correct.label)

# Disabled: sweep of weight2 with weight1 fixed at 5.
# for weight2 in range(1,6):

#     print(weight2)

#     file_name = f'{file_name1[6:-4]}_5_{file_name2[6:-4]}_{weight2}'
#     ens_result = ensemble(folder_name,file_name1,file_name2,5,weight2)
#     ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
#     ens_result= pd.concat([ens_result,correct],axis=1)
#     f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

# Highest F1 first.
f1_scores = dict(sorted(f1_scores.items(), key=lambda x: -x[1]))
f1_scores


3.2
3.4
3.6
3.8
4.0
4.2
4.4
4.6
4.8


{'15_roberta-large_3.8_sub_42_5': 71.50374740870674,
 '15_roberta-large_4.0_sub_42_5': 71.49465624501515,
 '15_roberta-large_3.2_sub_42_5': 71.488,
 '15_roberta-large_3.4_sub_42_5': 71.46282973621103,
 '15_roberta-large_3.6_sub_42_5': 71.44911327688129,
 '15_roberta-large_4.2_sub_42_5': 71.30573248407643,
 '15_roberta-large_4.8_sub_42_5': 71.19074044712225,
 '15_roberta-large_4.4_sub_42_5': 71.18105229693211,
 '15_roberta-large_4.6_sub_42_5': 71.163307411522}