### 1. pandas 및 pickle 불러오기

In [1]:
import pandas as pd
import numpy as np
import os
import sklearn.metrics
import pickle as pickle



num_to_label pickle 에서 불러오려면 아래 셀 uncomment 후 경로 지정

num_to_label dict 직접 만들기

In [2]:
num_to_label = {0: 'no_relation', 1: 'org:top_members/employees', 2: 'org:members', 3: 'org:product', 4: 'per:title', 5: 'org:alternate_names', 6: 'per:employee_of', 7: 'org:place_of_headquarters', 8: 'per:product', 9: 'org:number_of_employees/members', 10: 'per:children', 11: 'per:place_of_residence', 12: 'per:alternate_names', 13: 'per:other_family', 14: 'per:colleagues', 15: 'per:origin', 16: 'per:siblings', 17: 'per:spouse', 18: 'org:founded', 19: 'org:political/religious_affiliation', 20: 'org:member_of', 21: 'per:parents', 22: 'org:dissolved', 23: 'per:schools_attended', 24: 'per:date_of_death', 25: 'per:date_of_birth', 26: 'per:place_of_birth', 27: 'per:place_of_death', 28: 'org:founded_by', 29: 'per:religion'}

In [3]:
def klue_re_micro_f1(preds, labels):
    """KLUE-RE micro f1 (except no_relation)"""
    label_list = ['no_relation', 'org:top_members/employees', 'org:members',
       'org:product', 'per:title', 'org:alternate_names',
       'per:employee_of', 'org:place_of_headquarters', 'per:product',
       'org:number_of_employees/members', 'per:children',
       'per:place_of_residence', 'per:alternate_names',
       'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
       'per:spouse', 'org:founded', 'org:political/religious_affiliation',
       'org:member_of', 'per:parents', 'org:dissolved',
       'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
       'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
       'per:religion']
    no_relation_label_idx = label_list.index("no_relation")
    preds = label_to_num(preds.values)
    labels = label_to_num(labels.values)

    label_indices = list(range(len(label_list)))
    label_indices.remove(no_relation_label_idx)
    return sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices) * 100.0


def label_to_num(label):
  num_label = []
  with open('../code/dict_label_to_num.pkl', 'rb') as f:
    dict_label_to_num = pickle.load(f)
  for v in label:
    num_label.append(dict_label_to_num[v])
  
  return num_label

In [4]:
def mixmaster(df1, df2, weight1, weight2):
    result = []
    df1= np.array(list(map(float, df1[1:-1].split(', ')))) * weight1
    df2= np.array(list(map(float, df2[1:-1].split(', ')))) * weight2
    result =np.mean([df1, df2], axis=0)
    result = result/result.sum()
    result = '[' + ', '.join(map(str, result)) + ']'
    return result

In [5]:
def change_label(x):
    x= np.array(list(map(float, x[1:-1].split(', '))))
    return num_to_label[np.argmax(x)]

In [6]:
def ensemble(folder_name, file_name1, file_name2, weight1 = 1, weight2 = 1):
    ens1 = pd.read_csv(f'{folder_name}/{file_name1}')
    ens2 = pd.read_csv(f'{folder_name}/{file_name2}')
    result = ens1.copy()
    
    result.probs=result.id.apply(lambda x: mixmaster(ens1.probs.iloc[x],ens2.probs.iloc[x],weight1,weight2))
    
    result.pred_label= result.probs.apply(lambda x: change_label(x))
    
    return result
    
    

In [29]:
if os.path.isdir('./ensemble_result')==False:
    os.mkdir('./ensemble_result')
folder_name = './ensemble'
file_list = os.listdir(folder_name)
count_file = len(file_list)

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores={}
print(count_file)
for file_1 in range(count_file):
    for file_2 in range(file_1+1,count_file):
        
        print(file_1 , file_2)
        file_name = f'{file_list[file_1][6:-4]}_{file_list[file_2][6:-4]}'
        ens_result = ensemble(folder_name,file_list[file_1],file_list[file_2])
        ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
        ens_result= pd.concat([ens_result,correct],axis=1)
        f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

f1_scores=dict(sorted(f1_scores.items(), key=lambda x:-x[1]))     
f1_scores




4
0 1
0 2
0 3
1 2
1 3
2 3


{'21_roberta-large_19_roberta-large_lr': 75.45310015898251,
 '19_roberta-large_lr_sub_57': 75.33206831119543,
 '21_roberta-large_sub_57': 75.13394264103371,
 '14_roberta-large_19_roberta-large_lr': 75.06014434643144,
 '14_roberta-large_21_roberta-large': 74.86408698433003,
 '14_roberta-large_sub_57': 74.78455154803702}

In [31]:
folder_name = './ensemble'
file_list = os.listdir(folder_name)
count_file = len(file_list)
if os.path.isdir('./ensemble_result')==False:
    os.mkdir('./ensemble_result')

correct = pd.read_csv('../data/dataset/test/dev_final.csv')
f1_scores={}
file_name1= 'train_21_roberta-large.csv'
file_name2 = 'train_19_roberta-large_lr.csv'

for weight1 in range(1,6):

    print(weight1)

    file_name = f'{file_name1[6:-4]}_{weight1}_{file_name2[6:-4]}_5'
    ens_result = ensemble(folder_name,file_name1,file_name2,weight1,5)
    ens_result.to_csv(f'./ensemble_result/{file_name}.csv')
    ens_result= pd.concat([ens_result,correct],axis=1)
    f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

for weight2 in range(1,6):

    print(weight2)

    file_name = f'{file_name1[6:-4]}_5_{file_name2[6:-4]}_{weight2}'
    ens_result = ensemble(folder_name,file_name1,file_name2,5,weight2)
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    ens_result= pd.concat([ens_result,correct],axis=1)
    f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

f1_scores=dict(sorted(f1_scores.items(), key=lambda x:-x[1]))
f1_scores


1
2
3
4
5
1
2
3
4
5


{'21_roberta-large_5_19_roberta-large_lr_5': 75.45310015898251,
 '21_roberta-large_5_19_roberta-large_lr_4': 75.44891148895597,
 '21_roberta-large_4_19_roberta-large_lr_5': 75.34573199809252,
 '21_roberta-large_5_19_roberta-large_lr_3': 75.22208121827411,
 '21_roberta-large_3_19_roberta-large_lr_5': 75.00396762418664,
 '21_roberta-large_5_19_roberta-large_lr_2': 74.97233201581027,
 '21_roberta-large_2_19_roberta-large_lr_5': 74.94850261448265,
 '21_roberta-large_5_19_roberta-large_lr_1': 74.689318861098,
 '21_roberta-large_1_19_roberta-large_lr_5': 74.65145754119138}

In [33]:
num_to_label.values()

dict_values(['no_relation', 'org:top_members/employees', 'org:members', 'org:product', 'per:title', 'org:alternate_names', 'per:employee_of', 'org:place_of_headquarters', 'per:product', 'org:number_of_employees/members', 'per:children', 'per:place_of_residence', 'per:alternate_names', 'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings', 'per:spouse', 'org:founded', 'org:political/religious_affiliation', 'org:member_of', 'per:parents', 'org:dissolved', 'per:schools_attended', 'per:date_of_death', 'per:date_of_birth', 'per:place_of_birth', 'per:place_of_death', 'org:founded_by', 'per:religion'])

In [9]:
f1_scores=dict(sorted(f1_scores.items(), key=lambda x:-x[1]))

In [27]:
file_name ='train_'+'19_roberta-large8379_checkpoint-6000_5_sub_57_4_5_19_roberta-large_lr_checkpoint-8000_21_roberta-large1_5_checkpoint-6000_1'+'.csv'
x= pd.read_csv(f'./ensemble_result/{file_name}')
x.to_csv(f'./save_best/{file_name}')
x.to_csv(f'./ensemble/{file_name}')

In [11]:
if os.path.isdir('./ensemble_result')==False:
    os.mkdir('./ensemble_result')
folder_name = './save_'
file_list = os.listdir(folder_name)
count_file = len(file_list)

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores={}
file_2 = 'train_15_roberta-large_3.8_sub_42_5.csv'

for file_1 in range(count_file):
    print(file_1 , file_2)
    file_name = f'{file_list[file_1][6:-4]}_{file_2[6:-4]}'
    ens_result = ensemble(folder_name,file_list[file_1],file_2)
    ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
    ens_result= pd.concat([ens_result,correct],axis=1)
    f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

f1_scores=dict(sorted(f1_scores.items(), key=lambda x:-x[1]))     
f1_scores

0 train_15_roberta-large_3.8_sub_42_5.csv


FileNotFoundError: [Errno 2] No such file or directory: './ensemble/train_15_roberta-large_3.8_sub_42_5.csv'

In [None]:
folder_name = './ensemble'
file_list = os.listdir(folder_name)
count_file = len(file_list)
if os.path.isdir('./ensemble_result')==False:
    os.mkdir('./ensemble_result')

correct = pd.read_csv('../data/dataset/test/dev_final.csv')

f1_scores={}
file_name1= 'train_15_roberta-large.csv'
file_name2 = 'train_sub_42.csv'

for weight in range(1,10):
    weight1 = 3+ weight*0.2
    print(weight1)

    file_name = f'{file_name1[6:-4]}_{weight1}_{file_name2[6:-4]}_5'
    ens_result = ensemble(folder_name,file_name1,file_name2,weight1,5)
    ens_result.to_csv(f'./ensemble_result/{file_name}.csv')
    ens_result= pd.concat([ens_result,correct],axis=1)
    f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

# for weight2 in range(1,6):

#     print(weight2)

#     file_name = f'{file_name1[6:-4]}_5_{file_name2[6:-4]}_{weight2}'
#     ens_result = ensemble(folder_name,file_name1,file_name2,5,weight2)
#     ens_result.to_csv(f'./ensemble_result/train_{file_name}.csv')
#     ens_result= pd.concat([ens_result,correct],axis=1)
#     f1_scores[file_name]=klue_re_micro_f1(ens_result.pred_label,ens_result.label)

f1_scores=dict(sorted(f1_scores.items(), key=lambda x:-x[1]))
f1_scores


3.2
3.4
3.6
3.8
4.0
4.2
4.4
4.6
4.8


{'15_roberta-large_3.8_sub_42_5': 71.50374740870674,
 '15_roberta-large_4.0_sub_42_5': 71.49465624501515,
 '15_roberta-large_3.2_sub_42_5': 71.488,
 '15_roberta-large_3.4_sub_42_5': 71.46282973621103,
 '15_roberta-large_3.6_sub_42_5': 71.44911327688129,
 '15_roberta-large_4.2_sub_42_5': 71.30573248407643,
 '15_roberta-large_4.8_sub_42_5': 71.19074044712225,
 '15_roberta-large_4.4_sub_42_5': 71.18105229693211,
 '15_roberta-large_4.6_sub_42_5': 71.163307411522}