In [1]:
import os 
import re 
import copy 
import time
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

from KoNLI_UTILS import chrome_setting, papago_translation, nan_list_retranslation, hangul_list_retranslation, hangul_word_translation, len_retranslation, pytorch_cos_sim

from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Load the KLUE-NLI train and dev splits and merge them into one de-duplicated frame.
json_train_path = 'klue-nli-v1.1_train.json'
json_test_path = 'klue-nli-v1.1_dev.json'


def _read_json(json_path):
    """Parse the UTF-8 encoded JSON file at `json_path` and return its content."""
    with open(json_path, 'r', encoding="utf-8") as handle:
        return json.load(handle)


json_train = _read_json(json_train_path)
json_test = _read_json(json_test_path)

# Keep only the sentence pair and its label; expose 'gold_label' as 'label'.
_nli_columns = ['premise', 'hypothesis', 'gold_label']
json_train_df = pd.DataFrame(json_train)[_nli_columns].rename(columns={'gold_label': 'label'})
json_test_df = pd.DataFrame(json_test)[_nli_columns].rename(columns={'gold_label': 'label'})
df1 = pd.concat([json_train_df, json_test_df]).reset_index(drop=True)

# Group rows that are identical across every column; each group maps to the
# index labels of its member rows.
df_grp1 = df1.groupby(df1.columns.tolist())
df_di1 = df_grp1.groups

# idx_T1: rows that occur exactly once.
# idx_F1: first occurrence of every row that occurs more than once.
idx_T1 = []
idx_F1 = []
for member_labels in df_di1.values():
    if len(member_labels) == 1:
        idx_T1.append(member_labels[0])
    else:
        idx_F1.append(member_labels[0])

# Unique rows first, then one representative per duplicated row; finally drop
# rows containing nulls and renumber. Later cells address rows by this order.
df_concated1 = (
    pd.concat([df1.loc[idx_T1, :], df1.loc[idx_F1, :]])
    .dropna(how='any')
    .reset_index(drop=True)
)

#### 한글 -> 영어 번역
* 영어로 번역된 데이터 npy로 저장 후, 번역이 제대로 진행되지 않은 부분을 위해 저장된 npy 로드하여 재번역 시도 

In [None]:
# Korean -> English pass: translate each sentence column through the Papago
# helpers (driven by a Chrome webdriver), then retry the rows whose translation
# looks broken, and store the result as CSV.
for column in ['premise', 'hypothesis']:
    print('Col_name : '+column)

    set_setting = {'path':'./chromedriver', # your chrome driver path
                    'col_name':column,
                    'sk':'ko',  # presumably the source-language key -- confirm in KoNLI_UTILS
                    'tk':'en',  # presumably the target-language key -- confirm in KoNLI_UTILS
                    'final_save_name':'to_eng_{}'.format(column)}

    path = set_setting['path']
    col_name = set_setting['col_name']
    sk = set_setting['sk']
    tk = set_setting['tk']
    final_save_name = set_setting['final_save_name']

    # NOTE(review): the '_0_27996' suffix is hard-coded; it presumably encodes the
    # row range written by papago_translation and must match df_concated1 -- confirm.
    back_translation_file = 'to_eng_{}_0_27996.npy'.format(col_name)

    # The browser session is released even if the translation raises, so a failed
    # run does not leave a Chrome process behind.
    driver = chrome_setting(path)
    try:
        # Expected to create to_eng_premise_0_27996.npy / to_eng_hypothesis_0_27996.npy.
        papago_translation(df_concated1[col_name], sk, tk, driver, save_name=final_save_name)
    finally:
        driver.quit()
        # Also kills every other process named 'chrome' on this machine.
        os.system('killall chrome')

    raw_array = np.load(back_translation_file)
    raw_array_df = pd.DataFrame(raw_array, columns=[col_name])

    # Two retry rounds over the rows flagged by each helper.
    for _ in range(2):
        raw_array_df, nan_list = nan_list_retranslation(raw_array_df, df_concated1, col_name, sk, tk, path)
        raw_array_df, hangul_ind = hangul_list_retranslation(raw_array_df, df_concated1, col_name, sk, tk, path)
        raw_array_df = hangul_word_translation(raw_array_df, col_name, sk, tk, path)
        raw_array_df = len_retranslation(raw_array_df, df_concated1, col_name, sk, tk, path)

    # Save the ko -> en result, e.g. to_eng_premise.csv, to_eng_hypothesis.csv
    raw_array_df.to_csv('{}.csv'.format(final_save_name), index=False)
    


#### 영어 -> 한글 번역
* 한글로 번역된 데이터 npy로 저장 후, 번역이 제대로 진행되지 않은 부분을 위해 저장된 npy 로드하여 재번역 시도
* 같은 의미의 문장이라도 한글 대비 영어 문장이 길다고 판단되어 길이에 대한 비율에 따른 재번역은 진행 X 

In [None]:
# English -> Korean pass: translate the English sentences back to Korean through
# the Papago helpers, retry the rows whose translation looks broken, and store
# the result as CSV. No length-ratio retry is done in this direction.
for column in ['premise', 'hypothesis']:
    print('Col_name : '+column)

    set_setting = {'path':'./chromedriver', # your chrome driver path
                    'col_name':column,
                    'sk':'en',  # presumably the source-language key -- confirm in KoNLI_UTILS
                    'tk':'ko',  # presumably the target-language key -- confirm in KoNLI_UTILS
                    'final_save_name':'to_kor_{}'.format(column)}

    path = set_setting['path']
    col_name = set_setting['col_name']
    sk = set_setting['sk']
    tk = set_setting['tk']
    final_save_name = set_setting['final_save_name']

    # Read the final output of the ko -> en stage, which is saved as
    # 'to_eng_<column>.csv' (after its retry rounds). The previous name
    # 'to_eng_<column>_0_27996.csv' is not written by that stage.
    # Loaded before the browser starts so a missing file cannot leak a driver.
    eng_data = pd.read_csv('to_eng_{}.csv'.format(col_name))

    # NOTE(review): the '_0_27996' suffix is hard-coded; it presumably encodes the
    # row range written by papago_translation and must match eng_data -- confirm.
    back_translation_file = 'to_kor_{}_0_27996.npy'.format(col_name)

    # The browser session is released even if the translation raises.
    driver = chrome_setting(path)
    try:
        # Expected to create to_kor_premise_0_27996.npy / to_kor_hypothesis_0_27996.npy.
        papago_translation(eng_data[col_name], sk, tk, driver, save_name=final_save_name)
    finally:
        driver.quit()
        # Also kills every other process named 'chrome' on this machine.
        os.system('killall chrome')

    raw_array = np.load(back_translation_file)
    raw_array_df = pd.DataFrame(raw_array, columns=[col_name])

    # Two retry rounds over the rows flagged by each helper.
    for _ in range(2):
        raw_array_df, nan_list = nan_list_retranslation(raw_array_df, eng_data, col_name, sk, tk, path)
        raw_array_df, hangul_ind = hangul_list_retranslation(raw_array_df, eng_data, col_name, sk, tk, path)
        raw_array_df = hangul_word_translation(raw_array_df, col_name, sk, tk, path)

    # Save the en -> ko result, e.g. to_kor_premise.csv, to_kor_hypothesis.csv
    raw_array_df.to_csv('{}.csv'.format(final_save_name), index=False)


# Sentence BERT(SBERT) 
* 번역된 데이터와 원본 데이터와의 유사도를 구해 유사도가 0.9 이상인 데이터만을 학습에 사용

In [None]:
# Build the frame used for the similarity check: the original sentence pairs
# plus their back-translated counterparts, with unusable rows removed.
back_premise = pd.read_csv('to_kor_premise.csv')
back_hypothesis = pd.read_csv('to_kor_hypothesis.csv')

sim_data = copy.copy(df_concated1)
# Each CSV holds a single text column; take it explicitly as a Series.
# Assignment aligns on the index, i.e. row i of the CSV goes to row i here.
sim_data['back_premise'] = back_premise.iloc[:, 0]
sim_data['back_hypothesis'] = back_hypothesis.iloc[:, 0]

# Treat empty / single-space cells as missing, then drop every row with a
# missing value. This is done explicitly: the length filters below need real
# strings, and missing rows were previously removed only as a side effect of
# the Latin-letter check matching the text "nan".
sim_data = sim_data.replace(['', ' '], np.nan)
nan_list = sim_data.index[sim_data.isnull().any(axis=1)].tolist()
sim_data = sim_data.drop(index=nan_list)

# Drop rows in which any text column (everything except 'label') still
# contains a Latin letter. The check runs on the raw strings, so characters
# that only look like letters in a repr (e.g. an escaped newline) no longer
# cause a row to be discarded.
text_part = sim_data.drop(columns=['label']).astype(str)
has_latin = text_part.apply(lambda col: col.str.contains('[a-zA-Z]', regex=True)).any(axis=1)
hangul_ind = sim_data.index[has_latin].tolist()
sim_data = sim_data.drop(index=hangul_ind)

# Length filters on the back-translated text (measured before stripping):
#   back_premise    : keep 19..90 characters (drop < 19 and > 90)
#   back_hypothesis : keep 5..103 characters (drop < 5 and > 103)
premise_len = sim_data['back_premise'].str.len()
hypothesis_len = sim_data['back_hypothesis'].str.len()
sim_data = sim_data[premise_len.between(19, 90) & hypothesis_len.between(5, 103)]

sim_data = sim_data.reset_index(drop=True)
# Trim surrounding whitespace; every remaining value is a string at this point.
for back_col in ['back_premise', 'back_hypothesis']:
    sim_data[back_col] = sim_data[back_col].str.strip()

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Args:
        model_output: indexable model result whose first element holds the
            token embeddings (assumed shape (batch, tokens, hidden) -- confirm).
        attention_mask: mask with one entry per token; it is expanded to the
            embedding shape and cast to float, so masked tokens contribute 0.

    Returns:
        Tensor of masked sums divided by the per-row mask totals. The divisor
        is clamped to at least 1e-9 so an all-zero mask does not divide by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    mask_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('bespin-global/klue-sentence-roberta-base')
model = AutoModel.from_pretrained('bespin-global/klue-sentence-roberta-base')
model.eval()  # inference only

# Sentences per forward pass; bounds peak memory instead of pushing the whole
# dataset through the model at once.
SBERT_BATCH_SIZE = 64


def _embed_sentences(sentences, batch_size=SBERT_BATCH_SIZE):
    """Return one mean-pooled embedding row per sentence in `sentences`.

    Every batch is tokenized to a fixed length of 103 tokens and pooled with
    the attention mask of that same batch.
    """
    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        encoded = tokenizer(sentences[start:start + batch_size], max_length=103,
                            padding='max_length', truncation=True, return_tensors='pt')
        with torch.no_grad():
            output = model(**encoded)
        pooled_batches.append(mean_pooling(output, encoded['attention_mask']))
    return torch.cat(pooled_batches, dim=0)


results={}
for col_name in ['premise', 'hypothesis']:

    # Original sentences ("corpus") and their back-translations ("queries").
    # Each side is pooled with its OWN attention mask; pooling the queries with
    # the corpus mask averages over the wrong token positions.
    results['corpus'] = _embed_sentences(list(sim_data[col_name]))
    results['queries'] = _embed_sentences(list(sim_data['back_'+col_name]))

    # Only the similarity of row i with row i is needed. Taking the diagonal of
    # per-batch matrices avoids building the full N x N matrix.
    # Assumes pytorch_cos_sim(a, b)[i][j] is the cosine of a[i] and b[j] -- confirm
    # in KoNLI_UTILS.
    diagonal_parts = []
    for start in range(0, results['corpus'].size(0), SBERT_BATCH_SIZE):
        stop = start + SBERT_BATCH_SIZE
        pairwise = pytorch_cos_sim(results['corpus'][start:stop], results['queries'][start:stop])
        diagonal_parts.append(torch.diagonal(pairwise, 0))
    cos_similarity = torch.cat(diagonal_parts)

    # Store plain floats rather than a torch tensor in the DataFrame.
    sim_data[col_name+'_cos'] = cos_similarity.cpu().numpy()

In [None]:
# Keep only the pairs whose premise AND hypothesis both reach a cosine
# similarity of at least 0.9 against their back-translation.
premise_ok = sim_data['premise_cos'] >= 0.9
hypothesis_ok = sim_data['hypothesis_cos'] >= 0.9
condition = premise_ok & hypothesis_ok
back_train = sim_data.loc[condition].reset_index(drop=True)

# Write the augmented (back-translated) sentence pairs and their labels to CSV.
augmented_columns = ['back_premise', 'back_hypothesis', 'label']
back_train[augmented_columns].to_csv('Augmented_data.csv', index=False)