# 필수코드
colab 환경에서 반드시 설치되어야 함

In [None]:
# 필요 시 설치(가상환경의 경우 터미널에 설치)
!pip install transformers==4.24.0

In [None]:
# 필요 시 설치(가상환경의 경우 터미널에 설치)
!pip install datasets==2.6.1

In [None]:
# 필요 시 설치
!pip install gdown

# 변수 및 함수 설정

## 모듈 import

In [None]:
import json
import os
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
import pandas as pd
import copy
from collections import defaultdict
from collections import OrderedDict

## 전역 변수 설정

In [None]:
# Pretrained backbone checkpoint and classifier-head hyperparameters
base_model = 'kykim/electra-kor-base'
classifier_hidden_size = 768
classifier_dropout_prob = 0.1

# Entity#property pairs (25-label set)
entity_property_pair = [
    '제품 전체#일반', '제품 전체#가격', '제품 전체#디자인', '제품 전체#품질', '제품 전체#편의성', '제품 전체#인지도', '제품 전체#다양성',
    '본품#일반', '본품#디자인', '본품#품질', '본품#편의성', '본품#다양성', '본품#가격', '본품#인지도',
    '패키지/구성품#일반', '패키지/구성품#디자인', '패키지/구성품#품질', '패키지/구성품#편의성', '패키지/구성품#가격', '패키지/구성품#다양성',
    '브랜드#일반', '브랜드#가격', '브랜드#품질', '브랜드#인지도', '브랜드#디자인',
]

# Entity#property pairs (23-label set): the 25-label set without
# '제품 전체#다양성' and '브랜드#디자인'
entity_property_pair_label23 = [
    '제품 전체#일반', '제품 전체#가격', '제품 전체#디자인', '제품 전체#품질', '제품 전체#편의성', '제품 전체#인지도',
    '본품#일반', '본품#디자인', '본품#품질', '본품#편의성', '본품#다양성', '본품#가격', '본품#인지도',
    '패키지/구성품#일반', '패키지/구성품#디자인', '패키지/구성품#품질', '패키지/구성품#편의성', '패키지/구성품#가격', '패키지/구성품#다양성',
    '브랜드#일반', '브랜드#가격', '브랜드#품질', '브랜드#인지도',
]

# Whether a sentence is related to an entity#property pair: index <-> name
tf_id_to_name = ['True', 'False']
tf_name_to_id = {label: idx for idx, label in enumerate(tf_id_to_name)}

# Sentiment of a sentence toward an entity#property pair: index <-> name
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {label: idx for idx, label in enumerate(polarity_id_to_name)}

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Special tokens that are added to the loaded tokenizer
special_tokens_dict = {
    'additional_special_tokens': [
        '&name&', '&affiliation&', '&social-security-num&', '&tel-num&',
        '&card-num&', '&bank-account&', '&num&', '&online-account&',
    ]
}

## json 파일 로드

In [None]:
# Read a JSON file and return the parsed object
def jsonload(fname, encoding="utf-8"):
    """Parse the JSON document stored in ``fname``.

    Args:
        fname: Path of the file to read.
        encoding: Text encoding used to open the file.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(fname, mode="r", encoding=encoding) as handle:
        return json.load(handle)

# Serialize a JSON-compatible object to the given file name
def jsondump(j, fname):
    """Write ``j`` to ``fname`` as JSON.

    The file is written as UTF-8 and non-ASCII characters are kept
    unescaped (``ensure_ascii=False``).

    Args:
        j: Object to serialize.
        fname: Path of the file to (over)write.
    """
    with open(fname, mode="w", encoding="UTF8") as out_file:
        json.dump(j, out_file, ensure_ascii=False)

# Read a JSON Lines file and return its records as a list
def jsonlload(fname, encoding="utf-8"):
    """Parse a JSON Lines file into a list, one element per non-blank line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list with the decoded object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(fname, encoding=encoding) as f:
        # Iterate the file lazily rather than loading every line up front.
        return [json.loads(line) for line in f if line.strip()]

## 모델 정의

electra 모델을 기반으로 한 classification 모델 이용

In [None]:
# Classification head used by ELECTRABaseClassifier below
class SimpleClassifier(nn.Module):
    """Two-layer classification head applied to the first token's features.

    Layer sizes come from the module-level ``classifier_hidden_size`` and the
    dropout rate from ``classifier_dropout_prob``.
    """

    def __init__(self, num_label):
        """Build the head; ``num_label`` is the size of the output layer."""
        super().__init__()
        self.dense = nn.Linear(classifier_hidden_size, classifier_hidden_size)
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.output = nn.Linear(classifier_hidden_size, num_label)

    def forward(self, features):
        """Return logits computed from ``features[:, 0, :]``.

        The pipeline is dropout -> dense -> tanh -> dropout -> output.
        """
        first_token = self.dropout(features[:, 0, :])
        hidden = torch.tanh(self.dense(first_token))
        return self.output(self.dropout(hidden))

# Classifier built on top of the pretrained base model
class ELECTRABaseClassifier(nn.Module):
    """Pretrained encoder (``base_model``) followed by a ``SimpleClassifier`` head."""

    def __init__(self, num_label, len_tokenizer):
        """Load the encoder and attach the classification head.

        Args:
            num_label: Number of output classes.
            len_tokenizer: Vocabulary size the token embeddings are resized to.
        """
        super().__init__()

        self.num_label = num_label
        # The attribute is called ``xlm_roberta`` although ``base_model`` is an
        # ELECTRA checkpoint. Do not rename it: state_dict keys derive from the
        # attribute name, so a rename would break ``load_state_dict`` for
        # checkpoints saved from this class.
        self.xlm_roberta = AutoModel.from_pretrained(base_model)
        self.xlm_roberta.resize_token_embeddings(len_tokenizer)

        self.labels_classifier = SimpleClassifier(self.num_label)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional targets; when given, a cross-entropy loss is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if ``labels`` is ``None``.
        """
        encoder_outputs = self.xlm_roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=None,
        )
        logits = self.labels_classifier(encoder_outputs[0])

        if labels is None:
            return None, logits

        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_label), labels.view(-1))
        return loss, logits

### 모델 예측 함수

#### 25개 entity_property 쌍에 대해 predict 해주는 함수

In [None]:
# Predict annotations over the 25 entity#property pairs
def predict_from_korean_form(tokenizer, ce_model, pc_model, data):
    """Annotate every sentence with the (entity#property, polarity) pairs found.

    Each pair in ``entity_property_pair`` is tokenized together with the
    sentence text and scored by ``ce_model``. When the argmax maps to 'True'
    in ``tf_id_to_name``, ``pc_model`` classifies the polarity of the same
    input and ``[pair, polarity_name]`` is appended to the sentence.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, pair, padding=...,
            max_length=..., truncation=...)``; the result is indexed with
            'input_ids' and 'attention_mask'.
        ce_model: Model called as ``ce_model(input_ids, attention_mask)`` and
            returning ``(loss, logits)``.
        pc_model: Polarity model with the same call convention.
        data: Iterable of dicts with a 'sentence_form' entry; modified in place.

    Returns:
        ``data`` itself, where every item has an 'annotation' list. Items whose
        'sentence_form' is not a string get an empty list and a printed notice.
    """
    # Both models are used for inference only: place them on the target device
    # and switch off dropout. Previously only ce_model was prepared here.
    for model in (ce_model, pc_model):
        model.to(device)
        model.eval()

    for sentence in data:
        form = sentence['sentence_form']
        sentence['annotation'] = []
        if not isinstance(form, str):
            print("form type is wrong: ", form)
            continue

        # Feed the entity#property pairs in their fixed order
        for pair in entity_property_pair:
            # Encode the sentence together with the candidate pair
            tokenized_data = tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)
            input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
            attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

            # Step 1: does the pair apply to this sentence? (True / False)
            with torch.no_grad():
                _, ce_logits = ce_model(input_ids, attention_mask)
            ce_predictions = torch.argmax(ce_logits, dim=-1)
            ce_result = tf_id_to_name[ce_predictions[0].item()]

            # Only pairs judged 'True' go on to polarity classification
            if ce_result != 'True':
                continue

            # Step 2: polarity of the sentence toward the pair
            with torch.no_grad():
                _, pc_logits = pc_model(input_ids, attention_mask)
            pc_predictions = torch.argmax(pc_logits, dim=-1)
            pc_result = polarity_id_to_name[pc_predictions[0].item()]

            sentence['annotation'].append([pair, pc_result])

    return data

#### 23개 entity_property 쌍에 대해 predict 해주는 함수

In [None]:
# Predict annotations over the 23 entity#property pairs
def predict_from_korean_form_label23(tokenizer, ce_model, pc_model, data):
    """Annotate every sentence using the 23-pair label set.

    Each pair in ``entity_property_pair_label23`` is tokenized together with
    the sentence text and scored by ``ce_model``. When the argmax maps to
    'True' in ``tf_id_to_name``, ``pc_model`` classifies the polarity of the
    same input and ``[pair, polarity_name]`` is appended to the sentence.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, pair, padding=...,
            max_length=..., truncation=...)``; the result is indexed with
            'input_ids' and 'attention_mask'.
        ce_model: Model called as ``ce_model(input_ids, attention_mask)`` and
            returning ``(loss, logits)``.
        pc_model: Polarity model with the same call convention.
        data: Iterable of dicts with a 'sentence_form' entry; modified in place.

    Returns:
        ``data`` itself, where every item has an 'annotation' list. Items whose
        'sentence_form' is not a string get an empty list and a printed notice.
    """
    # Both models are used for inference only: place them on the target device
    # and switch off dropout. Previously only ce_model was prepared here.
    for model in (ce_model, pc_model):
        model.to(device)
        model.eval()

    for sentence in data:
        form = sentence['sentence_form']
        sentence['annotation'] = []
        if not isinstance(form, str):
            print("form type is wrong: ", form)
            continue

        # Feed the entity#property pairs in their fixed order
        for pair in entity_property_pair_label23:
            # Encode the sentence together with the candidate pair
            tokenized_data = tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)
            input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
            attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

            # Step 1: does the pair apply to this sentence? (True / False)
            with torch.no_grad():
                _, ce_logits = ce_model(input_ids, attention_mask)
            ce_predictions = torch.argmax(ce_logits, dim=-1)
            ce_result = tf_id_to_name[ce_predictions[0].item()]

            # Only pairs judged 'True' go on to polarity classification
            if ce_result != 'True':
                continue

            # Step 2: polarity of the sentence toward the pair
            with torch.no_grad():
                _, pc_logits = pc_model(input_ids, attention_mask)
            pc_predictions = torch.argmax(pc_logits, dim=-1)
            pc_result = polarity_id_to_name[pc_predictions[0].item()]

            sentence['annotation'].append([pair, pc_result])

    return data

#### 인자로 받아온 모델들 중 entity_property가 3개인 경우 Soft_voting 을 위한 함수 

In [None]:
# Label sentences (entity_property, polarity) with a 3-model soft-voting ensemble
def predict_from_korean_form_softvoting_3(tokenizer, ce_model1, ce_model2, ce_model3, pc_model, data):
    """Annotate every sentence, averaging the logits of three entity-property models.

    For each pair in ``entity_property_pair`` the three ``ce_model*`` logits
    are averaged (soft voting). When the argmax of the average maps to 'True'
    in ``tf_id_to_name``, ``pc_model`` classifies the polarity of the same
    input and ``[pair, polarity_name]`` is appended to the sentence.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, pair, padding=...,
            max_length=..., truncation=...)``; the result is indexed with
            'input_ids' and 'attention_mask'.
        ce_model1, ce_model2, ce_model3: Models called as
            ``model(input_ids, attention_mask)`` and returning ``(loss, logits)``.
        pc_model: Polarity model with the same call convention.
        data: Iterable of dicts with a 'sentence_form' entry; modified in place.

    Returns:
        ``data`` itself, where every item has an 'annotation' list. Items whose
        'sentence_form' is not a string get an empty list and a printed notice.
    """
    ce_models = (ce_model1, ce_model2, ce_model3)

    # Every model is used for inference only (the polarity model included):
    # place it on the target device and switch off dropout.
    for model in ce_models + (pc_model,):
        model.to(device)
        model.eval()

    for sentence in data:
        form = sentence['sentence_form']
        sentence['annotation'] = []
        if not isinstance(form, str):
            print("form type is wrong: ", form)
            continue

        # Feed the entity#property pairs in their fixed order
        for pair in entity_property_pair:
            # Encode the sentence together with the candidate pair
            tokenized_data = tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)
            input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
            attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

            # Logits of every ensemble member for this (sentence, pair) input
            with torch.no_grad():
                member_logits = [model(input_ids, attention_mask)[1] for model in ce_models]

            # Soft voting: average the logit tensors directly on their device,
            # without rebuilding a tensor from individual scalar elements.
            ce_logits = sum(member_logits) / len(member_logits)

            ce_predictions = torch.argmax(ce_logits, dim=-1)
            ce_result = tf_id_to_name[ce_predictions[0].item()]

            # Only pairs judged 'True' go on to polarity classification
            if ce_result != 'True':
                continue

            with torch.no_grad():
                _, pc_logits = pc_model(input_ids, attention_mask)
            pc_predictions = torch.argmax(pc_logits, dim=-1)
            pc_result = polarity_id_to_name[pc_predictions[0].item()]

            sentence['annotation'].append([pair, pc_result])

    return data

#### 인자로 받아온 모델들 중 entity_property가 5개인 경우 Soft_voting 을 위한 함수 

In [None]:
# Label sentences (entity_property, polarity) with a 5-model soft-voting ensemble
def predict_from_korean_form_softvoting_5(tokenizer, ce_model1, ce_model2, ce_model3, ce_model4, ce_model5, pc_model, data):
    """Annotate every sentence, averaging the logits of five entity-property models.

    For each pair in ``entity_property_pair`` the five ``ce_model*`` logits
    are averaged (soft voting). When the argmax of the average maps to 'True'
    in ``tf_id_to_name``, ``pc_model`` classifies the polarity of the same
    input and ``[pair, polarity_name]`` is appended to the sentence.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, pair, padding=...,
            max_length=..., truncation=...)``; the result is indexed with
            'input_ids' and 'attention_mask'.
        ce_model1 ... ce_model5: Models called as
            ``model(input_ids, attention_mask)`` and returning ``(loss, logits)``.
        pc_model: Polarity model with the same call convention.
        data: Iterable of dicts with a 'sentence_form' entry; modified in place.

    Returns:
        ``data`` itself, where every item has an 'annotation' list. Items whose
        'sentence_form' is not a string get an empty list and a printed notice.
    """
    ce_models = (ce_model1, ce_model2, ce_model3, ce_model4, ce_model5)

    # Every model is used for inference only (the polarity model included):
    # place it on the target device and switch off dropout.
    for model in ce_models + (pc_model,):
        model.to(device)
        model.eval()

    for sentence in data:
        form = sentence['sentence_form']
        sentence['annotation'] = []
        if not isinstance(form, str):
            print("form type is wrong: ", form)
            continue

        # Feed the entity#property pairs in their fixed order
        for pair in entity_property_pair:
            # Encode the sentence together with the candidate pair
            tokenized_data = tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)
            input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
            attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

            # Logits of every ensemble member for this (sentence, pair) input
            with torch.no_grad():
                member_logits = [model(input_ids, attention_mask)[1] for model in ce_models]

            # Soft voting: average the logit tensors directly on their device,
            # without rebuilding a tensor from individual scalar elements.
            ce_logits = sum(member_logits) / len(member_logits)

            ce_predictions = torch.argmax(ce_logits, dim=-1)
            ce_result = tf_id_to_name[ce_predictions[0].item()]

            # Only pairs judged 'True' go on to polarity classification
            if ce_result != 'True':
                continue

            with torch.no_grad():
                _, pc_logits = pc_model(input_ids, attention_mask)
            pc_predictions = torch.argmax(pc_logits, dim=-1)
            pc_result = polarity_id_to_name[pc_predictions[0].item()]

            sentence['annotation'].append([pair, pc_result])

    return data

#### 단일 모델에서 나온 결과물에 polarity만 바꿔주는 함수

In [None]:
# Re-predict only the polarity (entity_property labels are kept as they are)
def predict_from_korean_form_polarity(tokenizer, pc_model, data):
    """Replace the polarity of every existing annotation with ``pc_model``'s prediction.

    For each sentence, the first element of every existing annotation entry
    (the entity#property pair) is kept, and the polarity is recomputed from
    the sentence text and that pair.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, pair, padding=...,
            max_length=..., truncation=...)``; the result is indexed with
            'input_ids' and 'attention_mask'.
        pc_model: Model called as ``pc_model(input_ids, attention_mask)`` and
            returning ``(loss, logits)``.
        data: Iterable of dicts with 'sentence_form' and 'annotation' entries;
            modified in place.

    Returns:
        ``data`` itself, with each 'annotation' rebuilt as a list of
        ``[pair, polarity_name]``.
    """
    # Inference only: make sure the model sits on the same device as the
    # inputs and that dropout is disabled, whatever state the caller left it in.
    pc_model.to(device)
    pc_model.eval()

    for sentence in data:
        form = sentence['sentence_form']
        previous_annotations = sentence['annotation']
        sentence['annotation'] = []

        for annotation in previous_annotations:
            pair = annotation[0]
            tokenized_data = tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)

            input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
            attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

            with torch.no_grad():
                _, pc_logits = pc_model(input_ids, attention_mask)

            pc_predictions = torch.argmax(pc_logits, dim=-1)
            pc_result = polarity_id_to_name[pc_predictions[0].item()]

            sentence['annotation'].append([pair, pc_result])

    return data

In [None]:
# Label the prediction file with the models found at the configured paths, then save it
def test_sentiment_analysis(name):
    """Run single-model prediction over the 25 pairs and write ``./NN.json``.

    Reads the module-level ``test_data_path``,
    ``test_category_extraction_model_path`` and
    ``polarity_classification_model_path`` at call time.

    Args:
        name: Integer used to build the output file name (zero-padded to two digits).
    """
    # Tokenizer (with the extra special tokens) and the data to predict on
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonlload(test_data_path)

    # Entity-property (True/False) model
    model = ELECTRABaseClassifier(len(tf_id_to_name), len(tokenizer))
    entity_state = torch.load(test_category_extraction_model_path, map_location=device)
    model.load_state_dict(entity_state)
    model.to(device)
    model.eval()

    # Polarity model
    polarity_model = ELECTRABaseClassifier(len(polarity_id_to_name), len(tokenizer))
    polarity_state = torch.load(polarity_classification_model_path, map_location=device)
    polarity_model.load_state_dict(polarity_state)
    polarity_model.to(device)
    polarity_model.eval()

    # Predict on a deep copy so the loaded test data stays untouched
    working_copy = copy.deepcopy(test_data)
    pred_data = predict_from_korean_form(tokenizer, model, polarity_model, working_copy)

    # Persist the predictions as JSON
    output_file = './{0:02d}.json'.format(name)
    jsondump(pred_data, output_file)

In [None]:
# Label the prediction file with the models found at the configured paths, then save it
def test_sentiment_analysis_label23(name):
    """Run single-model prediction over the 23 pairs and write ``./NN.json``.

    Reads the module-level ``test_data_path``,
    ``test_category_extraction_model_path`` and
    ``polarity_classification_model_path`` at call time.

    Args:
        name: Integer used to build the output file name (zero-padded to two digits).
    """
    # Tokenizer (with the extra special tokens) and the data to predict on
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonlload(test_data_path)

    # Entity-property (True/False) model
    model = ELECTRABaseClassifier(len(tf_id_to_name), len(tokenizer))
    entity_state = torch.load(test_category_extraction_model_path, map_location=device)
    model.load_state_dict(entity_state)
    model.to(device)
    model.eval()

    # Polarity model
    polarity_model = ELECTRABaseClassifier(len(polarity_id_to_name), len(tokenizer))
    polarity_state = torch.load(polarity_classification_model_path, map_location=device)
    polarity_model.load_state_dict(polarity_state)
    polarity_model.to(device)
    polarity_model.eval()

    # Predict on a deep copy so the loaded test data stays untouched
    working_copy = copy.deepcopy(test_data)
    pred_data = predict_from_korean_form_label23(tokenizer, model, polarity_model, working_copy)

    # Persist the predictions as JSON
    output_file = './{0:02d}.json'.format(name)
    jsondump(pred_data, output_file)

# 지정된 모델들 불러오기

저장할 폴더를 만들고 현재 위치를 옮기기

In [None]:
# Create the working folder (if it is not there yet) and move into it
path = './tmp'

# exist_ok=True: os.mkdir raised FileExistsError when ./tmp was left over
# from an earlier session.
os.makedirs(path, exist_ok=True)
os.chdir(path)

## Soft_voting을 위한 entity_property 5개, polarity, predict 할 json 데이터 gdown

In [None]:
# 09
!gdown 1T6zgdQOeydga6qnfXNKpPENVZpvjBhsE
# 12
!gdown 1Rwqkb_hdedV7Tmfa0pUncrpP0IEuxttA
# 15
!gdown 1d78UHgWdQF-wtUO3_GFPWsO2iTMWlYj4
# 16
!gdown 1BXiJBaFFR3c3_lxxXMFQMQGTGGOZQzGG
# 22
!gdown 1b-I9vtidwMBX-mno94oBCTdsjovIHeUk

# polarity 12epoch
!gdown 1__t7OPrvrQs0nEYswIn_hBdbK-bHAQpY

# test_data
!gdown 1M4SGcPmxUS5ocleECRxec-E1ElmNEsxW

entity_property, polarity, predict 할 json 데이터 경로 설정

In [None]:
# Entity-property checkpoint (.pt) paths for the 5-model ensemble
entity_classification_model_path = [
    [
        './09.data_label23_epoch_5.pt',
        './12.data+10.17_label25_epoch_5.pt',
        './15.data+manual_label25_epoch_21.pt',
        './16.data+similar_label25_epoch_5.pt',
        './22.data+10.17_label25_epoch_15.pt',
    ],
]

# Polarity checkpoint (.pt) path
polarity_classification_model_path = './data_polarity_epoch_12.pt'

# JSONL data that the trained checkpoints will predict on
test_data_path = './nikluge-sa-2022-test.jsonl'

설정된 경로를 통해 모델 로드

In [None]:
# Label the prediction file with a 5-model soft-voting ensemble, then save it
def test_sentiment_analysis_softvoting_5(output_path='./high_score_5_softvoting_62.54.json'):
    """Run 5-model soft-voting prediction and write the result as JSON.

    Reads the module-level ``test_data_path``,
    ``entity_classification_model_path`` (its first inner list must hold at
    least five checkpoint paths) and ``polarity_classification_model_path``
    at call time.

    Args:
        output_path: File the predictions are written to. The default is the
            file name this function has always used.
    """
    # Tokenizer (with the extra special tokens) and the data to predict on
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonlload(test_data_path)

    def _load_classifier(num_label, checkpoint_path):
        # Build a classifier, restore its weights and prepare it for inference.
        classifier = ELECTRABaseClassifier(num_label, len(tokenizer))
        classifier.load_state_dict(torch.load(checkpoint_path, map_location=device))
        classifier.to(device)
        classifier.eval()
        return classifier

    # Entity-property ensemble members, loaded in list order
    ce_models = [
        _load_classifier(len(tf_id_to_name), entity_classification_model_path[0][index])
        for index in range(5)
    ]

    # Polarity model
    polarity_model = _load_classifier(len(polarity_id_to_name), polarity_classification_model_path)

    # Predict on a deep copy so the loaded test data stays untouched
    pred_data = predict_from_korean_form_softvoting_5(
        tokenizer, *ce_models, polarity_model, copy.deepcopy(test_data)
    )

    # Persist the predictions as JSON
    jsondump(pred_data, output_path)

In [None]:
# Build and save the prediction file using the paths configured above
test_sentiment_analysis_softvoting_5()

사용된 각각의 pt값을 json화 하여 이후 voting에 활용

In [None]:
# Entity-property checkpoint to load (run 09, 23-pair runner)
test_category_extraction_model_path = './09.data_label23_epoch_5.pt'

test_sentiment_analysis_label23(9)

# Entity-property checkpoint to load (run 12)
test_category_extraction_model_path = './12.data+10.17_label25_epoch_5.pt'

test_sentiment_analysis(12)

# Entity-property checkpoint to load (run 16)
test_category_extraction_model_path = './16.data+similar_label25_epoch_5.pt'

test_sentiment_analysis(16)

사용 완료한 entity_property 제거

In [None]:
# Delete the entity-property checkpoints that are no longer needed
for remove_path in (
    "./09.data_label23_epoch_5.pt",
    "./12.data+10.17_label25_epoch_5.pt",
    "./16.data+similar_label25_epoch_5.pt",
):
    if os.path.exists(remove_path):
        os.remove(remove_path)

## Soft_voting을 위한 entity_property 3개 gdown

In [None]:
# 18
!gdown 14enKPuyWp5DfTbqwK0OeVSbfKUGk_cRm
# 20
!gdown 190A0GbzPoU0w2oAwM_SmnSOsUdPbZyWF
# 25
!gdown 1QDqF8QjZnDxcC7TH5dkKWowKtKytGi5C

entity_property 경로 재설정

In [None]:
# Entity-property checkpoint (.pt) paths for the second 5-model ensemble
entity_classification_model_path = [
    [
        './15.data+manual_label25_epoch_21.pt',
        './18.data_label23_epoch_5.pt',
        './20.data+108+10.17+10.24_label25_epoch_8.pt',
        './22.data+10.17_label25_epoch_15.pt',
        './25.data+108+10.17+10.24_label23_epoch_10.pt',
    ],
]

설정된 경로를 통해 모델 로드

In [None]:
# Label the prediction file with a 5-model soft-voting ensemble, then save it
def test_sentiment_analysis_softvoting_5(output_path='./pt_5_softvoting_62.15.json'):
    """Run 5-model soft-voting prediction and write the result as JSON.

    Reads the module-level ``test_data_path``,
    ``entity_classification_model_path`` (its first inner list must hold at
    least five checkpoint paths) and ``polarity_classification_model_path``
    at call time.

    Args:
        output_path: File the predictions are written to. The default is the
            file name this definition has always used.
    """
    # Tokenizer (with the extra special tokens) and the data to predict on
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonlload(test_data_path)

    def _load_classifier(num_label, checkpoint_path):
        # Build a classifier, restore its weights and prepare it for inference.
        classifier = ELECTRABaseClassifier(num_label, len(tokenizer))
        classifier.load_state_dict(torch.load(checkpoint_path, map_location=device))
        classifier.to(device)
        classifier.eval()
        return classifier

    # Entity-property ensemble members, loaded in list order
    ce_models = [
        _load_classifier(len(tf_id_to_name), entity_classification_model_path[0][index])
        for index in range(5)
    ]

    # Polarity model
    polarity_model = _load_classifier(len(polarity_id_to_name), polarity_classification_model_path)

    # Predict on a deep copy so the loaded test data stays untouched
    pred_data = predict_from_korean_form_softvoting_5(
        tokenizer, *ce_models, polarity_model, copy.deepcopy(test_data)
    )

    # Persist the predictions as JSON
    jsondump(pred_data, output_path)

In [None]:
# Run the 5-model soft-voting prediction with the re-configured checkpoint paths
test_sentiment_analysis_softvoting_5()

### 사용된 각각의 pt값을 json화 하여 이후 voting에 활용

단일 모델 중 필요한 polarity gdown

In [None]:
# 18 polarity
!gdown 1hDTlfC17xXGcP-Fhq7jcAkJ5z_ly2LiB

In [None]:
# Entity-property checkpoint to load (run 15)
test_category_extraction_model_path = './15.data+manual_label25_epoch_21.pt'

test_sentiment_analysis(15)

# Entity-property checkpoint to load (run 18)
# NOTE(review): the file name says label23, yet this run goes through the
# 25-pair runner, while the other label23 checkpoints in this notebook use
# test_sentiment_analysis_label23 — confirm this is intended.
test_category_extraction_model_path = './18.data_label23_epoch_5.pt'
# Polarity checkpoint to load (specific to run 18)
polarity_classification_model_path = './18.data_polarity_label23_epoch_5.pt'

test_sentiment_analysis(18)

# Entity-property checkpoint to load (run 20)
test_category_extraction_model_path = './20.data+108+10.17+10.24_label25_epoch_8.pt'
# Polarity checkpoint to load (back to the shared 12-epoch polarity model)
polarity_classification_model_path = './data_polarity_epoch_12.pt'

test_sentiment_analysis(20)

사용 완료한 entity_property 제거

In [None]:
# Delete the checkpoints that are no longer needed
for remove_path in (
    "./15.data+manual_label25_epoch_21.pt",
    "./18.data_label23_epoch_5.pt",
    './18.data_polarity_label23_epoch_5.pt',
    "./20.data+108+10.17+10.24_label25_epoch_8.pt",
    "./25.data+108+10.17+10.24_label23_epoch_10.pt",
):
    if os.path.exists(remove_path):
        os.remove(remove_path)

## Soft_voting을 위한 entity_property 2개 gdown

In [None]:
# 21
!gdown 184zfO0Bp9cNVtjsfUk68ts-nrqItrz6R
# 23
!gdown 1fvmtIYEjuoAWcds7Wf6wwvVeEIuGdduj

entity_property 경로 재설정

In [None]:
# Entity-property checkpoint (.pt) paths for the 3-model ensemble
entity_classification_model_path = [
    # The first path used to end in '.pt.' (stray trailing dot), which does
    # not match the './21.data_all_label25_epoch_30.pt' name used elsewhere
    # in this notebook for the same checkpoint.
    ['./21.data_all_label25_epoch_30.pt',
     './22.data+10.17_label25_epoch_15.pt',
     './23.data+similar_label25_epoch_10.pt']
]

재설정된 경로를 통해 모델 로드

In [None]:
def test_sentiment_analysis_softvoting_3(output_path='./pt_3_softvoting_61.15.json'):
    """Run 3-model soft-voting prediction and write the result as JSON.

    Reads the module-level ``test_data_path``,
    ``entity_classification_model_path`` (its first inner list must hold at
    least three checkpoint paths) and ``polarity_classification_model_path``
    at call time.

    Args:
        output_path: File the predictions are written to. The default is the
            file name this function has always used.
    """
    # Tokenizer (with the extra special tokens) and the data to predict on
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonlload(test_data_path)

    def _load_classifier(num_label, checkpoint_path):
        # Build a classifier, restore its weights and prepare it for inference.
        classifier = ELECTRABaseClassifier(num_label, len(tokenizer))
        classifier.load_state_dict(torch.load(checkpoint_path, map_location=device))
        classifier.to(device)
        classifier.eval()
        return classifier

    # Entity-property ensemble members, loaded in list order
    ce_models = [
        _load_classifier(len(tf_id_to_name), entity_classification_model_path[0][index])
        for index in range(3)
    ]

    # Polarity model
    polarity_model = _load_classifier(len(polarity_id_to_name), polarity_classification_model_path)

    # Predict on a deep copy so the loaded test data stays untouched
    pred_data = predict_from_korean_form_softvoting_3(
        tokenizer, *ce_models, polarity_model, copy.deepcopy(test_data)
    )

    # Persist the predictions as JSON
    jsondump(pred_data, output_path)

In [None]:
# Run the 3-model soft-voting prediction with the re-configured checkpoint paths
test_sentiment_analysis_softvoting_3()

### 사용된 각각의 pt값을 json화 하여 이후 voting에 활용

단일 모델 중 필요한 polarity gdown

In [None]:
!gdown 15uW-s-xl38qmXNiVy0LJyufYY8Hdyz02

In [None]:
# Entity-property checkpoint to load (run 21)
test_category_extraction_model_path = './21.data_all_label25_epoch_30.pt'
# Polarity checkpoint to load (specific to run 21)
polarity_classification_model_path = './21.data_all_polarity_label25_epoch_30.pt'

test_sentiment_analysis(21)

# Entity-property checkpoint to load (run 22)
test_category_extraction_model_path = './22.data+10.17_label25_epoch_15.pt'
# Polarity checkpoint to load (back to the shared 12-epoch polarity model)
polarity_classification_model_path = './data_polarity_epoch_12.pt'

test_sentiment_analysis(22)

# Entity-property checkpoint to load (run 23)
test_category_extraction_model_path = './23.data+similar_label25_epoch_10.pt'

test_sentiment_analysis(23)

사용 완료한 entity_property 제거

In [None]:
# Delete the checkpoints that are no longer needed
for remove_path in (
    "./21.data_all_label25_epoch_30.pt",
    "./21.data_all_polarity_label25_epoch_30.pt",
    "./22.data+10.17_label25_epoch_15.pt",
    "./23.data+similar_label25_epoch_10.pt",
):
    if os.path.exists(remove_path):
        os.remove(remove_path)

# 단일 모델을 통한 pred_data 만들기

In [None]:
# 1
!gdown 17r1cYHKhYmYZKQS3onQeHq0idTwQDdaF

# Entity-property checkpoint for run 01 (23-pair runner)
test_category_extraction_model_path = './01.data+108_label23_epoch_1.pt'
test_sentiment_analysis_label23(1)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 2
!gdown 15XVi_9M2tCzpSq-NXHi63Ci7E-Vzf0Ij

# Entity-property checkpoint for run 02 (23-pair runner)
test_category_extraction_model_path = './02.data+108+10.17+1024_label23_epoch_2.pt'
test_sentiment_analysis_label23(2)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 3
!gdown 10PuAaK3-tDr4oqydSh6IaNolCSmOd8PA
# polarity
!gdown 1EGGOZy-aDd1fLGy68x4H8lbagq47gTnW

# Entity-property and polarity checkpoints for run 03
test_category_extraction_model_path = './03.data_label25_epoch_5.pt'
polarity_classification_model_path = './03.data_polarity_label25_epoch_5.pt'
test_sentiment_analysis(3)

# Delete both checkpoints once the run has finished
for remove_path in (test_category_extraction_model_path, polarity_classification_model_path):
    if os.path.exists(remove_path):
        os.remove(remove_path)

In [None]:
# Point the polarity checkpoint path back at the shared 12-epoch polarity model
polarity_classification_model_path = './data_polarity_epoch_12.pt'

In [None]:
# 4
!gdown 1JgBKU8qkDod3p0geNe4DAH0-z0_Hnf8q

# Entity-property checkpoint for run 04 (23-pair runner)
test_category_extraction_model_path = './04.data+108+10.17+10.24_label23_epoch_5.pt'
test_sentiment_analysis_label23(4)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 5
!gdown 1k8K_0ltOy3QZbKaexhOaAQ1QvgspVUlk

# Entity-property checkpoint for run 05 (23-pair runner)
test_category_extraction_model_path = './05.data+108_label23_epoch_11.pt'
test_sentiment_analysis_label23(5)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 6
!gdown 1svN_-bhM-lzsse-i_og5SgJK9hTOAPNY
# polarity
!gdown 1GFdbS3SNuuj7bk0U7WhZe_Sf5SqsMsjP

# Entity-property and polarity checkpoints for run 06
test_category_extraction_model_path = './06.data+10.17_label25_epoch_14.pt'
polarity_classification_model_path = './06.data_polarity_label25_epoch_30.pt'
test_sentiment_analysis(6)

# Delete both checkpoints once the run has finished
for remove_path in (test_category_extraction_model_path, polarity_classification_model_path):
    if os.path.exists(remove_path):
        os.remove(remove_path)

In [None]:
# Point the polarity checkpoint path back at the shared 12-epoch polarity model
polarity_classification_model_path = './data_polarity_epoch_12.pt'

In [None]:
# 7
!gdown 12iWcsaf4yEj5cXBSB-zcfJIAQ4UY56nN

# Entity-property checkpoint for run 07 (23-pair runner)
test_category_extraction_model_path = './07.data+108_label23_epoch_7.pt'
test_sentiment_analysis_label23(7)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 8
!gdown 1--cvb_ouiHNxFPT9qD6AtJAMfLgQgbNx

# Entity-property checkpoint for run 08 (23-pair runner)
test_category_extraction_model_path = './08.data+108_label23_epoch_3.pt'
test_sentiment_analysis_label23(8)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 10
!gdown 14OFWVTqiGIlz5rLrotxcIsjWvcc4W_Sm

# Entity-property checkpoint for run 10
test_category_extraction_model_path = './10.final_data_label25_epoch_16.pt'
test_sentiment_analysis(10)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 11
! gdown 1sKNvIFG2yev6S03aG4PDnkoFDDkA8LXI

# Entity-property checkpoint for run 11
test_category_extraction_model_path = './11.data+108+10.17+10.24_label25_epoch_12.pt'
test_sentiment_analysis(11)

# Delete the checkpoint once the run has finished
remove_path = test_category_extraction_model_path
if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# 13
! gdown 17VzlAINzT_6stuM_kbOqp72UQYzARGl1
# polarity
! gdown 1sH3RqfohrY81BByvFyKKcTAIcQMbVUyR

# Entity-property and polarity checkpoints for run 13
test_category_extraction_model_path = './13.data_label25_epoch_30.pt'
polarity_classification_model_path = './13.data_polarity_label25_epoch_30.pt'
test_sentiment_analysis(13)

# Delete both checkpoints once the run has finished
for remove_path in (test_category_extraction_model_path, polarity_classification_model_path):
    if os.path.exists(remove_path):
        os.remove(remove_path)

In [None]:
# Path of the polarity .pt checkpoint to load.
# The preceding cell pointed this variable at a run-specific checkpoint and then
# deleted that file, so it is set back to ./data_polarity_epoch_12.pt here.
polarity_classification_model_path = './data_polarity_epoch_12.pt'

In [None]:
# Run 14
# IPython shell escape: fetch the checkpoint with gdown using the id below.
! gdown 1RE67eYxq43u_GzL7Foc-1jDqKaOSghe4

# Path of the entity_property .pt checkpoint to load
# NOTE(review): presumably the file name gdown saves for the id above - confirm.
test_category_extraction_model_path = './14.data+108+10.17+10.24_label23_epoch_15.pt'

# Helper defined outside this cell; presumably runs inference with the
# checkpoint paths set above and stores the run-14 predictions - confirm.
test_sentiment_analysis_label23(14)

# Delete the checkpoint file again if it exists.
remove_path = "./14.data+108+10.17+10.24_label23_epoch_15.pt"

if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# Run 17
# IPython shell escape: fetch the checkpoint with gdown using the id below.
! gdown 1tpO4ZNETNhrOjjE1NiZo8ISBd1iE39uK

# Path of the entity_property .pt checkpoint to load
# NOTE(review): presumably the file name gdown saves for the id above - confirm.
test_category_extraction_model_path = './17.data+manual_label25_epoch_21.pt'

# Helper defined outside this cell; presumably runs inference with the
# checkpoint paths set above and stores the run-17 predictions - confirm.
test_sentiment_analysis(17)

# Delete the checkpoint file again if it exists.
remove_path = "./17.data+manual_label25_epoch_21.pt"

if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# Run 19
# IPython shell escape: fetch the entity_property checkpoint with gdown.
! gdown 1u2G88GAsxd26aqU-8FUCqj6e90Dkts5F
# polarity checkpoint for this run
! gdown 1llHgAIs2YyJqrhpkL5hrlVD-QhSO-LA9

# Path of the entity_property .pt checkpoint to load
# NOTE(review): presumably the file name gdown saves for the first id - confirm.
test_category_extraction_model_path = './19.data+10.17_label23_epoch_10.pt'
# Path of the polarity .pt checkpoint to load (run-specific for this cell)
polarity_classification_model_path = './19.data_polarity_label25_epoch_30.pt'

# Helper defined outside this cell; presumably runs inference with the
# checkpoint paths set above and stores the run-19 predictions - confirm.
test_sentiment_analysis_label23(19)

# Delete both checkpoint files again if they exist.
remove_path = "./19.data+10.17_label23_epoch_10.pt"

if os.path.exists(remove_path):
    os.remove(remove_path)

remove_path = './19.data_polarity_label25_epoch_30.pt'

if os.path.exists(remove_path):
    os.remove(remove_path)

In [None]:
# Path of the polarity .pt checkpoint to load.
# The preceding cell pointed this variable at a run-specific checkpoint and then
# deleted that file, so it is set back to ./data_polarity_epoch_12.pt here.
polarity_classification_model_path = './data_polarity_epoch_12.pt'

In [None]:
# Run 24
# IPython shell escape: fetch the checkpoint with gdown using the id below.
! gdown 15eKRn0P8gFYH8XMXUvvrg-Q4zbqW9bWf

# Path of the entity_property .pt checkpoint to load
# NOTE(review): presumably the file name gdown saves for the id above - confirm.
test_category_extraction_model_path = './24.data+10.17_label25_epoch_21.pt'

# Helper defined outside this cell; presumably runs inference with the
# checkpoint paths set above and stores the run-24 predictions - confirm.
test_sentiment_analysis(24)

# Delete the checkpoint file again if it exists.
remove_path = "./24.data+10.17_label25_epoch_21.pt"

if os.path.exists(remove_path):
    os.remove(remove_path)

# 만들어진 json 파일을 통해 hard voting

만들어진 json 경로 설정

In [None]:
# Paths of the generated JSON files used as voting inputs (path_N -> ./NN.json).
path_1, path_2, path_3, path_4, path_5, path_6 = (
    './01.json', './02.json', './03.json', './04.json', './05.json', './06.json',
)
path_7, path_8, path_9, path_10, path_11, path_12 = (
    './07.json', './08.json', './09.json', './10.json', './11.json', './12.json',
)
path_13, path_14, path_15, path_16, path_17, path_18 = (
    './13.json', './14.json', './15.json', './16.json', './17.json', './18.json',
)
path_19, path_20, path_21, path_22, path_23, path_24 = (
    './19.json', './20.json', './21.json', './22.json', './23.json', './24.json',
)

## 특정 json 3개로 hard_voting

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Weighted hard voting over three prediction files.
# Only element [0] of each loaded list (the first line's document) is used; it
# is indexed per record, and each record's 'annotation' is a list of entries
# whose first two items are concatenated as strings below.
data_0 = _load_json_lines(path_15)
data_1 = _load_json_lines(path_8)
data_2 = _load_json_lines(path_9)

# Independent second load of the first voter; it receives the voted annotations.
data_final = _load_json_lines(path_15)

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1

list_data = [data_0, data_1, data_2]
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Serialise the whole annotation list as "a$b,c$d" so it can be a dict key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty predictions cast no vote.
        if annotation != '':
            board[annotation] += weight

    # Keep every key tied for the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

for idx, winners in enumerate(list_aaa):
    # No voter predicted anything: leave the record exactly as loaded.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    data_final[0][idx]['annotation'] = data_a

# json.dump escapes non-ASCII by default, so the explicit encoding only makes
# the write agree with the utf-8 reads of this file elsewhere in the notebook.
with open("./62.97_1109.json", "w", encoding='utf-8') as f:
    json.dump(data_final[0], f)

## 특정 json 12개로 hard_voting

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# The twelve prediction files taking part in the vote; data_N is later paired
# with weight wv_N.
data_0 = _load_json_lines(path_9)
data_1 = _load_json_lines(path_16)
data_2 = _load_json_lines(path_15)
data_3 = _load_json_lines(path_8)
data_4 = _load_json_lines(path_7)
data_5 = _load_json_lines(path_5)
data_6 = _load_json_lines(path_6)
data_7 = _load_json_lines(path_4)
data_8 = _load_json_lines(path_3)
data_9 = _load_json_lines(path_19)
data_10 = _load_json_lines(path_2)
data_11 = _load_json_lines(path_1)

In [None]:
# JSON structure that will hold the voting result: an independent load of
# path_9, read inside a with-block so the file handle is closed again.
with open(path_9, 'r', encoding='utf-8') as fp:
    data_final = [json.loads(line) for line in fp]

In [None]:
# Hard-voting weights for the twelve voters.
wv_0, wv_1, wv_2, wv_3 = 62.1, 61.82, 61.8, 61.67
wv_4, wv_5, wv_6, wv_7 = 61.03, 60.56, 60.79, 60.11
wv_8, wv_9, wv_10, wv_11 = 59.7, 59.44, 53.35, 52.25

In [None]:
# Tally the weighted votes of the twelve voters for every record.
list_data = [
    data_0, data_1, data_2, data_3, data_4, data_5,
    data_6, data_7, data_8, data_9, data_10, data_11,
]
list_wv = [
    wv_0, wv_1, wv_2, wv_3, wv_4, wv_5,
    wv_6, wv_7, wv_8, wv_9, wv_10, wv_11,
]
list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers are not counted.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    top = max(board.values()) if board else None
    tmp = [key for key, total in board.items() if total == top]
    list_aaa.append(tmp)

In [None]:
# Convert the winning keys back into the submission format.
for i, winners in enumerate(list_aaa):
    data_a = []
    if len(winners) == 0:
        # Nothing was voted for this record: data_final keeps what it holds.
        continue
    if winners[0] != '':
        # Only the first winner is used; split "a$b,c$d" back into pairs.
        for piece in winners[0].split(','):
            fields = piece.split('$')
            data_a.append([fields[0], fields[1]])
    data_final[0][i]['annotation'] = data_a

In [None]:
# Write the voted records (the first parsed document of data_final) to the
# result file as a single JSON document.
with open("./62.87_1117.json", "w",encoding='utf-8') as f:
    json.dump(data_final[0], f)

## hard_voting로 만들어진 두개의 json을 hard_voting

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


data_base = _load_json_lines("./62.97_1109.json")
data_plus = _load_json_lines("./62.87_1117.json")

# Keep data_base's answer wherever it has one; records whose annotation list is
# empty take the annotation of the record at the same index in data_plus.
for k, record in enumerate(data_base[0]):
    if len(record['annotation']) == 0:
        record['annotation'] = data_plus[0][k]['annotation']

with open("./63.16_2191.json", "w", encoding='utf-8') as f:
    json.dump(data_base[0], f)

## soft_voting의 json 1개, hard_voting의 json 1개, 단일 모델 json 1개, 총 3개 hard_voting

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Weighted hard voting over three prediction files.
# Only element [0] of each loaded list (the first line's document) is used; it
# is indexed per record, and each record's 'annotation' is a list of entries
# whose first two items are concatenated as strings below.
data_0 = _load_json_lines('./high_score_5_softvoting_62.54.json')
data_1 = _load_json_lines('./63.16_2191.json')
data_2 = _load_json_lines(path_10)

# Independent second load of the first voter; it receives the voted annotations.
data_final = _load_json_lines('./high_score_5_softvoting_62.54.json')

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1

list_data = [data_0, data_1, data_2]
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Serialise the whole annotation list as "a$b,c$d" so it can be a dict key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty predictions cast no vote.
        if annotation != '':
            board[annotation] += weight

    # Keep every key tied for the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

for idx, winners in enumerate(list_aaa):
    # No voter predicted anything: leave the record exactly as loaded.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    data_final[0][idx]['annotation'] = data_a

# json.dump escapes non-ASCII by default, so the explicit encoding only makes
# the write agree with the utf-8 reads used throughout the notebook.
with open("./63.57_3579.json", "w", encoding='utf-8') as f:
    json.dump(data_final[0], f)

## 단일 모델로 만든 특정 json 3개 hard_voting

### path_11,12,19

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Weighted hard voting over three prediction files.
# Only element [0] of each loaded list (the first line's document) is used; it
# is indexed per record, and each record's 'annotation' is a list of entries
# whose first two items are concatenated as strings below.
data_0 = _load_json_lines(path_11)
data_1 = _load_json_lines(path_12)
data_2 = _load_json_lines(path_19)

# Independent second load of the first voter; it receives the voted annotations.
data_final = _load_json_lines(path_11)

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1

list_data = [data_0, data_1, data_2]
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Serialise the whole annotation list as "a$b,c$d" so it can be a dict key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty predictions cast no vote.
        if annotation != '':
            board[annotation] += weight

    # Keep every key tied for the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

for idx, winners in enumerate(list_aaa):
    # No voter predicted anything: leave the record exactly as loaded.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    data_final[0][idx]['annotation'] = data_a

# json.dump escapes non-ASCII by default, so the explicit encoding only makes
# the write agree with the utf-8 reads of this file elsewhere in the notebook.
with open("./62.98_c3a.json", "w", encoding='utf-8') as f:
    json.dump(data_final[0], f)

### path_12,13,14

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Weighted hard voting over three prediction files.
# Only element [0] of each loaded list (the first line's document) is used; it
# is indexed per record, and each record's 'annotation' is a list of entries
# whose first two items are concatenated as strings below.
data_0 = _load_json_lines(path_12)
data_1 = _load_json_lines(path_13)
data_2 = _load_json_lines(path_14)

# Independent second load of the first voter; it receives the voted annotations.
data_final = _load_json_lines(path_12)

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1

list_data = [data_0, data_1, data_2]
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Serialise the whole annotation list as "a$b,c$d" so it can be a dict key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty predictions cast no vote.
        if annotation != '':
            board[annotation] += weight

    # Keep every key tied for the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

for idx, winners in enumerate(list_aaa):
    # No voter predicted anything: leave the record exactly as loaded.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    data_final[0][idx]['annotation'] = data_a

# json.dump escapes non-ASCII by default, so the explicit encoding only makes
# the write agree with the utf-8 reads of this file elsewhere in the notebook.
with open("./62.97_3ed.json", "w", encoding='utf-8') as f:
    json.dump(data_final[0], f)

## soft_voting의 json 1개, hard_voting의 특정 json 2개, 단일 모델의 특정 json 2개, 총 5개 hard_voting

### high_score_5_softvoting_62.54, 63.16_2191, 62.98_c3a, path_24,9

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# The five prediction sets that vote.
data_0 = _load_json_lines('./high_score_5_softvoting_62.54.json')
data_1 = _load_json_lines('./63.16_2191.json')
data_2 = _load_json_lines('./62.98_c3a.json')
data_3 = _load_json_lines(path_24)
data_4 = _load_json_lines(path_9)

# Structure that receives the voted annotations. It comes from a file that is
# not one of the five voters loaded above.
data_final = _load_json_lines('./62.97_3ed.json')


In [None]:
# Voting weights for the five voters.
wv_0, wv_1, wv_2 = 62.1, 61.67, 52.25
wv_3, wv_4 = 53.35, 59.44

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_0, data_1, data_2, data_3, data_4]
list_wv = [wv_0, wv_1, wv_2, wv_3, wv_4]
list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers cast no vote.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    top = max(board.values()) if board else None
    tmp = [key for key, total in board.items() if total == top]
    list_aaa.append(tmp)

# Rebuild the annotation pairs from the first winning key of each record.
for i, winners in enumerate(list_aaa):
    data_a = []
    if len(winners) == 0:
        # Nothing was voted for this record: data_final keeps what it holds.
        continue
    if winners[0] != '':
        for piece in winners[0].split(','):
            fields = piece.split('$')
            data_a.append([fields[0], fields[1]])
    data_final[0][i]['annotation'] = data_a

with open("./63.60_21312.json", "w") as f:
    json.dump(data_final[0], f)

### high_score_5_softvoting_62.54, 63.16_2191, 62.98_c3a, path_24,12

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# The five prediction sets that vote.
data_0 = _load_json_lines('./high_score_5_softvoting_62.54.json')
data_1 = _load_json_lines('./63.16_2191.json')
data_2 = _load_json_lines('./62.98_c3a.json')
data_3 = _load_json_lines(path_24)
data_4 = _load_json_lines(path_12)

# Structure that receives the voted annotations. It comes from a file that is
# not one of the five voters loaded above.
data_final = _load_json_lines('./62.97_3ed.json')

In [None]:
# Voting weights for the five voters.
wv_0, wv_1, wv_2 = 62.1, 61.67, 52.25
wv_3, wv_4 = 53.35, 59.44

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_0, data_1, data_2, data_3, data_4]
list_wv = [wv_0, wv_1, wv_2, wv_3, wv_4]
list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers cast no vote.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    top = max(board.values()) if board else None
    tmp = [key for key, total in board.items() if total == top]
    list_aaa.append(tmp)

# Rebuild the annotation pairs from the first winning key of each record.
for i, winners in enumerate(list_aaa):
    data_a = []
    if len(winners) == 0:
        # Nothing was voted for this record: data_final keeps what it holds.
        continue
    if winners[0] != '':
        for piece in winners[0].split(','):
            fields = piece.split('$')
            data_a.append([fields[0], fields[1]])
    data_final[0][i]['annotation'] = data_a

with open("./63.41_21313.json", "w") as f:
    json.dump(data_final[0], f)

### high_score_5_softvoting_62.54, 63.16_2191, 62.97_3ed, path_24,9

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# The five prediction sets that vote.
data_0 = _load_json_lines('./high_score_5_softvoting_62.54.json')
data_1 = _load_json_lines('./63.16_2191.json')
data_2 = _load_json_lines('./62.97_3ed.json')
data_3 = _load_json_lines(path_24)
data_4 = _load_json_lines(path_9)

# Structure that receives the voted annotations. It comes from a file that is
# not one of the five voters loaded above.
data_final = _load_json_lines('./62.98_c3a.json')

In [None]:
# Voting weights for the five voters.
wv_0, wv_1, wv_2 = 62.1, 61.67, 52.25
wv_3, wv_4 = 53.35, 59.44

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_0, data_1, data_2, data_3, data_4]
list_wv = [wv_0, wv_1, wv_2, wv_3, wv_4]
list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers are not counted.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    top = max(board.values()) if board else None
    tmp = [key for key, total in board.items() if total == top]
    list_aaa.append(tmp)

# Rebuild the annotation pairs from the first winning key of each record.
for i, winners in enumerate(list_aaa):
    data_a = []
    if len(winners) == 0:
        # Nothing was voted for this record: data_final keeps what it holds.
        continue
    if winners[0] != '':
        for piece in winners[0].split(','):
            fields = piece.split('$')
            data_a.append([fields[0], fields[1]])
    data_final[0][i]['annotation'] = data_a

with open("./63.49_21212.json", "w") as f:
    json.dump(data_final[0], f)

### pt_5_softvoting_62.15, 63.16_2191, 62.98_c3a, path_24,9

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# The five prediction sets that vote.
data_0 = _load_json_lines('./pt_5_softvoting_62.15.json')
data_1 = _load_json_lines('./63.16_2191.json')
data_2 = _load_json_lines('./62.98_c3a.json')
data_3 = _load_json_lines(path_24)
data_4 = _load_json_lines(path_9)

# Structure that receives the voted annotations. It comes from a file that is
# not one of the five voters loaded above.
data_final = _load_json_lines('./62.97_3ed.json')

In [None]:
# Voting weights for the five voters.
wv_0, wv_1, wv_2 = 62.1, 61.67, 52.25
wv_3, wv_4 = 53.35, 59.44

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_0, data_1, data_2, data_3, data_4]
list_wv = [wv_0, wv_1, wv_2, wv_3, wv_4]
list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers are not counted.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    top = max(board.values()) if board else None
    tmp = [key for key, total in board.items() if total == top]
    list_aaa.append(tmp)

# Rebuild the annotation pairs from the first winning key of each record.
for i, winners in enumerate(list_aaa):
    data_a = []
    if len(winners) == 0:
        # Nothing was voted for this record: data_final keeps what it holds.
        continue
    if winners[0] != '':
        for piece in winners[0].split(','):
            fields = piece.split('$')
            data_a.append([fields[0], fields[1]])
    data_final[0][i]['annotation'] = data_a

with open("./63.46_11312.json", "w") as f:
    json.dump(data_final[0], f)

## 단일 모델로 만든 특정 json 3개 hard_voting

### path_12,13,17

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Weighted hard voting over three prediction files.
# Only element [0] of each loaded list (the first line's document) is used; it
# is indexed per record, and each record's 'annotation' is a list of entries
# whose first two items are concatenated as strings below.
data_0 = _load_json_lines(path_12)
data_1 = _load_json_lines(path_13)
data_2 = _load_json_lines(path_17)

# Independent second load of the first voter; it receives the voted annotations.
data_final = _load_json_lines(path_12)

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1

list_data = [data_0, data_1, data_2]
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Serialise the whole annotation list as "a$b,c$d" so it can be a dict key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty predictions cast no vote.
        if annotation != '':
            board[annotation] += weight

    # Keep every key tied for the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

for idx, winners in enumerate(list_aaa):
    # No voter predicted anything: leave the record exactly as loaded.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    data_final[0][idx]['annotation'] = data_a

# json.dump escapes non-ASCII by default, so the explicit encoding only makes
# the write agree with the utf-8 reads used throughout the notebook.
with open("./62.9_3e4.json", "w", encoding='utf-8') as f:
    json.dump(data_final[0], f)

### path_17,12,16

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Weighted hard voting over three prediction files.
# Only element [0] of each loaded list (the first line's document) is used; it
# is indexed per record, and each record's 'annotation' is a list of entries
# whose first two items are concatenated as strings below.
data_0 = _load_json_lines(path_17)
data_1 = _load_json_lines(path_12)
data_2 = _load_json_lines(path_16)

# Independent second load of the first voter; it receives the voted annotations.
data_final = _load_json_lines(path_17)

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1

list_data = [data_0, data_1, data_2]
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []

for i in range(len(data_0[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Serialise the whole annotation list as "a$b,c$d" so it can be a dict key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty predictions cast no vote.
        if annotation != '':
            board[annotation] += weight

    # Keep every key tied for the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

for idx, winners in enumerate(list_aaa):
    # No voter predicted anything: leave the record exactly as loaded.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    data_final[0][idx]['annotation'] = data_a

# json.dump escapes non-ASCII by default, so the explicit encoding only makes
# the write agree with the utf-8 reads used throughout the notebook.
with open("./62.98_435.json", "w", encoding='utf-8') as f:
    json.dump(data_final[0], f)

## hard_voting 결과물 중 특정 json 7개 선정 및 선정된 7개의 json 중에서 5개 hard_voting

7개의 json 번호 지정

In [None]:
def _load_json_lines(path):
    """Parse every line of *path* as a JSON document, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp]


# Hard-voting inputs: seven result files written by the earlier voting cells.
# They are read as utf-8 like every other load in this notebook instead of
# relying on the platform default encoding.
data_1 = _load_json_lines("./63.16_2191.json")
data_2 = _load_json_lines('./62.87_1117.json')
data_3 = _load_json_lines("./62.9_3e4.json")
data_4 = _load_json_lines('./62.98_c3a.json')
data_5 = _load_json_lines('./62.97_3ed.json')
data_6 = _load_json_lines('./62.97_1109.json')
data_7 = _load_json_lines('./62.98_435.json')

### data_1,2,4,5,7 hard_voting

In [None]:
# Voting weights wv_1..wv_7; each value equals the numeric prefix of the file
# name loaded into the matching data_1..data_7.
wv_1, wv_2, wv_3, wv_4 = 63.16, 62.87, 62.9, 62.98
wv_5, wv_6, wv_7 = 62.97, 62.97, 62.98

In [None]:
# JSON structure that will hold the voting result, read inside a with-block so
# the file handle is closed again.
with open('./62.98_435.json', 'r', encoding='utf-8') as fp:
    data_final = [json.loads(line) for line in fp]

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_1, data_2, data_4, data_5, data_7]
list_wv = [wv_1, wv_2, wv_4, wv_5, wv_7]
list_aaa = []

for i in range(len(data_1[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers cast no vote.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

# Vote into a private copy. data_final is a shared global that other voting
# cells also read as their template; writing this vote's winners into it would
# change the records those cells fall back to when none of their voters answers.
voted_records = copy.deepcopy(data_final[0])

for i, winners in enumerate(list_aaa):
    # No voter predicted anything: the template record is kept as is.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    voted_records[i]['annotation'] = data_a

with open("./63.47_12457.json", "w", encoding='utf-8') as f:
    json.dump(voted_records, f)

### data_1,2,3,4,7 hard_voting

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_1, data_2, data_3, data_4, data_7]
list_wv = [wv_1, wv_2, wv_3, wv_4, wv_7]
list_aaa = []

for i in range(len(data_1[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers cast no vote.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

# Load a fresh template instead of reusing the shared data_final global, so
# records that receive no vote here never depend on what earlier cells may have
# written into it. Same file the data_final cell of this section reads.
with open('./62.98_435.json', 'r', encoding='utf-8') as fp:
    voted_records = [json.loads(line) for line in fp][0]

for i, winners in enumerate(list_aaa):
    # No voter predicted anything: the template record is kept as is.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    voted_records[i]['annotation'] = data_a

with open("./63.44_12347.json", "w", encoding='utf-8') as f:
    json.dump(voted_records, f)

### data_2,4,5,6,7 hard_voting

In [None]:
# Find the highest-weighted answer per record.
# (Original note: if the answers contain empty values, this can be changed by
# commenting out the filter in the middle.)
list_data = [data_2, data_4, data_5, data_6, data_7]
list_wv = [wv_2, wv_4, wv_5, wv_6, wv_7]
list_aaa = []

# data_1 is not one of the voters here; it only supplies the record count.
for i in range(len(data_1[0])):
    board = defaultdict(int)

    for voter, weight in zip(list_data, list_wv):
        # Flatten the record's annotation list into one "a$b,c$d" key.
        annotation = ','.join(
            entry[0] + '$' + entry[1] for entry in voter[0][i]['annotation']
        )
        # Empty answers cast no vote.
        if annotation != '':
            board[annotation] += weight

    # All keys sharing the highest total weight, in first-seen order.
    best = max(board.values(), default=None)
    tmp = [key for key, total in board.items() if total == best]
    list_aaa.append(tmp)

# Load a fresh template instead of reusing the shared data_final global, so
# records that receive no vote here never depend on what earlier cells may have
# written into it. Same file the data_final cell of this section reads.
with open('./62.98_435.json', 'r', encoding='utf-8') as fp:
    voted_records = [json.loads(line) for line in fp][0]

for i, winners in enumerate(list_aaa):
    # No voter predicted anything: the template record is kept as is.
    if not winners:
        continue
    # Ties resolve to the first-seen key; undo the "a$b,c$d" serialisation.
    data_a = []
    for piece in winners[0].split(','):
        fields = piece.split('$')
        data_a.append([fields[0], fields[1]])
    voted_records[i]['annotation'] = data_a

with open("./63.54_24567.json", "w", encoding='utf-8') as f:
    json.dump(voted_records, f)

## hard_voting 결과물 중 특정 json 3개 선정 및 선정된 3개로 hard_voting

### 63.60_21312, 63.47_12457, 63.41_21313

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.60_21312.json', './63.47_12457.json', './63.41_21313.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.62_146.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.54_24567, 63.44_12347, 63.41_21313

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.54_24567.json', './63.44_12347.json', './63.41_21313.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.60_356.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.46_11312, 63.60_21312, 63.49_21212

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.46_11312.json', './63.60_21312.json', './63.49_21212.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.65_shd_hs_312.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.57_3579, 63.62_146, 63.60_356

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.57_3579.json', './63.62_146.json', './63.60_356.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.85_645.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.60_356, 63.65_shd_hs_312, 63.57_3579

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.60_356.json', './63.65_shd_hs_312.json', './63.57_3579.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.87_536.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.60_21312, 63.54_24567, 63.41_21313

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.60_21312.json', './63.54_24567.json', './63.41_21313.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.68_136.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.68_136, 63.65_shd_hs_312, 63.57_3579

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.68_136.json', './63.65_shd_hs_312.json', './63.57_3579.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.80_136.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

### 63.85_645, 63.87_536, 63.80_136

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.85_645.json', './63.87_536.json', './63.80_136.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.89_213.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

## hard_voting 특정 json 1개, 단일 모델 특정 json 1개, soft_voting 특정 json 1개 hard_voting

In [None]:
# Weighted hard voting over three prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.89_213.json', './08.json', './pt_3_softvoting_61.15.json']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./63.90_132.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

# polarity_change

change 할 polarity gdown

In [None]:
!gdown 1596e_lhhEvTMbkscHc2ygypqA3hNbIVW

polarity_change 할 json 경로 설정

In [None]:
# Path of the prediction file whose contents are loaded by the polarity-change step
test_data_path = './63.90_132.json'

# Base model name handed to AutoTokenizer.from_pretrained
base_model = 'kykim/electra-kor-base'

polarity_change 함수

In [None]:
def test_sentiment_analysis_save(
        model_path='./data_polarity_epoch_10.pt',
        output_path='./63.90_10epoch'):
    """Run a polarity checkpoint over the file at ``test_data_path`` and save the result.

    Reads the module-level ``base_model``, ``special_tokens_dict``,
    ``test_data_path`` and ``device``.

    Args:
        model_path: Path of the polarity-classifier state dict to load.
        output_path: Destination passed to ``jsondump``.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    # The classifier is built with len(tokenizer), so the special tokens must be
    # added before the model is constructed.
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonload(test_data_path)

    polarity_model = ELECTRABaseClassifier(len(polarity_id_to_name), len(tokenizer))
    polarity_model.load_state_dict(torch.load(model_path, map_location=device))
    polarity_model.to(device)
    polarity_model.eval()

    # A deep copy is handed over so the loaded test_data object is not shared
    # with the predictor.
    pred_data = predict_from_korean_form_polarity(
        tokenizer, polarity_model, copy.deepcopy(test_data))

    jsondump(pred_data, output_path)

In [None]:
# Run the polarity-change step defined in the previous cell.
test_sentiment_analysis_save()

사용 완료한 polarity 제거

In [None]:
# Delete the checkpoint that is no longer needed.
remove_path = './data_polarity_epoch_10.pt'

# Attempt the removal directly instead of checking existence first, which
# avoids the gap between the check and the delete.
try:
    os.remove(remove_path)
except FileNotFoundError:
    # Already absent: nothing to clean up.
    pass

polarity gdown

In [None]:
!gdown 1s3XP91Vp8zqJs5jejz-5AEQNVFOT1Mev

변경된 polarity로 polarity_change 함수 실행

In [None]:
def test_sentiment_analysis_save(
        model_path='./data_polarity_negative_plus_epoch_3.pt',
        output_path='./63.90_negative_plus_3epoch'):
    """Run a polarity checkpoint over the file at ``test_data_path`` and save the result.

    Reads the module-level ``base_model``, ``special_tokens_dict``,
    ``test_data_path`` and ``device``.

    Args:
        model_path: Path of the polarity-classifier state dict to load.
        output_path: Destination passed to ``jsondump``.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    # The classifier is built with len(tokenizer), so the special tokens must be
    # added before the model is constructed.
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonload(test_data_path)

    polarity_model = ELECTRABaseClassifier(len(polarity_id_to_name), len(tokenizer))
    polarity_model.load_state_dict(torch.load(model_path, map_location=device))
    polarity_model.to(device)
    polarity_model.eval()

    # A deep copy is handed over so the loaded test_data object is not shared
    # with the predictor.
    pred_data = predict_from_korean_form_polarity(
        tokenizer, polarity_model, copy.deepcopy(test_data))

    jsondump(pred_data, output_path)

In [None]:
# Run the polarity-change step defined in the previous cell.
test_sentiment_analysis_save()

사용 완료한 polarity 제거

In [None]:
# Delete the checkpoint that is no longer needed.
remove_path = './data_polarity_negative_plus_epoch_3.pt'

# Attempt the removal directly instead of checking existence first, which
# avoids the gap between the check and the delete.
try:
    os.remove(remove_path)
except FileNotFoundError:
    # Already absent: nothing to clean up.
    pass

사용할 base_model 변경

In [None]:
# Switch the base model name; test_sentiment_analysis_save reads this global
# when it loads the tokenizer.
base_model = 'beomi/KcELECTRA-base'

polarity gdown

In [None]:
!gdown 19TpczS4kFATxPGuqqiFt0E7zE-bJz86H

변경된 polarity로 polarity_change 함수 실행

In [None]:
def test_sentiment_analysis_save(
        model_path='./data_polarity_kcelectra_epoch_6.pt',
        output_path='./63.90_kcelectra_6epoch'):
    """Run a polarity checkpoint over the file at ``test_data_path`` and save the result.

    Reads the module-level ``base_model``, ``special_tokens_dict``,
    ``test_data_path`` and ``device``.

    Args:
        model_path: Path of the polarity-classifier state dict to load.
        output_path: Destination passed to ``jsondump``.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    # The classifier is built with len(tokenizer), so the special tokens must be
    # added before the model is constructed.
    tokenizer.add_special_tokens(special_tokens_dict)
    test_data = jsonload(test_data_path)

    polarity_model = ELECTRABaseClassifier(len(polarity_id_to_name), len(tokenizer))
    polarity_model.load_state_dict(torch.load(model_path, map_location=device))
    polarity_model.to(device)
    polarity_model.eval()

    # A deep copy is handed over so the loaded test_data object is not shared
    # with the predictor.
    pred_data = predict_from_korean_form_polarity(
        tokenizer, polarity_model, copy.deepcopy(test_data))

    jsondump(pred_data, output_path)

In [None]:
# Run the polarity-change step defined in the previous cell.
test_sentiment_analysis_save()

사용 완료한 polarity 제거

In [None]:
# Delete the checkpoints that are no longer needed.
for remove_path in ('./data_polarity_kcelectra_epoch_6.pt',
                    './data_polarity_epoch_12.pt'):
    # Attempt the removal directly instead of checking existence first, which
    # avoids the gap between the check and the delete.
    try:
        os.remove(remove_path)
    except FileNotFoundError:
        # Already absent: nothing to clean up.
        pass

# polarity_change 한 결과물들 총 3개로 hard_voting

In [None]:
# Weighted hard voting over the three polarity-changed prediction files.
# Inputs are listed in descending vote weight; the first one is also parsed a
# second time as the object that is modified and written out.
data_paths = ['./63.90_kcelectra_6epoch', './63.90_10epoch', './63.90_negative_plus_3epoch']

list_data = []
for data_path in data_paths:
    # `with` closes the handle; iterating a bare open() left it open.
    with open(data_path, 'r', encoding='utf-8') as f:
        list_data.append([json.loads(line) for line in f])
data_0, data_1, data_2 = list_data

with open(data_paths[0], 'r', encoding='utf-8') as f:
    data_final = [json.loads(line) for line in f]

wv_0 = 1.3
wv_1 = 1.2
wv_2 = 1.1
list_wv = [wv_0, wv_1, wv_2]

list_aaa = []  # per sample: every serialized annotation tied for the top score

for sample_idx in range(len(data_0[0])):
    board = defaultdict(int)  # serialized annotation -> accumulated weight

    for model_data, weight in zip(list_data, list_wv):
        # Join the first two fields of every annotation entry with '$',
        # and the entries with ',', to get a hashable key.
        annotation = ','.join(
            pair[0] + '$' + pair[1]
            for pair in model_data[0][sample_idx]['annotation']
        )
        # Models with an empty prediction cast no vote.
        if annotation:
            board[annotation] += weight

    # Take the maximum once instead of once per candidate.
    top_score = max(board.values()) if board else None
    list_aaa.append([key for key, score in board.items() if score == top_score])

for sample_idx, winners in enumerate(list_aaa):
    # No model voted: keep the annotation already present in data_final.
    if not winners:
        continue
    # Ties go to the key inserted first, i.e. the earliest-listed file.
    data_final[0][sample_idx]['annotation'] = [
        [fields[0], fields[1]]
        for fields in (part.split('$') for part in winners[0].split(','))
    ]

with open("./64.25.json", "w", encoding="utf-8") as f:
    json.dump(data_final[0], f)

## 완성된 json 파일을 jsonl화

In [None]:
# Load the final voted result and re-emit it with one JSON document per line.
row_data = jsonload('./64.25.json')

with open("final_potential.jsonl", mode="w", encoding="utf-8") as file:
    file.writelines(json.dumps(record) + "\n" for record in row_data)