#  구글 드라이브 마운트

In [None]:
# Mount Google Drive inside the Colab runtime; every data file read below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# 패키지 설치

In [None]:
!pip install git+https://github.com/ssut/py-hanspell.git
!pip install konlpy
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3 
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
# 데이터 처리 
import pandas as pd 
import numpy as np

# 시각화 
import seaborn as sns 
import matplotlib as mpl
import matplotlib.pyplot as plt 

# Plot options
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Font used for Hangul text in figures
mpl.rc('font', family='Malgun Gothic')
# Draw negative numbers with the ASCII hyphen instead of the Unicode minus
# sign (U+2212). Korean fonts commonly lack that glyph, in which case every
# negative tick label is rendered as an empty box.
mpl.rc('axes', unicode_minus=False)
# Render inline figures at retina resolution
set_matplotlib_formats('retina')

import re

from hanspell import spell_checker

from sklearn.model_selection import train_test_split

from konlpy.tag import Okt

# LSTM
import tensorflow 
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

from tensorflow.keras.optimizers import RMSprop, SGD
from tensorflow.keras.layers import Dropout, Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, confusion_matrix

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# 파일 불러오기

In [None]:
# Labelled source data: two Excel workbooks that are concatenated later on.
df_data1, df_data2 = (
    pd.read_excel(xlsx_path, engine='openpyxl')
    for xlsx_path in ('/content/drive/MyDrive/data1.xlsx',
                      '/content/drive/MyDrive/data2.xlsx')
)
# Stop-word file without a header row; its column is named '불용어'.
df_stopword = pd.read_csv('/content/drive/MyDrive/stopword.txt', names=['불용어'], header=None)

# Input workbook handled in the classification section of the notebook.
df_test = pd.read_excel('/content/drive/MyDrive/input_data.xlsx', engine='openpyxl')

# 모델 학습하기

In [None]:
# Merge both workbooks, order the rows by 'NO' and renumber the index
df = (
    pd.concat([df_data1, df_data2], sort=False)
    .sort_values(by=['NO'])
    .reset_index(drop=True)
)


# Separate rows with a single utterance label from rows that also fill '발화2'
cond1 = df['발화2'].isnull()
df_one, df_many = df[cond1], df[~cond1]


# Normalise the category label of the single-label rows
df1 = df_one[['NO', 'NO2', '발화', '발화1', '발화1의 우선순위']].reset_index(drop=True)
df1['수정된 분류'] = df1['발화1']

cond2 = df1['수정된 분류'] == '중립'
cond3 = df1['수정된 분류'] == '폐기'
df1_neutral = df1[cond2]
df1_discard = df1[cond3]

# Each neutral row is duplicated: one copy is filed under '칭찬', the other
# under '불만'. `assign` builds new frames, so df1_neutral stays untouched.
df1_neutral_good = df1_neutral.assign(**{'수정된 분류': '칭찬>기타>중립'})
df1_neutral_bad = df1_neutral.assign(**{'수정된 분류': '불만>기타>중립'})

# Non-neutral rows first, then the '칭찬' copies, then the '불만' copies
df1_1 = df1[~cond2]
df1_2 = pd.concat([df1_1, df1_neutral_good], sort=False)
df2 = pd.concat([df1_2, df1_neutral_bad], sort=False).reset_index(drop=True)

# Two-level labels are padded to three levels by repeating the second level
two_to_three_levels = {
    '칭찬>고객서비스': '칭찬>고객서비스>고객서비스',
    '칭찬>삼성카드': '칭찬>삼성카드>삼성카드',
    '칭찬>기타': '칭찬>기타>기타',
    '불만>고객서비스': '불만>고객서비스>고객서비스',
    '불만>삼성카드': '불만>삼성카드>삼성카드',
    '불만>기타': '불만>기타>기타',
}
for short_label, full_label in two_to_three_levels.items():
    df2.loc[df2['수정된 분류'] == short_label, '수정된 분류'] = full_label


# Labelling
# Split the "level1>level2>level3" label into one column per level.
df3 = df2.copy()
df3[['분류1','분류2','분류3']] = df3['수정된 분류'].str.split('>',n=3, expand=True)

df4 = df3.copy()
df4['document'] = df4['발화']

dic1 = {'칭찬' : 0, '불만' : 1}
dic2 = {'고객서비스' : 0, '삼성카드' : 1, '기타' : 2}
dic3_0 = {'상담원' : 0, '상담시스템' : 1, '고객서비스' : 2}
dic3_1 = {'혜택' : 0, '할부금융상품' : 1, '커뮤니티서비스' : 2, '카드이용/결제' : 3, 
          '카드상품' : 4, '청구입금' : 5, '심사/한도' : 6, '생활편의서비스' : 7, 
          '상담/채널' : 8, '리스렌탈상품' : 9, '라이프서비스' : 10, '금융상품' : 11, 
          '고객정보관리' : 12, '가맹점매출/승인' : 13, '가맹점대금' : 14, '가맹점계약' : 15, '삼성카드' : 16}
dic3_2 = {'기타' : 0, '중립' : 1, '폐기' : 2}

# Level-3 codes are only unique within their level-2 category, so the
# dictionary used for '분류3' is selected by the row's '분류2' value.
dic3_by_level2 = {'고객서비스' : dic3_0, '삼성카드' : dic3_1, '기타' : dic3_2}

# Column-wise mapping replaces the former per-row .loc loop (one scalar
# write per cell), which scaled poorly with the number of rows.
df4['label1'] = df4['분류1'].map(dic1)
df4['label2'] = df4['분류2'].map(dic2)
df4['label3'] = np.nan
for level2_name, level3_codes in dic3_by_level2.items():
    in_category = df4['분류2'] == level2_name
    df4.loc[in_category, 'label3'] = df4.loc[in_category, '분류3'].map(level3_codes)

# An unknown category leaves NaN behind; report the offending rows instead of
# failing later inside astype with an unrelated message.
unlabelled = df4[['label1', 'label2', 'label3']].isnull().any(axis=1)
if unlabelled.any():
    raise KeyError(
        'Unknown category label(s): '
        + str(df4.loc[unlabelled, '수정된 분류'].unique().tolist())
    )

df4[['label1', 'label2', 'label3']] = df4[['label1', 'label2', 'label3']].astype('int')


# Preparation for spell checking:
# every character that is not a word character becomes a space
df5 = df4[df4.columns]
df5["document"] = df5["document"].str.replace(r'[^\w]', r' ', regex=True)

# Runs of spaces are collapsed into a single space
df6 = df5[df5.columns]
new_document = [re.sub(' +', ' ', sent) for sent in df6['document']]

df6['document'] = new_document

In [None]:
import time
from IPython.display import display, clear_output

df7 = df6[df6.columns]

start = time.time()  # reference point for the elapsed-time report

# Send every document through the spell checker and keep the corrected text,
# refreshing a two-line progress report after each document.
changed_document = []
total_documents = len(df7['document'])
i = 0
for i, sent in enumerate(df7['document'], start=1):

    clear_output(wait=True)

    changed_document.append(spell_checker.check(sent).checked)

    print('진행 상황 : ', i,'/',total_documents)
    print('진행 시간 : ', round((time.time() - start),2))

print('총 걸린 시간 : ', round((time.time() - start),2), '초')

df7['document'] = changed_document

In [None]:
# Level-1 dataset: every row whose level-3 label is not '중립'
cond11 = df7['분류3'] == '중립'
df_class_1 = df7.loc[~cond11, ['document', 'label1']]

# Level-2 datasets split by the level-1 result ('칭찬' / '불만')
cond13 = df7['분류1'] == '칭찬'
cond14 = df7['분류1'] == '불만'
df_class_2_0 = df7.loc[cond13, ['document', 'label2']]
df_class_2_1 = df7.loc[cond14, ['document', 'label2']]

# Level-2 dataset: the '칭찬' copy of each neutral row is left out
cond12 = cond11 & cond13
df_class_2 = df7.loc[~cond12, ['document', 'label2']]

# Level-3 datasets, one per level-2 category; the '기타' set again leaves out
# the '칭찬' copy of neutral rows
cond15 = df7['분류2'] == '고객서비스'
cond16 = df7['분류2'] == '삼성카드'
cond17 = df7['분류2'] == '기타'
cond18 = (df7['분류1'] == '칭찬') & (df7['분류3'] == '중립')
df_class_3_0 = df7.loc[cond15, ['document', 'label3']]
df_class_3_1 = df7.loc[cond16, ['document', 'label3']]
df_class_3_2 = df7.loc[cond17 & ~cond18, ['document', 'label3']]
# print(len(df_class_3_0), len(df_class_3_1), len(df_class_3_2))
# print(len(df_class_3_0), len(df_class_3_1), len(df_class_3_2))

In [None]:
# Fix the random seed
seed = 6124
np.random.seed(seed)

# Every split holds out 20% of the rows and uses the same seed
split_options = {'test_size': 0.2, 'random_state': seed}

x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(
    df_class_1['document'], df_class_1['label1'], **split_options)

x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    df_class_2['document'], df_class_2['label2'], **split_options)

x_train_3_0, x_test_3_0, y_train_3_0, y_test_3_0 = train_test_split(
    df_class_3_0['document'], df_class_3_0['label3'], **split_options)

x_train_3_1, x_test_3_1, y_train_3_1, y_test_3_1 = train_test_split(
    df_class_3_1['document'], df_class_3_1['label3'], **split_options)

x_train_3_2, x_test_3_2, y_train_3_2, y_test_3_2 = train_test_split(
    df_class_3_2['document'], df_class_3_2['label3'], **split_options)

In [None]:
# 불용어 및 특수 기호 처리함수 구성 
def stopword_function(X):
    """Tokenise every text in ``X`` and drop stop words.

    Each string is stripped of stand-alone Hangul jamo and of characters that
    are neither word characters nor whitespace, split into stemmed morphemes
    with Okt, and filtered against the module-level ``stopword_set`` (looked
    up when the function is called).

    Args:
        X: iterable of documents.

    Returns:
        A list with one token list per element of ``X``; elements that are
        not strings yield an empty list.
    """
    # Raw-string patterns (no invalid "\w" escapes), compiled once per call
    # rather than once per document.
    jamo_pattern = re.compile(r"([ㄱ-ㅎㅏ-ㅣ]+)")
    symbol_pattern = re.compile(r"[^\w\s]")

    okt = Okt()
    clean_train_review = []
    for i in X:
        if isinstance(i, str):
            # Remove stand-alone consonants / vowels
            review_text = jamo_pattern.sub("", i)
            # Remove special characters
            review_text = symbol_pattern.sub("", review_text)
            # Tokenise and stem
            word_text = okt.morphs(review_text, stem=True)
            # Drop stop words
            word_text = [token for token in word_text if token not in stopword_set]

            clean_train_review.append(word_text)
        else:
            # Non-text values (e.g. numbers) contribute an empty token list
            clean_train_review.append([])

    return clean_train_review


# 이중 리스트에서 해당 요소와 그 앞뒤 요소 출력하는 함수
def find_letter_with_before_and_after(double_list, x):
    """Print the first occurrence of ``x`` in each row with its neighbours.

    For every row of ``double_list`` that contains ``str(x)``, the element
    before it (if any), the element itself and the element after it (if any)
    are printed on one line. A final line reports how many rows matched.

    Args:
        double_list: list of token lists.
        x: value to look for; compared as ``str(x)``.
    """
    target = str(x)
    matches = 0
    for text in double_list:
        if target not in text:
            continue
        matches += 1
        position = text.index(target)
        # Slicing clamps at both ends, so a match that is the first, last or
        # only element no longer indexes outside the row.
        print(*text[max(position - 1, 0):position + 2])
    print('총, ', matches, '개')

    
# 리스트의 특정 요소 인덱스 모두 찾는 함수
def find_index(data, target):
    """Return every position at which ``target`` occurs in ``data``.

    Args:
        data: sequence offering ``index(value, start)`` (list, tuple, str).
        target: value to search for.

    Returns:
        List of positions in ascending order; empty when there is no match.
    """
    res = []
    start = 0
    while True:
        try:
            found = data.index(target, start)
        except ValueError:
            # index() signals "no further occurrence" with ValueError; any
            # other exception is a real error and is allowed to propagate.
            break
        res.append(found)
        start = found + 1
    return res


# 리스트의 특정 요소 모두 삭제하는 함수
def remove_values_from_list(the_list, val):
    """Return a new list holding the items of ``the_list`` that differ from ``val``."""
    return list(filter(lambda item: item != val, the_list))


# 이중 리스트에서 두 단어를 합치는 함수 
def combine_two_words(double_list, word1, word2):
    """Merge each adjacent ``word1``, ``word2`` pair into a single token.

    ``double_list`` is modified in place and also returned. In every row that
    contains ``word1``, each occurrence directly followed by ``word2`` is
    replaced by the concatenation of both; the second slot is marked with a
    placeholder token and the placeholders are removed afterwards.
    """
    first, second = str(word1), str(word2)
    placeholder = '삭제할 문자'
    for row_no, tokens in enumerate(double_list):
        if first not in tokens:
            continue
        last_position = len(tokens) - 1
        for position in find_index(tokens, first):
            if position < last_position and tokens[position + 1] == second:
                tokens[position] = first + second
                tokens[position + 1] = placeholder

        double_list[row_no] = remove_values_from_list(tokens, placeholder)

    return double_list


# 이중 리스트에서 한 단어 삭제하는 함수
def remove_one_word(double_list, word):
    """Drop every occurrence of ``word`` from each row.

    ``double_list`` is modified in place (affected rows are replaced by
    filtered copies) and also returned.
    """
    target = str(word)
    for row_no, tokens in enumerate(double_list):
        if target in tokens:
            double_list[row_no] = remove_values_from_list(tokens, target)

    return double_list


# 이중 리스트에서 한 단어 변경하는 함수
def change_one_word(double_list, word, changed_word):
    """Replace every occurrence of ``word`` by ``changed_word`` in each row.

    The rows of ``double_list`` are edited in place; the same outer list is
    returned.
    """
    old_token, new_token = str(word), str(changed_word)
    for tokens in double_list:
        if old_token in tokens:
            for position in find_index(tokens, old_token):
                tokens[position] = new_token

    return double_list


# 추가 전처리
def additional_cleaning(double_list):
    """Apply the fixed sequence of token repairs to ``double_list``.

    The steps run strictly in the listed order: adjacent fragments are merged
    back into one token, some merged tokens are rewritten to another word,
    and the token 'OOO' is removed. The result of the last step is returned.
    """
    steps = (
        # 원
        (combine_two_words, ('상담', '원')),
        (combine_two_words, ('상당', '원')),
        (change_one_word, ('상당원', '상담원')),
        (combine_two_words, ('안내', '원')),
        (combine_two_words, ('원', '치')),
        (change_one_word, ('원치', '원하지')),
        (combine_two_words, ('상사', '원')),
        (change_one_word, ('상사원', '상담원')),
        (combine_two_words, ('상대', '원')),
        (change_one_word, ('상대원', '상담원')),
        (combine_two_words, ('칙', '원')),
        (change_one_word, ('칙원', '직원')),
        # NOTE(review): this repeats the merge two steps above; kept as-is
        (combine_two_words, ('칙', '원')),
        (change_one_word, ('결시', '연결')),
        (change_one_word, ('간이', '시간')),
        # 지
        (combine_two_words, ('지', '원금')),
        (combine_two_words, ('지', '연')),
        (combine_two_words, ('알', '지')),
        (combine_two_words, ('지', '양')),
        (combine_two_words, ('지', '양해')),
        (change_one_word, ('지양해', '지양')),
        # 대
        (combine_two_words, ('대', '기')),
        (combine_two_words, ('대', '체적')),
        (combine_two_words, ('대', '출사')),
        (combine_two_words, ('대', '화법')),
        (combine_two_words, ('대', '금도')),
        (combine_two_words, ('현', '대')),
        # OOO
        (remove_one_word, ('OOO',)),
    )

    for operation, arguments in steps:
        double_list = operation(double_list, *arguments)

    return double_list


# 이중 리스트 복제 함수
def copy_double_list(double_list):
    """Return a new list of new rows holding the same items as ``double_list``."""
    return [list(row) for row in double_list]

In [None]:
# Stop words as a set; stopword_function reads this global
stopword_set = set(df_stopword['불용어'].values.tolist())

# Training splits: tokenise + stop-word filter, then the extra token repairs
x_train_1_clean = additional_cleaning(stopword_function(x_train_1))

x_train_2_clean = additional_cleaning(stopword_function(x_train_2))

x_train_3_0_clean = additional_cleaning(stopword_function(x_train_3_0))
x_train_3_1_clean = additional_cleaning(stopword_function(x_train_3_1))
x_train_3_2_clean = additional_cleaning(stopword_function(x_train_3_2))


# Hold-out splits go through the same two steps
x_test_1_clean = additional_cleaning(stopword_function(x_test_1))

x_test_2_clean = additional_cleaning(stopword_function(x_test_2))

x_test_3_0_clean = additional_cleaning(stopword_function(x_test_3_0))
x_test_3_1_clean = additional_cleaning(stopword_function(x_test_3_1))
x_test_3_2_clean = additional_cleaning(stopword_function(x_test_3_2))

In [None]:
tensorflow.compat.v1.set_random_seed(196)


def _encode_training_texts(texts, labels, vocab_size, max_length):
    """Fit a Tokenizer on ``texts`` and encode texts and labels.

    Returns the fitted tokenizer, the integer sequences, the sequences
    post-padded to ``max_length`` and the one-hot encoded labels.
    """
    tokenizer = Tokenizer(num_words = vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen = max_length, padding = 'post')
    one_hot = to_categorical(labels.astype(int))
    return tokenizer, sequences, padded, one_hot


# Level 1
vocab_size_1 = 2000
max_length_1 = 80
model_token_1, x_sequences_1, train_x_1, train_y_1 = _encode_training_texts(
    x_train_1_clean, y_train_1, vocab_size_1, max_length_1)


# Level 2
vocab_size_2 = 2000
max_length_2 = 80
model_token_2, x_sequences_2, train_x_2, train_y_2 = _encode_training_texts(
    x_train_2_clean, y_train_2, vocab_size_2, max_length_2)


# Level 3, one encoder per level-2 category
vocab_size_3_0 = 2000
max_length_3_0 = 80
model_token_3_0, x_sequences_3_0, train_x_3_0, train_y_3_0 = _encode_training_texts(
    x_train_3_0_clean, y_train_3_0, vocab_size_3_0, max_length_3_0)

vocab_size_3_1 = 2000
max_length_3_1 = 80
model_token_3_1, x_sequences_3_1, train_x_3_1, train_y_3_1 = _encode_training_texts(
    x_train_3_1_clean, y_train_3_1, vocab_size_3_1, max_length_3_1)

vocab_size_3_2 = 2000
max_length_3_2 = 80
model_token_3_2, x_sequences_3_2, train_x_3_2, train_y_3_2 = _encode_training_texts(
    x_train_3_2_clean, y_train_3_2, vocab_size_3_2, max_length_3_2)

In [None]:
# Encode the level-2 hold-out texts.
# The tokenizer fitted on the training texts (model_token_2) is reused: a
# tokenizer fitted on the test texts assigns its own word indices, so the
# resulting sequences would not match the vocabulary a model was trained on.
vocab_size_2 = 2000
model_token_2_t = model_token_2  # name kept for later cells; same fitted tokenizer
x_sequences_2_t = model_token_2_t.texts_to_sequences(x_test_2_clean)
max_length_2 = 80
test_x_2 = pad_sequences(x_sequences_2_t, maxlen = max_length_2, padding = 'post')
test_y_2 = to_categorical(y_test_2.astype(int))

In [None]:
def metrics_plot(history):
    """Plot accuracy and loss curves of a training run.

    Args:
        history: object whose ``history`` mapping provides the per-epoch
            lists 'acc', 'val_acc', 'loss' and 'val_loss'.

    Returns:
        The return value of ``plt.show()``.
    """
    record = history.history
    acc, val_acc = record['acc'], record['val_acc']
    loss, val_loss = record['loss'], record['val_loss']
    epochs = range(1, len(acc) + 1)

    def _draw(title, train_values, train_label, val_values, val_label):
        # Training and validation curve of one metric on the current figure
        plt.plot(epochs, train_values, label=train_label)
        plt.plot(epochs, val_values, label=val_label)
        plt.title(title)
        plt.legend()

    # Accuracy goes on the current figure, loss on a new one
    _draw('Accuracy Score', acc, 'Train Acc', val_acc, 'Validation Acc')
    plt.figure()
    _draw('Loss Score', loss, 'Train Loss', val_loss, 'Validation Loss')

    return plt.show()

In [None]:
# 1차 분류 모델

# Use the first GPU when one is available, otherwise fall back to the CPU so
# the cells below still run on a runtime without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bertmodel, vocab = get_pytorch_kobert_model()

# Default KoBERT tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

class BERTDataset(Dataset):
    """Dataset pairing BERT-transformed sentences with integer labels.

    Args:
        dataset: iterable of records (indexable sequences).
        sent_idx: position of the sentence inside a record.
        label_idx: position of the label inside a record.
        bert_tokenizer: tokenizer handed to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = []
        self.labels = []
        for record in dataset:
            self.sentences.append(to_bert_input([record[sent_idx]]))
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        # Transformed sentence fields followed by the label; the training
        # loops unpack each item as (token_ids, valid_length, segment_ids, label).
        label_part = (self.labels[i], )
        return self.sentences[i] + label_part

    def __len__(self):
        return len(self.labels)

In [None]:
# Build [utterance, label] records for the level-1 classifier.
# x_train_1_clean is already tokenised and stop-word filtered; the level-1
# model uses the BERT tokenizer instead, so the untokenised x_train_1 /
# y_train_1 (and their test counterparts) are used here.

train_1_list = [[q, str(label)] for q, label in zip(x_train_1, y_train_1)]

test_1_list = [[q, str(label)] for q, label in zip(x_test_1, y_test_1)]

In [None]:
# Hyper-parameters
max_len = 64  # per the original note: BERT does not learn from tokens beyond this length
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

# Column 0 of each record is the sentence, column 1 the label
data_train = BERTDataset(train_1_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
data_test = BERTDataset(test_1_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                        max_len=max_len, pad=True, pair=False)

# PyTorch data loaders
train_dataloader = DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = DataLoader(data_test, batch_size=batch_size, num_workers=5)

class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    Args:
        bert: callable taking ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and returning a pair whose second element is
            fed to the classifier.
        hidden_size: width of that second element.
        num_classes: number of output logits.
        dr_rate: dropout probability applied before the classifier; a falsy
            value disables dropout.
        params: unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
  
# Instantiate the level-1 classifier
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Optimizer and schedule: parameters whose name contains one of the
# `no_decay` fragments get no weight decay, all others get 0.01
no_decay = ['bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()  # softmax loss; also usable for binary classification

# One scheduler step per batch; the first `warmup_ratio` share is warm-up
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

# 정확도측정을 위한 함수 정의
def calc_accuracy(X,Y):
    """Return the share of rows of ``X`` whose arg-max along dim 1 equals ``Y``."""
    _, predicted = X.max(dim=1)
    hits = (predicted == Y).sum().data.cpu().numpy()
    return hits / predicted.shape[0]

# Train the level-1 classifier and report accuracy after every epoch
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # gradient clipping
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Evaluation needs no gradients; no_grad avoids building the autograd
    # graph for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Build [utterance, label] records for the level-2 classifier.
# x_train_2_clean is already tokenised and stop-word filtered; the level-2
# model uses the BERT tokenizer instead, so the untokenised x_train_2 /
# y_train_2 (and their test counterparts) are used here.

train_2_list = [[q, str(label)] for q, label in zip(x_train_2, y_train_2)]

test_2_list = [[q, str(label)] for q, label in zip(x_test_2, y_test_2)]

In [None]:
# Level-2 classification model

# Hyper-parameters
max_len = 64  # per the original note: BERT does not learn from tokens beyond this length
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

# Column 0 of each record is the sentence, column 1 the label
data_train = BERTDataset(train_2_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
data_test = BERTDataset(test_2_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                        max_len=max_len, pad=True, pair=False)

# PyTorch data loaders
train_dataloader = DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = DataLoader(data_test, batch_size=batch_size, num_workers=5)

class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder (three classes by default).

    Args:
        bert: callable taking ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and returning a pair whose second element is
            fed to the classifier.
        hidden_size: width of that second element.
        num_classes: number of output logits.
        dr_rate: dropout probability applied before the classifier; a falsy
            value disables dropout.
        params: unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 3,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
  
# Load a separate encoder for the level-2 classifier.
# Wrapping the shared `bertmodel` again would make `model` and `model2` hold
# the very same encoder module: training model2 would then overwrite the
# weights the level-1 classifier was fine-tuned with, and model2 would start
# from level-1 fine-tuned weights instead of the pretrained ones.
# NOTE(review): assumes get_pytorch_kobert_model() builds a new model object
# on every call - confirm against the installed KoBERT version.
bertmodel2, _ = get_pytorch_kobert_model()
model2 = BERTClassifier(bertmodel2, num_classes=3, dr_rate=0.5).to(device)

# Optimizer and schedule: parameters whose name contains one of the
# `no_decay` fragments get no weight decay, all others get 0.01
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model2.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model2.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss() # softmax loss; also usable for binary classification

# One scheduler step per batch; the first `warmup_ratio` share is warm-up
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 정확도측정을 위한 함수 정의
def calc_accuracy(X,Y):
    """Return the share of rows of ``X`` whose arg-max along dim 1 equals ``Y``."""
    _, predicted = X.max(dim=1)
    hits = (predicted == Y).sum().data.cpu().numpy()
    return hits / predicted.shape[0]

# Train the level-2 classifier and report accuracy after every epoch
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model2.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model2(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model2.parameters(), max_grad_norm) # gradient clipping
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model2.eval()
    # Evaluation needs no gradients; no_grad avoids building the autograd
    # graph for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model2(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Level-3 model for the '고객서비스' branch
earlystop_callback = EarlyStopping(monitor = 'val_acc', min_delta = 0.0001, patience = 5)

# The output layer must be as wide as the one-hot labels. The label
# dictionary for this branch lists three codes, so a fixed width of 2 stops
# matching as soon as code 2 occurs in the training split.
num_classes_3_0 = train_y_3_0.shape[1]

model_3_0 = Sequential([Embedding(vocab_size_3_0, 300, input_length =max_length_3_0),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64)),
        Dense(num_classes_3_0, activation='softmax')
    ])

model_3_0.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model_3_0.summary()

In [None]:
# Train for at most 40 epochs; 20% of the training rows serve as validation
# data for the early-stopping callback.
history_3_0 = model_3_0.fit(
    x=train_x_3_0,
    y=train_y_3_0,
    batch_size=64,
    epochs=40,
    validation_split=0.2,
    callbacks=[earlystop_callback],
)

In [None]:
# Accuracy and loss curves of the model_3_0 training run
metrics_plot(history_3_0)

In [None]:
# Level-3 model for the '삼성카드' branch
earlystop_callback = EarlyStopping(monitor = 'val_acc', min_delta = 0.0001, patience = 5)

# The output layer must be as wide as the one-hot labels. to_categorical was
# called without num_classes, so that width follows the highest code present
# in the training split and a fixed 17 stops matching whenever that code is
# lower than 16.
num_classes_3_1 = train_y_3_1.shape[1]

model_3_1 = Sequential([Embedding(vocab_size_3_1, 300, input_length =max_length_3_1),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64)),
        Dense(num_classes_3_1, activation='softmax')
    ])

model_3_1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model_3_1.summary()

In [None]:
# Train for at most 40 epochs; 20% of the training rows serve as validation
# data for the early-stopping callback.
history_3_1 = model_3_1.fit(
    x=train_x_3_1,
    y=train_y_3_1,
    batch_size=64,
    epochs=40,
    validation_split=0.2,
    callbacks=[earlystop_callback],
)

In [None]:
# Accuracy and loss curves of the model_3_1 training run
metrics_plot(history_3_1)

In [None]:
# Level-3 model for the '기타' branch
earlystop_callback = EarlyStopping(monitor = 'val_acc', min_delta = 0.0001, patience = 5)

# The output layer must be as wide as the one-hot labels. The label
# dictionary for this branch lists three codes, so a fixed width of 2 stops
# matching as soon as code 2 occurs in the training split.
num_classes_3_2 = train_y_3_2.shape[1]

model_3_2 = Sequential([Embedding(vocab_size_3_2, 300, input_length =max_length_3_2),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64)),
        Dense(num_classes_3_2, activation='softmax')
    ])

model_3_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model_3_2.summary()

In [None]:
# Train for at most 40 epochs; 20% of the training rows serve as validation
# data for the early-stopping callback.
history_3_2 = model_3_2.fit(
    x=train_x_3_2,
    y=train_y_3_2,
    batch_size=64,
    epochs=40,
    validation_split=0.2,
    callbacks=[earlystop_callback],
)

In [None]:
# Accuracy and loss curves of the model_3_2 training run
metrics_plot(history_3_2)

# 모델 분류하기

In [None]:
# Read the input workbook again so this section starts from the raw file
df_test = pd.read_excel('/content/drive/MyDrive/input_data.xlsx', engine='openpyxl')

In [None]:
import time
from IPython.display import display, clear_output
# Reference point for the elapsed-time report
start = time.time()

# Preparation for spell checking: non-word characters become spaces
df_test["TEXT_cleaned"] = df_test["TEXT"].str.replace(r'[^\w]', r' ', regex=True)

# Preparation for spell checking: runs of spaces are collapsed
new_document = [re.sub(' +', ' ', sent) for sent in df_test["TEXT_cleaned"]]

df_test["TEXT_cleaned"] = new_document


# Spell correction, with a progress report refreshed after every text
changed_document = []
total_documents = len(df_test["TEXT_cleaned"])
i = 0
for i, sent in enumerate(df_test["TEXT_cleaned"], start=1):

    clear_output(wait=True)

    changed_document.append(spell_checker.check(sent).checked)

    print('진행 상황 : ', i,'/',total_documents)
    print('진행 시간 : ', round((time.time() - start),2))

print('총 걸린 시간 : ', round((time.time() - start),2), '초')

df_test["TEXT_cleaned"] = changed_document
df_test

In [None]:
# Input for the BERT models: the spell-checked text column, not tokenised
test_input_serires = df_test["TEXT_cleaned"]
test_input_serires

In [None]:
def stopword_function(X):
    """Tokenise every text in ``X`` and drop stop words.

    Each string is stripped of stand-alone Hangul jamo and of characters that
    are neither word characters nor whitespace, split into stemmed morphemes
    with Okt, and filtered against the module-level ``stopword_set`` (looked
    up when the function is called).

    Args:
        X: iterable of documents.

    Returns:
        A list with one token list per element of ``X``; elements that are
        not strings yield an empty list.
    """
    # Raw-string patterns (no invalid "\w" escapes), compiled once per call
    # rather than once per document.
    jamo_pattern = re.compile(r"([ㄱ-ㅎㅏ-ㅣ]+)")
    symbol_pattern = re.compile(r"[^\w\s]")

    okt = Okt()
    clean_train_review = []
    for i in X:
        if isinstance(i, str):
            # Remove stand-alone consonants / vowels
            review_text = jamo_pattern.sub("", i)
            # Remove special characters
            review_text = symbol_pattern.sub("", review_text)
            # Tokenise and stem
            word_text = okt.morphs(review_text, stem=True)
            # Drop stop words
            word_text = [token for token in word_text if token not in stopword_set]

            clean_train_review.append(word_text)
        else:
            # Non-text values (e.g. numbers) contribute an empty token list
            clean_train_review.append([])

    return clean_train_review


# 이중 리스트에서 해당 요소와 그 앞뒤 요소 출력하는 함수
def find_letter_with_before_and_after(double_list, x):
    """Print the first occurrence of ``x`` in each row with its neighbours.

    For every row of ``double_list`` that contains ``str(x)``, the element
    before it (if any), the element itself and the element after it (if any)
    are printed on one line. A final line reports how many rows matched.

    Args:
        double_list: list of token lists.
        x: value to look for; compared as ``str(x)``.
    """
    target = str(x)
    matches = 0
    for text in double_list:
        if target not in text:
            continue
        matches += 1
        position = text.index(target)
        # Slicing clamps at both ends, so a match that is the first, last or
        # only element no longer indexes outside the row.
        print(*text[max(position - 1, 0):position + 2])
    print('총, ', matches, '개')

    
# Find every index at which a given element occurs in a list
def find_index(data, target):
    """Return all positions of ``target`` in ``data``, in ascending order.

    Args:
        data: Sequence to search.
        target: Value to look for.

    Returns:
        A list of integer indices; empty when ``target`` does not occur.
    """
    # A single pass replaces repeated `list.index` calls on fresh slices and
    # the bare `except` that ended the search. The identity check mirrors how
    # `list.index` matches elements.
    return [
        pos
        for pos, item in enumerate(data)
        if item is target or item == target
    ]


# Remove every occurrence of a given element from a list
def remove_values_from_list(the_list, val):
    """Return a new list holding the items of ``the_list`` that differ from ``val``.

    The input list is left untouched; item order is preserved.
    """
    kept = []
    for item in the_list:
        if item != val:
            kept.append(item)
    return kept


# Merge two adjacent words inside a nested list
def combine_two_words(double_list, word1, word2) :
    """Merge each adjacent ``word1``, ``word2`` pair into one token.

    Every inner list that contains ``str(word1)`` is replaced, inside
    ``double_list`` itself, by a new list in which each ``word1`` immediately
    followed by ``word2`` becomes the single token ``word1 + word2``.

    Args:
        double_list: List of token lists. Modified in place.
        word1: First token of the pair (compared as ``str(word1)``).
        word2: Second token of the pair (compared as ``str(word2)``).

    Returns:
        The same ``double_list`` object, for call chaining.
    """
    first = str(word1)
    second = str(word2)
    merged = first + second

    for row_no, tokens in enumerate(double_list) :
        if first not in tokens :
            continue

        # One left-to-right scan that consumes both tokens of a matched pair.
        # No placeholder string is written into the data, so genuine tokens are
        # never deleted, and overlapping pairs (word1 == word2) merge correctly.
        combined = []
        pos = 0
        last = len(tokens) - 1
        while pos <= last :
            if tokens[pos] == first and pos < last and tokens[pos + 1] == second :
                combined.append(merged)
                pos += 2
            else :
                combined.append(tokens[pos])
                pos += 1

        double_list[row_no] = combined

    return double_list


# Delete one word from every inner list of a nested list
def remove_one_word(double_list, word) :
    """Drop every occurrence of ``str(word)`` from each inner list.

    Inner lists containing the word are replaced, inside ``double_list``
    itself, by a filtered copy; the others are left as they are. The same
    ``double_list`` object is returned.
    """
    unwanted = str(word)
    for row_no, tokens in enumerate(double_list) :
        if unwanted in tokens :
            double_list[row_no] = remove_values_from_list(tokens, unwanted)

    return double_list


# Replace one word with another throughout a nested list
def change_one_word(double_list, word, changed_word) :
    """Overwrite every ``str(word)`` token with ``str(changed_word)``.

    The inner lists are edited in place and the same ``double_list`` object
    is returned.
    """
    old_token = str(word)
    for tokens in double_list :
        if old_token not in tokens :
            continue
        for pos in find_index(tokens, old_token) :
            tokens[pos] = str(changed_word)

    return double_list


# Additional preprocessing
def additional_cleaning(double_list) :
    """Apply the hand-written token corrections to a nested token list.

    The rules run strictly in the order listed below: each one receives the
    result of the previous one. ``combine_two_words`` joins two adjacent
    tokens, ``change_one_word`` substitutes a token, and ``remove_one_word``
    deletes a token. The final list is returned.
    """
    merge = combine_two_words
    swap = change_one_word
    drop = remove_one_word

    rules = (
        # Rules around the token '원'
        (merge, ('상담', '원')),

        (merge, ('상당', '원')),
        (swap, ('상당원', '상담원')),

        (merge, ('안내', '원')),

        (merge, ('원', '치')),
        (swap, ('원치', '원하지')),

        (merge, ('상사', '원')),
        (swap, ('상사원', '상담원')),

        (merge, ('상대', '원')),
        (swap, ('상대원', '상담원')),

        (merge, ('칙', '원')),
        (swap, ('칙원', '직원')),

        # The same merge is applied a second time, exactly as in the original rule list.
        (merge, ('칙', '원')),

        (swap, ('결시', '연결')),
        (swap, ('간이', '시간')),

        # Rules around the token '지'
        (merge, ('지', '원금')),

        (merge, ('지', '연')),

        (merge, ('알', '지')),

        (merge, ('지', '양')),

        (merge, ('지', '양해')),
        (swap, ('지양해', '지양')),

        # Rules around the token '대'
        (merge, ('대', '기')),

        (merge, ('대', '체적')),

        (merge, ('대', '출사')),

        (merge, ('대', '화법')),

        (merge, ('대', '금도')),

        (merge, ('현', '대')),

        # Remove the 'OOO' token
        (drop, ('OOO',)),
    )

    for operation, arguments in rules :
        double_list = operation(double_list, *arguments)

    return double_list


# Duplicate a nested list
def copy_double_list(double_list) :
    """Return a new outer list whose inner lists are fresh copies.

    The elements themselves are shared with the input; only the two list
    levels are duplicated.
    """
    return [list(row) for row in double_list]

In [None]:
# Build the stopword lookup set from the '불용어' column of df_stopword;
# stopword_function reads this module-level name.
stopword_set = set(df_stopword['불용어'].values.tolist())

# Tokenize the cleaned test sentences with stopwords removed, then apply the
# hand-written token corrections.
test_tok = stopword_function(df_test["TEXT_cleaned"])
test_tok = additional_cleaning(test_tok)

In [None]:
# Seed TensorFlow's random number generator through the v1 compatibility API.
tensorflow.compat.v1.set_random_seed(196)

# Upper bound on the vocabulary, passed to the Keras Tokenizer as num_words.
vocab_size_test = 2000

# NOTE(review): the Tokenizer is fitted on the test tokens themselves, so its
# word -> index mapping comes from this data only. Presumably any model consuming
# these sequences was trained with another tokenizer's indices — confirm whether
# that fitted tokenizer should be reused here instead.
model_token_test = Tokenizer(num_words = vocab_size_test)
model_token_test.fit_on_texts(test_tok)

# Convert each token list into a list of integer word indices.
x_sequences_test = model_token_test.texts_to_sequences(test_tok)

# Bring every sequence to max_length entries; padding = 'post' appends the padding at the end.
max_length = 80
test_input = pad_sequences(x_sequences_test, maxlen = max_length, padding = 'post')

In [None]:
# 분류 1 진행 -> label_1 구해짐

import pandas as pd

# Reuses the tok, max_len, batch_size and device values configured earlier.
# comment : the test utterances to classify
def final_1_classification(comment, tok, max_len, batch_size, device):
  """Predict the first-level class (0 or 1) of every utterance.

  The utterances are wrapped in a ``BERTDataset`` and fed one at a time to
  the module-level ``model``.

  Args:
    comment: Iterable of utterance strings.
    tok: Tokenizer forwarded to ``BERTDataset``.
    max_len: Maximum sequence length forwarded to ``BERTDataset``.
    batch_size: Accepted for signature compatibility; inference keeps using
      batches of one sample, as before.
    device: Torch device the input tensors are moved to.

  Returns:
    A list of ints, one per utterance: 0 when the first output score is
    strictly greater than the second, otherwise 1.
  """
  # BERTDataset is given a label column; 3 is an arbitrary placeholder and the
  # label coming back from the loader is ignored.
  rows = [[c, 3] for c in comment]
  pdData = pd.DataFrame(rows, columns = ['발화', '1차분류']).values
  test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False)
  loader = torch.utils.data.DataLoader(test_set, batch_size=1, num_workers=0)

  emo_list = [] # first-level predictions
  # Inference only: switch off train-time layers such as dropout and skip
  # gradient bookkeeping, then restore whatever mode the model was in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for token_ids, valid_length, segment_ids, _label in loader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

        for scores in out.tolist():
          emo_list.append(0 if scores[0] > scores[1] else 1)
  finally:
    model.train(was_training)

  return emo_list

import gc
# Collect unreachable Python objects, then ask PyTorch to release its cached
# GPU memory before running inference.
gc.collect()
torch.cuda.empty_cache()

# Run the first-level classifier over the cleaned test sentences.
label_1 = final_1_classification(test_input_serires, tok, max_len, batch_size, device)

In [None]:
# Store the first-level predictions next to the test rows.
df_test['label1'] = label_1

In [None]:
# 분류 2 진행 -> label_2 구해짐

import pandas as pd

# Reuses the tok, max_len, batch_size and device values configured earlier.
# comment : the test utterances to classify
def final_2_classification(comment, tok, max_len, batch_size, device):
  """Predict the second-level class of every utterance.

  The utterances are wrapped in a ``BERTDataset`` and fed one at a time to
  the module-level ``model2``.

  Args:
    comment: Iterable of utterance strings.
    tok: Tokenizer forwarded to ``BERTDataset``.
    max_len: Maximum sequence length forwarded to ``BERTDataset``.
    batch_size: Accepted for signature compatibility; inference keeps using
      batches of one sample, as before.
    device: Torch device the input tensors are moved to.

  Returns:
    A list of ints, one per utterance: the index of the highest output
    score. On a tie the lowest index wins (the earlier hand-written
    comparison chain answered 2 whenever classes 0 and 1 tied for the top).
  """
  # BERTDataset is given a label column; 4 is an arbitrary placeholder and the
  # label coming back from the loader is ignored.
  rows = [[c, 4] for c in comment]
  pdData = pd.DataFrame(rows, columns = ['발화', '2차분류']).values
  test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False)
  loader = torch.utils.data.DataLoader(test_set, batch_size=1, num_workers=0)

  emo_list = [] # second-level predictions
  # Inference only: switch off train-time layers such as dropout and skip
  # gradient bookkeeping, then restore whatever mode the model was in.
  was_training = model2.training
  model2.eval()
  try:
    with torch.no_grad():
      for token_ids, valid_length, segment_ids, _label in loader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model2(token_ids, valid_length, segment_ids)

        for scores in out.tolist():
          # Index of the first maximal score.
          emo_list.append(max(range(len(scores)), key=scores.__getitem__))
  finally:
    model2.train(was_training)

  return emo_list
   

import gc
# Collect unreachable Python objects, then ask PyTorch to release its cached
# GPU memory before running inference.
gc.collect()
torch.cuda.empty_cache()


# Run the second-level classifier over the cleaned test sentences.
label_2 = final_2_classification(test_input_serires, tok, max_len, batch_size, device)

In [None]:
# The second-level predictions must be on df_test before they can be filtered
# on; without this assignment the 'label2' lookups below raise KeyError.
df_test['label2'] = label_2

# Split the test rows by their second-level class.
condt2_0 = (df_test['label2'] == 0)
condt2_1 = (df_test['label2'] == 1)
condt2_2 = (df_test['label2'] == 2)

# .copy() makes every subset an independent frame, so columns can be added to
# it later without pandas warning about writing to a slice of df_test.
df_test_3_0 = df_test.loc[condt2_0].copy()
df_test_3_1 = df_test.loc[condt2_1].copy()
df_test_3_2 = df_test.loc[condt2_2].copy()

# Stopword set read by stopword_function
stopword_set = set(df_stopword['불용어'].values.tolist())

# Tokenize each subset and apply the hand-written token corrections
test_tok_3_0 = stopword_function(df_test_3_0["TEXT_cleaned"])
test_tok_3_0 = additional_cleaning(test_tok_3_0)

test_tok_3_1 = stopword_function(df_test_3_1["TEXT_cleaned"])
test_tok_3_1 = additional_cleaning(test_tok_3_1)

test_tok_3_2 = stopword_function(df_test_3_2["TEXT_cleaned"])
test_tok_3_2 = additional_cleaning(test_tok_3_2)


tensorflow.compat.v1.set_random_seed(196)

vocab_size_test = 2000
max_length = 80

# NOTE(review): each Tokenizer below is fitted on its own test subset, so the
# word -> index mapping comes from that subset only. Presumably model_3_* were
# trained with other tokenizers' indices — confirm whether those fitted
# tokenizers should be reused here instead.
model_token_test_3_0 = Tokenizer(num_words = vocab_size_test)
model_token_test_3_0.fit_on_texts(test_tok_3_0)
x_sequences_test_3_0 = model_token_test_3_0.texts_to_sequences(test_tok_3_0)
test_input_3_0 = pad_sequences(x_sequences_test_3_0, maxlen = max_length, padding = 'post')

model_token_test_3_1 = Tokenizer(num_words = vocab_size_test)
model_token_test_3_1.fit_on_texts(test_tok_3_1)
x_sequences_test_3_1 = model_token_test_3_1.texts_to_sequences(test_tok_3_1)
test_input_3_1 = pad_sequences(x_sequences_test_3_1, maxlen = max_length, padding = 'post')

model_token_test_3_2 = Tokenizer(num_words = vocab_size_test)
model_token_test_3_2.fit_on_texts(test_tok_3_2)
x_sequences_test_3_2 = model_token_test_3_2.texts_to_sequences(test_tok_3_2)
test_input_3_2 = pad_sequences(x_sequences_test_3_2, maxlen = max_length, padding = 'post')

In [None]:
# Classification stage 3 -> produces label_3_0, label_3_1, label_3_2

def _predict_labels(model, inputs):
    """Return ``(pred, labels)`` where ``labels`` is the row-wise argmax of ``model.predict(inputs)``.

    When ``inputs`` has no rows the model is not called: an empty prediction
    array and an empty integer label array are returned instead. A subset is
    empty whenever stage 2 assigned no row to that class, and Keras
    ``predict`` can raise on an empty input.
    """
    if len(inputs) == 0:
        return np.empty((0, 0)), np.array([], dtype=int)
    pred = model.predict(inputs)
    return pred, np.argmax(pred, axis = 1)


# Predict label 3_0 with model 3_0
pred_3_0, label_3_0 = _predict_labels(model_3_0, test_input_3_0)

# Predict label 3_1 with model 3_1
pred_3_1, label_3_1 = _predict_labels(model_3_1, test_input_3_1)

# Predict label 3_2 with model 3_2
pred_3_2, label_3_2 = _predict_labels(model_3_2, test_input_3_2)

In [None]:
# Attach the stage-3 predictions to each subset. `assign` returns a new frame,
# so nothing is written into a frame that was sliced out of df_test.
df_test_3_0 = df_test_3_0.assign(label3=label_3_0)
df_test_3_1 = df_test_3_1.assign(label3=label_3_1)
df_test_3_2 = df_test_3_2.assign(label3=label_3_2)

# Stitch the three subsets back together and restore the original row order.
df_result = pd.concat([df_test_3_0, df_test_3_1, df_test_3_2], sort = False).sort_index()
df_result

In [None]:
# Cast the label columns to int before using them as lookup keys.
label_columns = ['label1', 'label2', 'label3']
df_result[label_columns] = df_result[label_columns].astype('int')

# Category name -> label id, per classification level. Level 3 has one table
# per level-2 class.
dic1 = {'칭찬' : 0, '불만' : 1}
dic2 = {'고객서비스' : 0, '삼성카드' : 1, '기타' : 2}
dic3_0 = {'상담원' : 0, '상담시스템' : 1, '고객서비스' : 2}
dic3_1 = {'혜택' : 0, '할부금융상품' : 1, '커뮤니티서비스' : 2, '카드이용/결제' : 3, 
          '카드상품' : 4, '청구입금' : 5, '심사/한도' : 6, '생활편의서비스' : 7, 
          '상담/채널' : 8, '리스렌탈상품' : 9, '라이프서비스' : 10, '금융상품' : 11, 
          '고객정보관리' : 12, '가맹점매출/승인' : 13, '가맹점대금' : 14, '가맹점계약' : 15, '삼성카드' : 16}
dic3_2 = {'기타' : 0, '중립' : 1, '폐기' : 2}

# Label id -> category name.
dic1_reversed = {v:k for k, v in dic1.items()}
dic2_reversed = {v:k for k, v in dic2.items()}
dic3_0_reversed = {v:k for k, v in dic3_0.items()}
dic3_1_reversed = {v:k for k, v in dic3_1.items()}
dic3_2_reversed = {v:k for k, v in dic3_2.items()}

# Decode levels 1 and 2 column-wise instead of one .loc write per row.
df_result['분류1'] = df_result['label1'].map(dic1_reversed)
df_result['분류2'] = df_result['label2'].map(dic2_reversed)

# Level 3 is decoded with the table that belongs to the row's level-2 label.
dic3_reversed_by_label2 = {0 : dic3_0_reversed, 1 : dic3_1_reversed, 2 : dic3_2_reversed}
df_result['분류3'] = [
    dic3_reversed_by_label2.get(label2, {}).get(label3)
    for label2, label3 in zip(df_result['label2'], df_result['label3'])
]

# Fail with the offending rows named, rather than with a bare KeyError during
# the lookup or a TypeError from the join below.
name_columns = ['분류1', '분류2', '분류3']
unmapped = df_result[name_columns].isna().any(axis=1)
if unmapped.any():
    raise ValueError(
        f"No category name for the labels at index: {df_result.index[unmapped].tolist()}"
    )

df_result['분류'] = df_result[name_columns].apply('>'.join, axis=1)

# Collapse the three-level path back to the final category string: a level-3
# name that repeats level 2 is dropped, and '기타>중립' becomes plain '중립'
# whatever level 1 says.
restored_names = {
    '칭찬>고객서비스>고객서비스' : '칭찬>고객서비스',
    '칭찬>삼성카드>삼성카드' : '칭찬>삼성카드',
    '칭찬>기타>기타' : '칭찬>기타',
    '불만>고객서비스>고객서비스' : '불만>고객서비스',
    '불만>삼성카드>삼성카드' : '불만>삼성카드',
    '불만>기타>기타' : '불만>기타',
    '칭찬>기타>중립' : '중립',
    '불만>기타>중립' : '중립',
}
df_result['복구된 분류'] = df_result['분류'].replace(restored_names)

df_result['INT'] = df_result['복구된 분류']

In [None]:
# Keep only the columns that go into the exported result: KEY1, KEY2, the TEXT
# column and the final category string (INT).
df_final_result = df_result[['KEY1', 'KEY2', 'TEXT', 'INT']]

# 결과출력

In [None]:
# Write the result to an Excel file under the mounted Google Drive; index = False
# leaves the DataFrame index out of the sheet.
df_final_result.to_excel('/content/drive/MyDrive/result_data.xlsx', index = False)