In [244]:
import os
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from transformers import TFElectraModel

import pandas as pd

In [245]:
tf.random.set_seed(42)
np.random.seed(42)

In [246]:
CLASS_NUMBER = 3
MAX_LEN = 40
BERT_CKPT = './data_out/'
DATA_IN_PATH = '../metadata/'
DATA_OUT_PATH = './data_out/'

In [247]:
class TFElectraClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super().__init__()

        self.bert = TFElectraModel.from_pretrained(model_name, cache_dir=dir_path, from_pt=True)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.flatten = tf.keras.layers.Flatten()
        self.classifier = tf.keras.layers.Dense(num_class, name='classifier', activation='softmax', kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range))

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs[0]
        last_hidden_state = self.flatten(last_hidden_state)
        last_hidden_state = self.dropout(last_hidden_state, training=training)
        logits = self.classifier(last_hidden_state)

        return logits

In [248]:
cls_model = TFElectraClassifier(model_name='monologg/koelectra-base-v3-discriminator', dir_path=os.path.join(BERT_CKPT, 'model'), num_class=CLASS_NUMBER)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

## 전처리

In [92]:
song_id = pd.read_csv(DATA_IN_PATH+'db에넣을노래들.csv')

In [93]:
lyrics = pd.read_csv(DATA_IN_PATH+'sliced20Lyrics.csv')

In [94]:
id = song_id['SONG_ID']

In [95]:
title = song_id[['SONG_TITLE']]

In [96]:
id_title = dict()
for row in range(len(song_id)):
    id_title[song_id.iloc[row, 2]] = song_id.iloc[row, 3]

In [97]:
lyrics

Unnamed: 0,SONG_ID,Sliced_LYRICS,대분류1,대분류2
0,2466,나도 알지 못한날 굳이 알려고 들지마 보여줄 수 없는 내 마음만 안타까울 뿐 이런 ...,,
1,2466,내 주위를 보며 후회를 시작했어 하루 종일 찾아 헤매도 판에 박혀있는 모습 뿐 그런...,,
2,2466,난 혼자라는 사실을 잊은채로 잠들겠지만 오늘도 난 기억속의 네게 의미없는 후횔하며 ...,,
3,2466,다시 안기길 기대하고 있어 하루종일 찾아 헤매도 판에 박혀있는 모습뿐 그런 내 모습...,,
4,2466,잊은채로 잠들겠지만 오늘도 난 기억속의 네게 의미없는 후횔하며 지내겠지 오늘도 난 ...,,
...,...,...,...,...
34389,35389041,넘치지 않게 Oh 나를 톡 쏘게 만들어줘 Oh Baby you’re all mine...,,
34390,35389041,"you’re all mine mine I, I, I So, you make me s...",,
34391,35389041,"love me yeah, love me yeah baby (Could you hol...",,
34392,35389041,baby 너의 두 눈 속에 나 더 깊이 빠져들어 Now 나를 톡 쏘게 만들어줘 Oh...,,


In [98]:
data = lyrics[lyrics['SONG_ID'].isin(id)]

In [99]:
data = data[['SONG_ID', 'Sliced_LYRICS']]

In [100]:
data['SONG_TITLE'] = ''

In [101]:
for idx in range(len(data)):
    data.iloc[idx, -1] = id_title[data.iloc[idx, 0]]

In [102]:
data = data[['SONG_ID', 'SONG_TITLE', 'Sliced_LYRICS']]

In [103]:
data

Unnamed: 0,SONG_ID,SONG_TITLE,Sliced_LYRICS
23,9270,매직 카펫 라이드,이렇게 멋진 파란 하늘 위로 날으는 마법 융단을 타고 이렇게 멋진 푸른 세상 속을 ...
24,9270,매직 카펫 라이드,마요 그렇고 그런 얘기들 골치 아픈일은 내일로 미뤄버려요 인생은 한번 뿐 후회하지 ...
25,9270,매직 카펫 라이드,파란 하늘 위에 지어진 마법 정원으로 와요 색색의 보석 꽃과 노루 비단 달콤한 우리...
26,9270,매직 카펫 라이드,지난 일은 모두 다 잊어버려요 기회는 한번 뿐 실수하지 마요 진짜로 해내고 싶은 걸...
27,9270,매직 카펫 라이드,이렇게 멋진 파란 하늘 위로 나르는 마법융단을 타고 이렇게 멋진 장미빛 인생을 당신...
...,...,...,...
34349,35333345,POP!,내게 푹 빠진 너를 애써 참진 마 Bae bae eyes on me now 내가 터...
34350,35333345,POP!,됐어 Four! 딱 숨을 멈춰 Three! 난 너를 겨눠 Two! One! Here...
34351,35333345,POP!,"it Pop pop pop, you want it Pop pop pop 터지길 원해..."
34352,35333345,POP!,"Pop pop pop, you want it Pop pop pop 널 갖길 원해 P..."


In [104]:
data.isnull().sum()

SONG_ID          0
SONG_TITLE       0
Sliced_LYRICS    0
dtype: int64

In [105]:
# data.to_csv(DATA_IN_PATH+'concatlyrics.csv', encoding='utf-8-sig')

## 예측

In [249]:
lyrics = pd.read_csv(DATA_IN_PATH+'concatlyrics.csv')

In [250]:
x = lyrics['Sliced_LYRICS']

In [251]:
import pickle

with open(DATA_OUT_PATH+'tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

In [252]:
def bert_tokenizer(sent, MAX_LEN):
    encoded_dict = loaded_tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']

    return input_id, attention_mask, token_type_id

In [253]:
optimizer = tf.keras.optimizers.Adam(3e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [254]:
a, b, c = bert_tokenizer('안녕하세요', MAX_LEN)

In [255]:
cls_model.call((np.array(a).reshape(1,-1), np.array(b).reshape(1,-1), np.array(c).reshape(1,-1)))

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[0.000028, 0.152711, 0.847261]], dtype=float32)>

In [256]:
cls_model.built = True

In [257]:
cls_model.load_weights(DATA_OUT_PATH+'tf2_electra_plutchik_hs_6.h5')

In [258]:
input_ids = []
attention_masks = []
token_type_ids = []
test_data_labels = []

for test_sent in tqdm(x):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(test_sent, MAX_LEN)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
    except:
        pass

test_input_ids = np.array(input_ids, dtype=int)
test_attention_masks = np.array(attention_masks, dtype=int)
test_type_ids = np.array(token_type_ids, dtype=int)
test_inputs = (test_input_ids, test_attention_masks, test_type_ids)

100%|██████████| 3669/3669 [00:03<00:00, 1071.55it/s]


In [259]:
prediction = cls_model.predict(test_inputs)



In [261]:
with open(DATA_OUT_PATH+'3classEncoder.pickle', 'rb') as handle:
    le = pickle.load(handle)

In [262]:
pred = np.array([])

In [263]:
for p in prediction:
    temp = le.inverse_transform([np.argmax(p)])
    pred = np.append(pred, temp)

In [264]:
total = pd.concat([lyrics, pd.DataFrame(pred)], axis=1)

In [265]:
total

Unnamed: 0.1,Unnamed: 0,SONG_ID,SONG_TITLE,Sliced_LYRICS,0
0,23,9270,매직 카펫 라이드,이렇게 멋진 파란 하늘 위로 날으는 마법 융단을 타고 이렇게 멋진 푸른 세상 속을 ...,pos
1,24,9270,매직 카펫 라이드,마요 그렇고 그런 얘기들 골치 아픈일은 내일로 미뤄버려요 인생은 한번 뿐 후회하지 ...,neut
2,25,9270,매직 카펫 라이드,파란 하늘 위에 지어진 마법 정원으로 와요 색색의 보석 꽃과 노루 비단 달콤한 우리...,pos
3,26,9270,매직 카펫 라이드,지난 일은 모두 다 잊어버려요 기회는 한번 뿐 실수하지 마요 진짜로 해내고 싶은 걸...,neut
4,27,9270,매직 카펫 라이드,이렇게 멋진 파란 하늘 위로 나르는 마법융단을 타고 이렇게 멋진 장미빛 인생을 당신...,pos
...,...,...,...,...,...
3664,34349,35333345,POP!,내게 푹 빠진 너를 애써 참진 마 Bae bae eyes on me now 내가 터...,pos
3665,34350,35333345,POP!,됐어 Four! 딱 숨을 멈춰 Three! 난 너를 겨눠 Two! One! Here...,pos
3666,34351,35333345,POP!,"it Pop pop pop, you want it Pop pop pop 터지길 원해...",pos
3667,34352,35333345,POP!,"Pop pop pop, you want it Pop pop pop 널 갖길 원해 P...",neut


## 개별 예측 및 보팅

In [266]:
np.set_printoptions(precision=6, suppress=True)

In [267]:
lyrics = pd.read_csv(DATA_IN_PATH+'db에넣을노래들.csv')

In [268]:
MAX_LEN=40
MAX_WORD=20

In [269]:
with open(DATA_OUT_PATH+'3classEncoder.pickle', 'rb') as handle:
    le = pickle.load(handle)

In [270]:
data = lyrics[['SONG_ID', 'SONG_TITLE', 'LYRICS']]

In [271]:
song_ids = lyrics['SONG_ID']

In [272]:
pred = dict()

In [273]:
def preprocessing(x):
    temp = x.split('\n')
    temp = ' '.join(temp)
    return temp

def slicing(x):
    x = x.split()
    res = []
    for i in range(0, len(x)+1, MAX_WORD):
        if len(x)-i-MAX_WORD < MAX_WORD//2:
            temp = x[i:]
            res.append(temp)
            break
        temp = x[i:i+MAX_WORD]
        res.append(temp)
    return res

def predict(lyrics):
    temp = []
    lyrics = preprocessing(lyrics)
    lyrics = slicing(lyrics)
    for lyric in lyrics:
        try:
            txt = ' '.join(lyric)
            temp.append(txt)
        except:
            continue

    input_ids = []
    attention_masks = []
    token_type_ids = []

    for lyric in temp:
        try:
            input_id, attention_mask, token_type_id = bert_tokenizer(lyric, MAX_LEN)
            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)

        except Exception as e:
            print(e)
            pass
    prediction = cls_model.predict(input_ids)

    return prediction

def transform(array):
    result = le.inverse_transform([np.argmax(array)])
    return result

In [275]:
for id in song_ids:
    lyric = data[data['SONG_ID']==id]['LYRICS'].values[0]
    prediction = predict(lyric)
    score = sum(prediction)/len(prediction)
    senti = transform(score)
    pred[id] = (score, senti)



In [276]:
keys = pred.keys()

In [277]:
data['score'] = ''
data['senti'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [278]:
for key in keys:
    sc, se = pred[key]
    data.loc[data[data['SONG_ID']==key].index, 'score'] = str(sc)
    data.loc[data[data['SONG_ID']==key].index, 'senti'] = se

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [279]:
data.to_csv(DATA_IN_PATH+'dbScoreSenti.csv', encoding='utf-8-sig')

In [None]:
# np.fromstring(data.loc[data[data['SONG_ID']==9270].index, 'score'][0][1:-1], dtype=float, sep=' ')

array([0.12199493, 0.29389238, 0.58411264])

## 부정 세부분류

In [280]:
NEG_CLASS_NUMBER = 5

In [281]:
neg_data = data[data['senti']=='neg']

In [282]:
pos_data = data[data['senti']=='pos']

In [283]:
neg_ids = neg_data['SONG_ID']

In [None]:
pos_ids = pos_data['SONG_ID']

In [285]:
import pickle

with open(DATA_OUT_PATH+'bert_tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

In [286]:
neg_model = TFElectraClassifier(model_name='monologg/koelectra-base-v3-discriminator', dir_path=os.path.join(BERT_CKPT, 'model'), num_class=NEG_CLASS_NUMBER)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

In [287]:
a, b, c = bert_tokenizer('안녕하세요', MAX_LEN)
neg_model.call((np.array(a).reshape(1,-1), np.array(b).reshape(1,-1), np.array(c).reshape(1,-1)))
neg_model.built = True

In [288]:
neg_model.load_weights(DATA_OUT_PATH+'tf2_electra_plutchik_hs_10.h5')

In [289]:
with open(DATA_OUT_PATH+'negClassEncoder.pickle', 'rb') as handle:
    neg_le = pickle.load(handle)

In [290]:
pos_pred = dict()

In [291]:
neg_pred = dict()

In [292]:
def neg_predict(lyrics):
    temp = []
    lyrics = preprocessing(lyrics)
    lyrics = slicing(lyrics)
    for lyric in lyrics:
        try:
            txt = ' '.join(lyric)
            temp.append(txt)
        except:
            continue

    input_ids = []
    attention_masks = []
    token_type_ids = []

    for lyric in temp:
        try:
            input_id, attention_mask, token_type_id = bert_tokenizer(lyric, MAX_LEN)
            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)

        except Exception as e:
            print(e)
            pass
    prediction = neg_model.predict(input_ids)

    return prediction

def neg_transform(array):
    result = neg_le.inverse_transform([np.argmax(array)])
    return result

In [293]:
for id in neg_ids:
    lyric = data[data['SONG_ID']==id]['LYRICS'].values[0]
    prediction = neg_predict(lyric)
    score = sum(prediction)/len(prediction)
    senti = neg_transform(score)
    neg_pred[id] = (score, senti)



In [294]:
data['neg_score'] = ''
data['neg_senti'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [295]:
for key in neg_ids:
    sc, se = neg_pred[key]
    data.loc[data[data['SONG_ID']==key].index, 'neg_score'] = str(sc)
    data.loc[data[data['SONG_ID']==key].index, 'neg_senti'] = se

In [297]:
data.to_csv(DATA_IN_PATH+'dbNeg.csv', encoding='utf-8-sig')

In [298]:
data['total'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [299]:
np.fromstring(data.loc[data[data['SONG_ID']==key].index, 'neg_score'].tolist()[0][1:-1], dtype=float, sep=' ')

array([0.323326, 0.395095, 0.07907 , 0.158603, 0.043906])

In [300]:
for key in neg_ids:
    neg_sc = np.fromstring(data.loc[data[data['SONG_ID']==key].index, 'neg_score'].tolist()[0][1:-1], dtype=float, sep=' ')
    pos_sc = np.fromstring(data.loc[data[data['SONG_ID']==key].index, 'score'].tolist()[0][1:-1], dtype=float, sep=' ')
    ratio = np.array(((pos_sc[int(le.transform(['neg'])[0])]+pos_sc[1]*0.5), (pos_sc[int(le.transform(['pos'])[0])]+pos_sc[1]*0.5)))
    rated_sc = np.append(np.max(neg_sc)*(ratio[1]/ratio[0]), neg_sc)
    total_sc = rated_sc/(sum(rated_sc))
    data.loc[data[data['SONG_ID']==key].index, 'total'] = str(total_sc)

In [301]:
data.to_csv(DATA_IN_PATH+'totalScore.csv', encoding='utf-8-sig')

## 긍정에서 부정 세부분류

In [302]:
pos_data = data[data['senti']=='pos']

In [303]:
pos_ids = pos_data['SONG_ID']

In [304]:
import pickle

with open(DATA_OUT_PATH+'bert_tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

In [305]:
pos_pred = dict()

In [306]:
def neg_predict(lyrics):
    temp = []
    lyrics = preprocessing(lyrics)
    lyrics = slicing(lyrics)
    for lyric in lyrics:
        try:
            txt = ' '.join(lyric)
            temp.append(txt)
        except:
            continue

    input_ids = []
    attention_masks = []
    token_type_ids = []

    for lyric in temp:
        try:
            input_id, attention_mask, token_type_id = bert_tokenizer(lyric, MAX_LEN)
            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)

        except Exception as e:
            print(e)
            pass
    prediction = neg_model.predict(input_ids)

    return prediction

def neg_transform(array):
    result = neg_le.inverse_transform([np.argmax(array)])
    return result

In [307]:
for id in pos_ids:
    lyric = data[data['SONG_ID']==id]['LYRICS'].values[0]
    prediction = neg_predict(lyric)
    score = sum(prediction)/len(prediction)
    pos_pred[id] = (score, 'happy')



In [308]:
for key in pos_ids:
    sc, se = pos_pred[key]
    data.loc[data[data['SONG_ID']==key].index, 'neg_score'] = str(sc)
    data.loc[data[data['SONG_ID']==key].index, 'neg_senti'] = se

In [309]:
data.to_csv(DATA_IN_PATH+'dbPos.csv', encoding='utf-8-sig')

In [310]:
for key in pos_ids:
    neg_sc = np.fromstring(data.loc[data[data['SONG_ID']==key].index, 'neg_score'].tolist()[0][1:-1], dtype=float, sep=' ')
    pos_sc = np.fromstring(data.loc[data[data['SONG_ID']==key].index, 'score'].tolist()[0][1:-1], dtype=float, sep=' ')
    ratio = np.array(((pos_sc[int(le.transform(['neg'])[0])]+pos_sc[1]*0.5), (pos_sc[int(le.transform(['pos'])[0])]+pos_sc[1]*0.5)))
    rated_sc = np.append(ratio[1], ratio[0]*neg_sc)
    total_sc = rated_sc/(sum(rated_sc))
    data.loc[data[data['SONG_ID']==key].index, 'total'] = str(total_sc)

In [311]:
data.to_csv(DATA_IN_PATH+'totalScore.csv', encoding='utf-8-sig')

In [316]:
# data = pd.read_csv(DATA_IN_PATH+'totalScore.csv')

In [320]:
data = data.dropna(axis=0)

In [322]:
neg_le.inverse_transform([0,1,2,3,4])

array(['angry', 'dislike', 'fear', 'sad', 'surprise'], dtype=object)

In [323]:
data.to_csv(DATA_IN_PATH+'total.csv', encoding='utf-8-sig')