In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import os
import re

from tqdm import tqdm
from transformers import *
from langdetect import detect

In [None]:
import sys
# NOTE(review): absolute, machine-specific path; the notebook only runs where this directory exists.
sys.path.append('/home/cshoon036/MixMind/mm_bert/korean')

# Star import: every public name of kor_emotion_analysis lands in the notebook namespace.
# Presumably the source of TFBertClassifier_KOR, load_vocab and
# hybrid_emotion_export_persent used in later cells — TODO confirm.
from kor_emotion_analysis import *
from kor_preprocessing import isKorean_percent

In [None]:
import sys
# NOTE(review): absolute, machine-specific path; the notebook only runs where this directory exists.
sys.path.append('/home/cshoon036/MixMind/mm_bert/english')

# Star import: any name this module shares with kor_emotion_analysis silently replaces the
# Korean one, because this import runs later.
# Presumably the source of TFBertClassifier_ENG and eng_emotion_predict — TODO confirm.
from eng_emotion_analysis import *

In [None]:
# Instantiate one classifier per language; weights are loaded in a later cell.
# The Korean model is built on the multilingual cased BERT checkpoint and is
# additionally given a checkpoint directory name.
sentiment_model_kor = TFBertClassifier_KOR(
    model_name='bert-base-multilingual-cased',
    dir_path='bert_ckpt',
)

# The English model is built on the English cased BERT checkpoint.
sentiment_model_eng = TFBertClassifier_ENG(
    model_name='bert-base-cased',
)

In [None]:
# Weights (Korean): compile the Korean binary-sentiment model.
# `learning_rate` replaces the deprecated `lr` alias; the value itself is unchanged.
opt = tfa.optimizers.RectifiedAdam(
    learning_rate=5.0e-5,
    total_steps=2344 * 4,  # presumably steps-per-epoch * epochs from training — TODO confirm
    warmup_proportion=0.1,
    min_lr=1e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)
# The loss is configured for raw logits (from_logits=True).
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
sentiment_model_kor.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [None]:
# Weights (English): compile the English multi-class sentiment model.
# Per the original author's note, this learning rate is the one suggested for
# BERT on the Hugging Face website.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)
# Rebinds `loss`, which the Korean compile cell also assigns.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# A stray trailing comma used to turn `metric` into a 1-tuple by accident; it is
# now a single metric object, passed to compile() inside an explicit list.
# NOTE(review): the metric is labelled 'balanced_accuracy', but CategoricalAccuracy
# is ordinary accuracy rather than a class-balanced one — confirm the label is intended.
metric = tf.keras.metrics.CategoricalAccuracy('balanced_accuracy')
sentiment_model_eng.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Restore the trained weights for both classifiers.
# NOTE(review): absolute, machine-specific checkpoint paths.
kor_weights_path = '/home/cshoon036/MixMind/mm_bert/korean/korean_binary_sentiment_weights/korean_binary_sentiment_weights'
eng_weights_path = '/home/cshoon036/MixMind/mm_bert/english/new_multi_sentiment_weights/new_multi_sentiment_weights'

sentiment_model_kor.load_weights(kor_weights_path)
# Left as the cell's last expression so the returned load status is still displayed.
sentiment_model_eng.load_weights(eng_weights_path)

In [None]:
# Vocabulary for the dictionary-based Korean emotion classification.
# NOTE(review): absolute, machine-specific paths.
vocab_path = '/home/cshoon036/MixMind/mm_bert/korean/vocab_9class_500.csv'
stopwords_path = '/home/cshoon036/MixMind/mm_bert/korean/stopwords.txt'

# load_vocab is not defined in this notebook; presumably it comes from the
# kor_emotion_analysis star import — TODO confirm. The result is passed to
# hybrid_emotion_export_persent in the prediction loop.
vocab = load_vocab(vocab_path, stopwords_path)

In [None]:
# Load the reviews to score.
#
# Disabled alternative: export the reviews directly from the web-service database.
# Credentials must come from the environment and never be written into the notebook;
# any password that was ever saved in this file should be rotated.
# NOTE(review): pymysql.connect() takes `database=`; a `dataset=` keyword looks
# like a typo — confirm before re-enabling this path.
#
# import pymysql
#
# db = pymysql.connect(host=os.environ['MIXMIND_DB_HOST'], port=3306, user=os.environ['MIXMIND_DB_USER'],
#                      password=os.environ['MIXMIND_DB_PASSWORD'], database='mixmind_webservice', charset='utf8')
#
# cur = db.cursor()
# query = "Select id, review from mixmind_musicreview"
#
# review_df_from_db = pd.read_sql(query, db)

# Active path: read the reviews from a local CSV file instead.
# NOTE(review): absolute, machine-specific path.
review_df_from_db = pd.read_csv('/home/cshoon036/MixMind/mm_bert/english/pop_review.csv')

In [None]:
# Review preprocessing.
# Each `review` cell holds the text form of a list (e.g. "['a', 'b']"). It is
# split on commas into individual fragments, de-duplicated, and stripped of the
# YouTube "logged out / please sign in" banner strings that were scraped along
# with the comments.
# NOTE(review): splitting on ',' also cuts a single comment that itself contains
# a comma; fragments keep their surrounding quotes and leading space.

# Banner fragments to discard (with and without the leading space left by the split).
remove_set = {"'YouTube에서 로그아웃한 상태입니다'", " 'YouTube에서 로그아웃한 상태입니다'", " '로그인하면 동영상에 좋아요를 표시하고 댓글을 달거나 구독할 수 있습니다 '", "'로그인하면 동영상에 좋아요를 표시하고 댓글을 달거나 구독할 수 있습니다 '"}


def _clean_review_cell(raw):
    """Turn one `review` cell into a list of unique, non-banner fragments.

    Args:
        raw: the cell value. A string is split as described above; a list is
            taken as already split (so re-running this cell is harmless);
            anything else (e.g. NaN for a missing value) counts as no reviews.

    Returns:
        A list of fragments in first-seen order. Blank fragments are dropped,
        so an empty source list ("[]") yields [] rather than [''].
    """
    if isinstance(raw, list):
        fragments = raw
    elif isinstance(raw, str):
        fragments = raw.lstrip('[').rstrip(']').split(',')
    else:
        return []
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return [frag for frag in dict.fromkeys(fragments)
            if frag.strip() and frag not in remove_set]


# Build the whole column in one pass. Writing a Python list into a single cell via
# .loc[idx, 'review'] is interpreted as an iterable to broadcast and is not a
# reliable way to store list values; it also used positions as index labels.
review_df_from_db['review'] = pd.Series(
    [_clean_review_cell(raw) for raw in review_df_from_db['review']],
    index=review_df_from_db.index,
    dtype=object,
)

review_df_from_db

In [None]:
# Emotion-vector data frame: starts with zero rows and one column per output
# field; the prediction loop appends one row per review.
_emotion_columns = [
    'id',
    'love', 'joy', 'passion', 'happiness',
    'sadness', 'anger', 'loneliness', 'longing',
    'fear', 'surprise',
]
predict_emotion = pd.DataFrame(columns=_emotion_columns, index=range(0))

In [None]:
# Split reviews by language and predict emotions.
#
# For every row of review_df_from_db that has no row in predict_emotion yet, the
# per-fragment emotion scores are summed and then normalised so that each emotion
# is stored as its share of the row's total score (rounded to 5 decimals).

# Score keys used while accumulating -> column names in predict_emotion.
_EMOTION_KEY_TO_COLUMN = {
    'love': 'love',
    'fun': 'joy',
    'enthusiasm': 'passion',
    'happiness': 'happiness',
    'sadness': 'sadness',
    'anger': 'anger',
    'loneliness': 'loneliness',
    'longing': 'longing',
    'fear': 'fear',
    'surprise': 'surprise',
}

# Resume after the rows that were already scored.
_first_new_row = len(predict_emotion)

for idx, row in zip(range(_first_new_row, len(review_df_from_db)), review_df_from_db[_first_new_row:].itertuples()):
    temp_dict = dict.fromkeys(_EMOTION_KEY_TO_COLUMN, 0)
    if row.review:
        for review in row.review:
            # Author's note: a review needs at least 10% Hangul to be handled as
            # Korean; otherwise it is treated as alphabetic text.
            if isKorean_percent(review, 0.1) == 0:
                try:
                    # Alphabetic text can still be Spanish etc.; only text detected
                    # as English is scored.
                    if detect(review) == 'en':
                        # NOTE(review): this pattern also deletes spaces, so the model
                        # receives the words run together — confirm that is intended.
                        eng_emotion = eng_emotion_predict(re.sub('[^a-zA-Z0-9]+', '', review), sentiment_model_eng)
                        for key1, val1 in eng_emotion.items():
                            temp_dict[key1] += val1
                except Exception:
                    # Best effort: skip fragments that cannot be detected or scored.
                    # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
                    continue

            else:
                # Mixed Korean/English reviews handled as Korean have their Latin
                # letters removed so that only the Korean part is scored.
                kor_emotion = hybrid_emotion_export_persent(re.sub('[^가-힣ㄱ-ㅎ0-9]+', '', review), sentiment_model_kor, vocab)
                for key2, val2 in kor_emotion.items():
                    temp_dict[key2] += val2

    # Computed once per row instead of once per column.
    total_score = sum(temp_dict.values())

    # NOTE(review): `id` is the positional row number, not an id taken from the
    # review table — confirm this matches what consumers expect.
    predict_emotion.loc[idx, 'id'] = idx
    for emotion_key, column in _EMOTION_KEY_TO_COLUMN.items():
        # A row with no scored fragment (empty review list, or nothing detected as
        # English/Korean) has a zero total; store 0.0 instead of dividing by zero.
        share = temp_dict[emotion_key] / total_score if total_score else 0.0
        predict_emotion.loc[idx, column] = np.round(share, 5)

In [None]:
# Display the emotion-share rows written by the prediction loop.
predict_emotion