In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from transformers import BertTokenizer, TFBertForSequenceClassification, AutoModel, AutoTokenizer, TFGPT2Model, Trainer, TrainingArguments
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, roc_auc_score, r2_score, roc_curve, auc, mean_absolute_error, confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (Embedding, Conv1D, LSTM, Dense, Conv2D, GlobalMaxPooling2D, GlobalMaxPooling1D, Reshape)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec
import urllib.request
from tqdm import tqdm
import seaborn as sns
import os

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# Secrets must never be committed to source control; the token that was
# committed here should be revoked. Supply the token from outside the notebook
# instead, e.g. `export HF_TOKEN=...` before starting the kernel or
# `huggingface-cli login`.

# NOTE(review): HF_HOME is documented as the Hugging Face cache directory, but
# this value looks like a model repo id, and it is assigned after
# `transformers` has already been imported above - confirm the intent.
os.environ["HF_HOME"] = "Nampromotion/KoGPT2-Review_Helpfulness"


In [None]:
def train_kobert(train_path='/home/olga/NSJ/전처리/train.csv',
                 test_path='/home/olga/NSJ/전처리/test.csv'):
    """Fine-tune KoBERT as a 2-class review-usefulness classifier and evaluate it.

    Reads the train/test CSV files, tokenizes the ``review_text`` column,
    trains ``TFBertForSequenceClassification`` with early stopping, prints
    classification/regression-style metrics for the test split and shows the
    training-history and ROC plots.

    Args:
        train_path: CSV file with ``review_text`` and ``review_usefulness``
            columns used for training.
        test_path: CSV file with the same columns used for validation during
            training and for the final evaluation.

    Returns:
        tuple: ``(y_test, logits)`` - the ``review_usefulness`` column of the
        test CSV and the raw logits returned by ``model.predict`` for the
        test set.

    Note:
        Assumes ``review_usefulness`` holds integer labels 0/1 (the model is
        built with ``num_labels=2`` and binary sklearn metrics are used) -
        TODO confirm against the preprocessing step.
    """
    # Load data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    X_train = train_data['review_text']
    y_train = train_data['review_usefulness']
    X_test = test_data['review_text']
    y_test = test_data['review_usefulness']

    # Convert the model to its TensorFlow version: download the checkpoint,
    # save it locally, then load that directory with from_pt=True.
    model_pt = AutoModel.from_pretrained("monologg/kobert")
    model_pt.save_pretrained("./kobert_from_pt", saved_model=True)
    model = TFBertForSequenceClassification.from_pretrained("./kobert_from_pt", from_pt=True, num_labels=2)

    # Initialize the KoBERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

    # Tokenize the data
    train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

    # Early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    # Compile the model; the classification head outputs logits, hence from_logits=True
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Prepare the training dataset
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        y_train
    )).shuffle(1000).batch(32)

    # Prepare the validation dataset (not shuffled, so predictions stay aligned with y_test)
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        y_test
    )).batch(32)

    # Train the model.
    # NOTE(review): the test split doubles as the early-stopping validation
    # set, so the metrics reported below are optimistically biased. Consider
    # holding out part of the training data for validation instead.
    history = model.fit(train_dataset, epochs=100, validation_data=test_dataset, callbacks=[early_stopping])

    # Predict and compute performance metrics
    y_pred = model.predict(test_dataset)
    logits = y_pred.logits
    y_pred_class = np.argmax(logits, axis=1)
    # ROC metrics need the score of the positive class (label 1), not the
    # confidence of whichever class happened to be predicted.
    y_pred_proba = tf.nn.softmax(logits, axis=1).numpy()[:, 1]

    acc = accuracy_score(y_test, y_pred_class)
    f1 = f1_score(y_test, y_pred_class)
    precision = precision_score(y_test, y_pred_class)
    recall = recall_score(y_test, y_pred_class)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    mae = mean_absolute_error(y_test, y_pred_class)
    mse = mean_squared_error(y_test, y_pred_class)
    # `squared=False` is deprecated/removed in recent scikit-learn releases;
    # taking the square root of the MSE is version-independent.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred_class)

    # Print the results
    print('Accuracy:', acc)
    print('F1 Score:', f1)
    print('Precision:', precision)
    print('Recall:', recall)
    print('ROC AUC:', roc_auc)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('R2 Score:', r2)

    _plot_training_history(history)
    _plot_roc_curve(y_test, y_pred_proba)

    return y_test, logits


def _plot_training_history(history):
    """Plot train/validation loss and accuracy per epoch side by side."""
    plt.figure(figsize=(12, 4))

    # Loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss Evolution')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy Evolution')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()


def _plot_roc_curve(y_true, y_score):
    """Plot the ROC curve for true labels against positive-class scores."""
    # Continuous scores are required here; hard 0/1 predictions would reduce
    # the curve to a single operating point.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

# Run train_kobert and collect its results (labels and logits)
#y_test_kobert, y_pred_kobert = train_kobert()