In [None]:
!pip install transformers --upgrade

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from transformers import BertTokenizer, TFBertForSequenceClassification, AutoModel, AutoTokenizer, TFGPT2Model, Trainer, TrainingArguments
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, roc_auc_score, r2_score, roc_curve, auc, mean_absolute_error, confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (Embedding, Conv1D, LSTM, Dense, Conv2D, GlobalMaxPooling2D, GlobalMaxPooling1D, Reshape)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec
import urllib.request
from tqdm import tqdm
import seaborn as sns
import os

# SECURITY: never hardcode a Hugging Face access token in the notebook. Provide it
# through the HF_TOKEN environment variable before starting the kernel. Any token
# that was ever committed to source control must be treated as leaked and revoked.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; Hugging Face Hub uploads may fail until a token is provided.")

# NOTE(review): per the huggingface_hub docs HF_HOME names the local cache
# directory, but this value looks like a Hub repository id - confirm the intent.
os.environ["HF_HOME"] = "Nampromotion/KoGPT2-Review_Helpfulness"


# Google Maps

In [None]:
def save_model_to_huggingface(model, tokenizer, model_name):
    """Save the model and tokenizer, then push both to the Hugging Face Hub.

    Args:
        model: Object exposing ``save_pretrained`` and ``push_to_hub``.
        tokenizer: Object exposing the same two methods.
        model_name: Passed unchanged to every call, so the same value serves as
            the local save target and as the name given to ``push_to_hub``.
    """
    artifacts = (model, tokenizer)

    # Save the model first, then the tokenizer.
    for artifact in artifacts:
        artifact.save_pretrained(model_name)

    # Upload to Hugging Face in the same order.
    for artifact in artifacts:
        artifact.push_to_hub(model_name)


# Load the data
def load_data(train_file, test_file):
    """Read the training and test CSV files.

    Args:
        train_file: Path or file-like object accepted by ``pd.read_csv``.
        test_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A ``(train_data, test_data)`` tuple holding the two ``pd.read_csv`` results.
    """
    # The training file is read first, then the test file.
    frames = [pd.read_csv(csv_source) for csv_source in (train_file, test_file)]
    return frames[0], frames[1]


# Data preprocessing
def preprocess_data(data, tokenizer, max_seq_len):
    """Tokenize the review texts and collect their labels as arrays.

    Args:
        data: Table with a ``'review_text'`` and a ``'review_usefulness'`` column.
        tokenizer: Callable invoked once on the full list of texts with
            ``padding=True, truncation=True, max_length=max_seq_len``; its result
            must be indexable by ``'input_ids'``.
        max_seq_len: Forwarded to the tokenizer as ``max_length``.

    Returns:
        ``(input_ids, data_labels)``: a NumPy array built from the tokenizer's
        ``'input_ids'`` and the labels as an ``int32`` NumPy array.
    """
    # Pull every review and every label out of the table as plain lists.
    review_texts = data['review_text'].tolist()
    usefulness = data['review_usefulness'].tolist()

    # Tokenize all texts in a single batch call.
    encoded = tokenizer(
        review_texts,
        padding=True,
        truncation=True,
        max_length=max_seq_len,
    )

    # Only the token ids are kept; anything else the tokenizer returns is dropped.
    token_matrix = np.array(encoded['input_ids'])
    label_vector = np.asarray(usefulness, dtype=np.int32)
    return token_matrix, label_vector

# Model definition (revised)
def build_model(model_name):
    """Build and compile a binary classifier on top of a pretrained GPT-2 backbone.

    Args:
        model_name: Identifier handed to ``TFGPT2Model.from_pretrained`` (called
            with ``from_pt=True``).

    Returns:
        A ``tf.keras.Model`` with a one-unit sigmoid output, compiled with Adam
        (learning rate 5e-5), binary cross-entropy loss and the accuracy metric.
    """

    class TFGPT2ForSequenceClassification(tf.keras.Model):
        """GPT-2 backbone followed by dropout and a one-unit sigmoid head."""

        def __init__(self):
            super().__init__()
            self.gpt = TFGPT2Model.from_pretrained(model_name, from_pt=True)
            self.dropout = tf.keras.layers.Dropout(0.2)
            self.classifier = tf.keras.layers.Dense(
                1,
                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                activation='sigmoid',
                name='classifier',
            )

        def call(self, inputs):
            # `inputs` is forwarded to the backbone as token ids.
            backbone_outputs = self.gpt(input_ids=inputs)
            # First backbone output, sliced at the last index of its second axis.
            # NOTE(review): presumably the hidden state of the final sequence
            # position; with right-padded batches that position may be a pad
            # token - confirm this is intended.
            last_position = backbone_outputs[0][:, -1]
            return self.classifier(self.dropout(last_position))

    model = TFGPT2ForSequenceClassification()
    adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
    bce = tf.keras.losses.BinaryCrossentropy()
    model.compile(optimizer=adam, loss=bce, metrics=['accuracy'])
    return model

# Model training function
def train_model(model, train_X, train_y, batch_size, epochs, early_stopping, model_checkpoint, validation_split=0.1):
    """Fit the model, plot the loss/accuracy curves and return the training history.

    Args:
        model: Object whose ``fit`` method is called with the arguments below.
        train_X: Training inputs passed to ``fit``.
        train_y: Training targets passed to ``fit``.
        batch_size: Forwarded to ``fit``.
        epochs: Forwarded to ``fit``.
        early_stopping: First callback handed to ``fit``.
        model_checkpoint: Second callback handed to ``fit``.
        validation_split: Forwarded to ``fit``; defaults to the previously
            hard-coded 0.1.

    Returns:
        The object returned by ``model.fit`` (its ``.history`` mapping is what
        gets plotted), so callers can inspect the curves afterwards.
    """
    history = model.fit(
        train_X,
        train_y,
        validation_split=validation_split,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stopping, model_checkpoint],
    )

    # Plot the loss and accuracy recorded during training, one panel per metric.
    curves = history.history
    panels = (
        ('loss', 'Loss', 'Loss Evolution', 'train loss', 'val loss'),
        ('accuracy', 'Accuracy', 'Accuracy Evolution', 'train acc', 'val acc'),
    )

    plt.figure(figsize=(12, 6))
    for position, (metric, ylabel, title, train_label, val_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        # A curve is skipped when its key was not recorded (e.g. no validation
        # data), instead of failing with a KeyError after training has finished.
        for key, label in ((metric, train_label), ('val_' + metric, val_label)):
            if key in curves:
                plt.plot(curves[key], label=label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        plt.legend()

    plt.tight_layout()
    plt.show()

    return history


# Model evaluation (revised)
def evaluate_model(model, test_X, test_y):
    """Score the model on a test set and print a confusion matrix and report.

    Args:
        model: Object whose ``predict(test_X)`` output supports ``> 0.5`` and
            ``.astype(int)``.
        test_X: Inputs passed to ``model.predict``.
        test_y: Ground-truth labels compared against the predictions.

    Returns:
        Tuple ``(acc, f1, precision, recall, roc_auc, mse, rmse, mae, r2)``. The
        first four are computed from predictions thresholded at 0.5; the rest
        from the raw ``predict`` output.
    """
    scores = model.predict(test_X)
    # Strictly greater than 0.5 counts as the positive class.
    hard_labels = (scores > 0.5).astype(int)

    # Metrics on the thresholded labels.
    acc, f1, precision, recall = (
        metric(test_y, hard_labels)
        for metric in (accuracy_score, f1_score, precision_score, recall_score)
    )

    # Metrics on the raw model outputs.
    roc_auc = roc_auc_score(test_y, scores)
    mse = mean_squared_error(test_y, scores)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test_y, scores)
    r2 = r2_score(test_y, scores)

    # Confusion matrix
    matrix = confusion_matrix(test_y, hard_labels)
    print('Confusion Matrix:')
    print(matrix)

    # Classification report
    report = classification_report(test_y, hard_labels)
    print('Classification Report:')
    print(report)

    return acc, f1, precision, recall, roc_auc, mse, rmse, mae, r2

# Model prediction and interpretation
def predict_and_interpret(model, tokenizer, example, max_seq_len):
    """Score a single text and return the model's output for it.

    Args:
        model: Object whose ``predict`` accepts a 2-D batch of token ids and
            returns something indexable as ``[0][0]``.
        tokenizer: Provides ``bos_token``, ``eos_token``, ``pad_token_id``,
            ``tokenize`` and ``convert_tokens_to_ids``.
        example: The text to score.
        max_seq_len: Length the id sequence is padded (or cut) to.

    Returns:
        float: The first output value for the single example.
    """
    # NOTE(review): BOS/EOS tokens are added here but not in preprocess_data -
    # confirm which convention the model was trained with.
    tokens = [tokenizer.bos_token] + tokenizer.tokenize(example) + [tokenizer.eos_token]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Keep the leading batch axis: the model must receive shape (1, max_seq_len).
    # Wrapping a 1-D row in a Python list does not give predict() a batch axis.
    input_ids = pad_sequences([token_ids], maxlen=max_seq_len, value=tokenizer.pad_token_id, padding='post')

    prediction = model.predict(input_ids)
    # Previously the value was computed and then dropped; return it to the caller.
    return float(prediction[0][0])

def train_kogpt2(
    train_file='/home/olga/NSJ/전처리/train.csv',
    test_file='/home/olga/NSJ/전처리/test.csv',
    model_name='skt/kogpt2-base-v2',
    checkpoint_path='/home/olga/NSJ/KoGPT2-Review_Helpfulness',
    max_seq_len=1024,
    batch_size=32,
    epochs=100,
):
    """Train the KoGPT2 classifier end to end and predict on the test set.

    Every argument defaults to the value that used to be hard-coded, so calling
    ``train_kogpt2()`` with no arguments behaves as before.

    Args:
        train_file: CSV path passed to ``load_data`` for training data.
        test_file: CSV path passed to ``load_data`` for test data.
        model_name: Checkpoint name used for both the tokenizer and the model.
        checkpoint_path: Where ``ModelCheckpoint`` writes during training and
            where the weights are reloaded from afterwards.
        max_seq_len: Maximum token length handed to ``preprocess_data``.
        batch_size: Batch size handed to ``train_model``.
        epochs: Epoch limit handed to ``train_model``.

    Returns:
        ``(test_y, predictions)``: the test labels and the raw ``model.predict``
        output for the test inputs (not thresholded).
    """
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)
    # NOTE(review): no `monitor` is passed here, so the checkpoint callback falls
    # back to its default metric, which may differ from the val_accuracy used by
    # early stopping - confirm both callbacks should agree on "best".
    model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, save_format="tf")

    # Load the data
    train_data, test_data = load_data(train_file, test_file)

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token='</s>', eos_token='</s>', pad_token='<pad>')

    # Preprocess the data
    train_X, train_y = preprocess_data(train_data, tokenizer, max_seq_len)
    test_X, test_y = preprocess_data(test_data, tokenizer, max_seq_len)

    # Build the model
    model = build_model(model_name)

    # Train the model
    train_model(model, train_X, train_y, batch_size, epochs, early_stopping, model_checkpoint)

    # Reload the best checkpoint; the path is shared with ModelCheckpoint above
    # so the two can never drift apart.
    model.load_weights(checkpoint_path)

    # Predict
    predictions = model.predict(test_X)

    # The earlier code returned y_pred; here the raw predictions
    # (probabilities) are returned instead.
    return test_y, predictions

# Run the KoGPT2 pipeline and keep the test labels and the model's predictions
# that train_kogpt2 returns.
y_test_kogpt2, y_pred_kogpt2 = train_kogpt2()