In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from transformers import BertTokenizer, TFBertForSequenceClassification, AutoModel, AutoTokenizer, TFGPT2Model, Trainer, TrainingArguments
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, roc_auc_score, r2_score, roc_curve, auc, mean_absolute_error, confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (Embedding, Conv1D, LSTM, Dense, Conv2D, GlobalMaxPooling2D, GlobalMaxPooling1D, Reshape)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec
import urllib.request
from tqdm import tqdm
import seaborn as sns
import os

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# Secrets must never live in source control: the previously committed token
# should be treated as leaked and revoked. Provide the token from outside the
# notebook instead (e.g. `export HF_TOKEN=...` or `huggingface-cli login`).
import warnings

if not os.environ.get("HF_TOKEN"):
    warnings.warn(
        "HF_TOKEN is not set; access to private or gated Hugging Face "
        "repositories will fail until it is provided via the environment."
    )

# NOTE(review): HF_HOME is documented as the local cache/config directory, but
# this value looks like a Hub repo id -- confirm this is intended.
# NOTE(review): this is assigned after `transformers` has been imported, so it
# may be read too late to take effect -- confirm.
os.environ["HF_HOME"] = "Nampromotion/KoGPT2-Review_Helpfulness"


In [None]:
# Matches every character that is neither a Hangul syllable nor whitespace.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
_NON_HANGUL_PATTERN = re.compile(r"[^가-힣\s]")


def preprocess_text(text):
    """Strip everything except Hangul syllables and whitespace from *text*.

    Args:
        text: The raw review text. Non-string values (for example the float
            NaN that pandas produces for empty CSV cells, or ``None``) are
            treated as missing text.

    Returns:
        The cleaned string; ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise make ``re.sub`` raise TypeError.
        return ""
    return _NON_HANGUL_PATTERN.sub("", text)

def train_cnn_lstm(train_path='/home/olga/NSJ/전처리/train.csv',
                   test_path='/home/olga/NSJ/전처리/test.csv'):
    """Train a CNN-LSTM classifier on review text and evaluate it.

    Both CSV files must contain a ``review_text`` column and a
    ``review_usefulness`` label column. The function trains Word2Vec
    embeddings on the training text, saves them to ``word2vec.model`` in the
    current working directory, fits the Keras model with early stopping,
    plots the loss/accuracy history and prints the test metrics.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.

    Returns:
        A tuple ``(y_true, y_pred)``: the ``review_usefulness`` column of the
        test data and the raw output of ``model.predict`` on the test data.
    """
    embedding_dim = 100

    # Load the data.
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Empty CSV cells are read as NaN; replace them with "" so that both the
    # cleaning step and the Keras tokenizer only ever receive strings.
    train_data['review_text'] = (
        train_data['review_text'].fillna('').astype(str).apply(preprocess_text)
    )
    test_data['review_text'] = (
        test_data['review_text'].fillna('').astype(str).apply(preprocess_text)
    )

    # Tokenize the text.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_data['review_text'])
    X_train_tokenized = tokenizer.texts_to_sequences(train_data['review_text'])
    X_test_tokenized = tokenizer.texts_to_sequences(test_data['review_text'])

    # Word -> integer index mapping learned by the tokenizer.
    word_index = tokenizer.word_index

    # Build the Word2Vec training sentences from the tokenizer's own output.
    # A plain ``str.split()`` tokenizes differently from the Keras Tokenizer,
    # so words in ``word_index`` could be missing from the Word2Vec
    # vocabulary and silently end up with all-zero embeddings.
    index_word = tokenizer.index_word
    sentences = [[index_word[idx] for idx in seq] for seq in X_train_tokenized]

    # Train and persist the Word2Vec model.
    word2vec_model = Word2Vec(sentences=sentences, vector_size=embedding_dim,
                              window=5, min_count=1, sg=0, workers=4)
    word2vec_model.save("word2vec.model")

    # Build the embedding matrix; row 0 stays zero for the padding index.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    keyed_vectors = word2vec_model.wv
    for word, i in word_index.items():
        if word in keyed_vectors:
            embedding_matrix[i] = keyed_vectors[word]

    # Pad every sequence to the length of the longest training sequence.
    X_train_padded = pad_sequences(X_train_tokenized, padding='post')
    X_test_padded = pad_sequences(X_test_tokenized, padding='post',
                                  maxlen=X_train_padded.shape[1])

    # Build the CNN-LSTM model on top of the frozen Word2Vec embeddings.
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
        Conv1D(128, 5, activation='relu'),
        LSTM(64),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping on validation accuracy, restoring the best weights.
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    # Train the model.
    history = model.fit(X_train_padded, train_data['review_usefulness'], epochs=100,
                        validation_split=0.2, callbacks=[early_stopping])

    # Predict on the test set and threshold the sigmoid output at 0.5.
    y_true = test_data['review_usefulness']
    y_pred = model.predict(X_test_padded)
    y_pred_class = (y_pred > 0.5).astype("int32")

    # Visualize loss and accuracy over the training epochs.
    plt.figure(figsize=(12, 4))

    # Loss curves.
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss Evolution')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Accuracy curves.
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy Evolution')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Performance metrics: class metrics use the thresholded predictions,
    # ROC AUC and the regression-style errors use the raw model output.
    acc = accuracy_score(y_true, y_pred_class)
    f1 = f1_score(y_true, y_pred_class)
    precision = precision_score(y_true, y_pred_class)
    recall = recall_score(y_true, y_pred_class)
    roc_auc = roc_auc_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print('Accuracy:', acc)
    print('F1 Score:', f1)
    print('Precision:', precision)
    print('Recall:', recall)
    print('ROC AUC:', roc_auc)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('Mean Absolute Error:', mae)
    print('R2 Score:', r2)

    return y_true, y_pred

#y_test_cnn_lstm, y_pred_cnn_lstm = train_cnn_lstm()

# Yelp

In [None]:
# Train and evaluate the CNN-LSTM classifier on the Yelp data. The CSV files
# must provide a 'text' column and a 'useful' label column.

# Load the data.
train_data = pd.read_csv('/home/olga/NSJ/Yelp/train.csv')
test_data = pd.read_csv('/home/olga/NSJ/Yelp/test.csv')

# Empty CSV cells are read as NaN; replace them with "" so the Keras
# tokenizer only ever receives strings.
train_texts = train_data['text'].fillna('').astype(str)
test_texts = test_data['text'].fillna('').astype(str)

# Tokenize the text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
X_train_tokenized = tokenizer.texts_to_sequences(train_texts)
X_test_tokenized = tokenizer.texts_to_sequences(test_texts)

# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Build the Word2Vec training sentences from the tokenizer's own output.
# The Keras Tokenizer lowercases and strips punctuation, whereas a plain
# ``str.split()`` does not, so training Word2Vec on split() tokens left many
# ``word_index`` entries without a vector (all-zero embedding rows).
sentences = [[tokenizer.index_word[idx] for idx in seq] for seq in X_train_tokenized]

# Train and persist the Word2Vec model.
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=0, workers=4)
word2vec_model.save("word2vec.model")

# Build the embedding matrix; row 0 stays zero for the padding index.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        vector = word2vec_model.wv[word]
        embedding_matrix[i] = vector

# Pad every sequence to the length of the longest training sequence.
X_train_padded = pad_sequences(X_train_tokenized, padding='post')
X_test_padded = pad_sequences(X_test_tokenized, padding='post', maxlen=X_train_padded.shape[1])

# Build the CNN-LSTM model on top of the frozen Word2Vec embeddings.
model = Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False),
    Conv1D(128, 5, activation='relu'),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

# Compile the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping on validation accuracy, restoring the best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# Train the model.
history = model.fit(X_train_padded, train_data['useful'], epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Predict on the test set and threshold the sigmoid output at 0.5.
y_pred = model.predict(X_test_padded)
y_pred_class = (y_pred > 0.5).astype("int32")

# Performance metrics: class metrics use the thresholded predictions,
# ROC AUC and the regression-style errors use the raw model output.
acc = accuracy_score(test_data['useful'], y_pred_class)
f1 = f1_score(test_data['useful'], y_pred_class)
precision = precision_score(test_data['useful'], y_pred_class)
recall = recall_score(test_data['useful'], y_pred_class)
roc_auc = roc_auc_score(test_data['useful'], y_pred)
mse = mean_squared_error(test_data['useful'], y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_data['useful'], y_pred)
r2 = r2_score(test_data['useful'], y_pred)

print('Accuracy:', acc)
print('F1 Score:', f1)
print('Precision:', precision)
print('Recall:', recall)
print('ROC AUC:', roc_auc)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)
print('R2 Score:', r2)

# Plot accuracy and loss per epoch.
plt.figure(figsize=[8, 6])
plt.plot(history.history['accuracy'], 'r', linewidth=3.0)
plt.plot(history.history['val_accuracy'], 'b', linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'], fontsize=18)
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)
plt.title('Accuracy Curves', fontsize=16)
plt.show()

plt.figure(figsize=[8, 6])
plt.plot(history.history['loss'], 'r', linewidth=3.0)
plt.plot(history.history['val_loss'], 'b', linewidth=3.0)
plt.legend(['Training Loss', 'Validation Loss'], fontsize=18)
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Loss', fontsize=16)
plt.title('Loss Curves', fontsize=16)
plt.show()