In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from transformers import BertTokenizer, TFBertForSequenceClassification, AutoModel, AutoTokenizer, TFGPT2Model, Trainer, TrainingArguments
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, roc_auc_score, r2_score, roc_curve, auc, mean_absolute_error, confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (Embedding, Conv1D, LSTM, Dense, Conv2D, GlobalMaxPooling2D, GlobalMaxPooling1D, Reshape)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec
import urllib.request
from tqdm import tqdm
import seaborn as sns
import os

# SECURITY: a Hugging Face access token used to be hardcoded on this line.
# Treat that token as leaked and revoke it. The token must now be supplied from
# outside the notebook, through the HF_TOKEN environment variable, so that it
# never ends up in version control or in a shared copy of the notebook.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; export it before starting the kernel "
          "if authenticated Hugging Face access is needed.")

# NOTE(review): this value looks like a Hub repository id, not a directory.
# HF_HOME is presumably meant to be a local cache location, and it is assigned
# after `transformers` was imported, so it may have no effect — confirm intent.
os.environ["HF_HOME"] = "Nampromotion/KoGPT2-Review_Helpfulness"


  from cryptography.hazmat.bindings.openssl.binding import Binding


In [2]:
def _clean_review_text(reviews):
    """Keep only Hangul syllables and whitespace in every review of a Series.

    Missing values become empty strings and non-string cells are converted
    with ``str`` first, so ``re.sub`` is never handed a non-string value.
    """
    return reviews.fillna('').astype(str).apply(
        lambda text: re.sub(r'[^가-힣\s]', '', text)
    )


def _plot_roc_curve(y_true, y_score, roc_auc):
    """Draw the ROC curve for ``y_score`` against ``y_true`` and show the figure."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    plt.show()


def train_svm(train_path='/home/olga/NSJ/전처리/train.csv',
              test_path='/home/olga/NSJ/전처리/test.csv'):
    """Train a TF-IDF + SVM classifier on review text and evaluate it.

    The function reads the train/test CSV files, strips non-Hangul characters
    from the ``review_text`` column, vectorizes the text with TF-IDF, tunes an
    ``SVC`` with ``GridSearchCV`` over ``C`` and ``kernel``, prints a set of
    evaluation metrics for the test split and shows the ROC curve.

    Args:
        train_path: CSV file with ``review_text`` and ``review_usefulness``
            columns used for fitting. Defaults to the original hardcoded path.
        test_path: CSV file with the same columns used for evaluation.
            Defaults to the original hardcoded path.

    Returns:
        A tuple ``(y_test, y_pred)``: the ``review_usefulness`` column of the
        test file and the predicted probability taken from column 1 of
        ``predict_proba`` for every test row.

    Raises:
        FileNotFoundError: if either CSV path does not exist.
    """
    # Load the data
    print("Start loading data...")
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    print("Data loading completed.")

    # Apply text preprocessing (Hangul and whitespace only; NaN -> empty string)
    print('Start preprocessing...')
    X_train = _clean_review_text(train_data['review_text'])
    y_train = train_data['review_usefulness']
    X_test = _clean_review_text(test_data['review_text'])
    y_test = test_data['review_usefulness']
    print('Preprocessing completed.')

    # TF-IDF transform: the vocabulary is fitted on the training split only
    print('Start TF-IDF...')
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    print('TF-IDF completed.')

    # Hyperparameter tuning using GridSearchCV with parallel processing and more verbose output
    print('Start Grid Search...')
    param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    # probability=True is required so the fitted model exposes predict_proba.
    grid = GridSearchCV(SVC(probability=True), param_grid, refit=True, verbose=3, n_jobs=-1)
    grid.fit(X_train_tfidf, y_train)
    print('Grid Search completed.')

    # Use the best model found by the search (already refit because refit=True)
    svm_model = grid.best_estimator_

    # Predict
    print('Start prediction...')
    # Column 1 of predict_proba; assumes a binary problem whose positive label
    # sorts second (e.g. labels 0/1) — TODO confirm against the dataset.
    y_pred = svm_model.predict_proba(X_test_tfidf)[:, 1]
    # Turn probabilities into binary class labels with a 0.5 threshold
    y_pred_class = (y_pred > 0.5).astype(int)
    print('Prediction completed.')

    # Compute evaluation metrics
    print('Start evaluation...')
    # Classification metrics use the thresholded labels.
    acc = accuracy_score(y_test, y_pred_class)
    f1 = f1_score(y_test, y_pred_class)
    precision = precision_score(y_test, y_pred_class)
    recall = recall_score(y_test, y_pred_class)
    # ROC AUC and the error metrics below use the raw probabilities.
    roc_auc = roc_auc_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from MSE: the `squared=False` argument of mean_squared_error
    # is deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print('Evaluation completed.')

    # Print the results
    print('Best Parameters:', grid.best_params_)
    print('Accuracy:', acc)
    print('F1 Score:', f1)
    print('Precision:', precision)
    print('Recall:', recall)
    print('ROC AUC:', roc_auc)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print('R2 Score:', r2)

    # Draw the ROC curve
    _plot_roc_curve(y_test, y_pred, roc_auc)

    # Return the true labels together with the predicted probabilities
    return y_test, y_pred

# Call the function and keep its two return values under SVM-specific names
y_test_svm, y_pred_svm = train_svm()

Start loading data...


KeyboardInterrupt: 