In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import re


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import math
import optuna
import torch.optim as optim
import time
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)
torch.cuda.is_available()

cuda:0


True

In [4]:
df = pd.read_csv("./tokenized_0.csv")

### Embeding vector(태양)

In [5]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# 각 행의 텍스트를 토큰화하여 리스트로 변환
tokenized_corpus = [str(sentence).lower().split() for sentence in df['comments'] if pd.notnull(sentence)]

def objective(trial):
    # 하이퍼파라미터 탐색할 범위 지정
    vector_size = trial.suggest_categorical('vector_size', [50, 100])
    window = trial.suggest_categorical('window', [2, 3])
    min_count = trial.suggest_categorical('min_count', [1, 2])
    sg = trial.suggest_categorical('sg', [1, 2])

    # Word2Vec 모델 정의
    model = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, sg=sg)

    # 모델 학습
    model.build_vocab(tokenized_corpus)
    model.train(tokenized_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # 목적 함수(예를 들어, 여기선 validation loss 등)를 반환하도록 작성
    # 여기서는 단순히 학습된 모델의 전체 손실값을 반환하는 것으로 가정합니다
    loss = model.get_latest_training_loss()

    return loss

# Optuna를 사용하여 최적화 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)  # 시행 횟수는 필요에 따라 조정할 수 있습니다

# 최적의 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

[I 2023-11-27 18:17:38,601] A new study created in memory with name: no-name-0d806ca4-7811-40ad-8520-98380967f945
[I 2023-11-27 18:17:39,160] Trial 0 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 2, 'min_count': 2, 'sg': 1}. Best is trial 0 with value: 0.0.
[I 2023-11-27 18:17:39,688] Trial 1 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 3, 'min_count': 2, 'sg': 1}. Best is trial 0 with value: 0.0.
[I 2023-11-27 18:17:40,451] Trial 2 finished with value: 0.0 and parameters: {'vector_size': 100, 'window': 3, 'min_count': 1, 'sg': 2}. Best is trial 0 with value: 0.0.
[I 2023-11-27 18:17:40,862] Trial 3 finished with value: 0.0 and parameters: {'vector_size': 100, 'window': 3, 'min_count': 2, 'sg': 1}. Best is trial 0 with value: 0.0.
[I 2023-11-27 18:17:41,370] Trial 4 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 3, 'min_count': 1, 'sg': 1}. Best is trial 0 with value: 0.0.
[I 2023-11-27 18:17:41,832] Trial 5 fin

Best parameters: {'vector_size': 50, 'window': 2, 'min_count': 2, 'sg': 1}


##### 매핑될 사전set vocab 정의

In [7]:
# 데이터셋에서 모든 단어 수집
all_words = [word for sentence in tokenized_corpus for word in sentence]

# 중복 제거를 통해 고유한 단어만 남기고 크기 계산
vocab = set(all_words)
vocab_size = len(vocab)


#### 패딩 및 dataloader 생성

In [13]:
# 데이터셋을 train, valid, test로 나눔
train_data, test_data = train_test_split(df['comments'], test_size=0.2, random_state=42)
train_data, valid_data = train_test_split(train_data, test_size=0.25, random_state=42)  # 나머지를 valid로

# 각 행의 텍스트를 토큰화하여 리스트로 변환
train_tokenized = [str(sentence).lower().split() for sentence in train_data if pd.notnull(sentence)]
valid_tokenized = [str(sentence).lower().split() for sentence in valid_data if pd.notnull(sentence)]
test_tokenized = [str(sentence).lower().split() for sentence in test_data if pd.notnull(sentence)]

# 문장의 최대 길이를 찾음
max_seq_length = max(len(sentence) for sentence in tokenized_corpus)

# 패딩 추가하여 문장의 길이를 최대 길이로 맞춤
padded_tokenized = [sentence + ['<PAD>'] * (max_seq_length - len(sentence)) for sentence in tokenized_corpus]

# 문장을 숫자로 변환하는 과정에서 패딩된 단어에 대한 인덱스를 0으로 처리하는 방식을 고려하여 각 단어에 고유한 숫자를 매핑해줌
word_to_index = {word: idx for idx, word in enumerate(vocab)}
padded_indexed = [[word_to_index[word] for word in sentence] for sentence in padded_tokenized]

# PyTorch Tensor로 변환
padded_tensor = torch.LongTensor(padded_indexed)

# 데이터셋 클래스 정의padded_indexed
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# 데이터셋 인스턴스 생성
train_dataset = MyDataset(padded_tensor)
valid_dataset = MyDataset(padded_tensor)
test_dataset = MyDataset(padded_tensor)

# 데이터 로더 생성
batch_size = 32  # 배치 크기 설정
trn_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

##### 모델 정의, 모델 인스턴스 및 embeding 벡터 입력

In [14]:

# LSTM 모델 정의
class MyLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size, n_classes=10):
        super(MyLSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_classes = n_classes
        self.lstm = nn.LSTM(input_size=self.input_dim, 
                            hidden_size=self.hidden_dim, 
                            batch_first=True)
        self.fc = nn.Linear(in_features=self.hidden_dim,
                            out_features=self.n_classes)
        self.embedding = nn.Embedding(vocab_size, input_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

# 모델 튜닝 결과를 기반으로 LSTM 모델 생성
best_params = study.best_params
vector_size = best_params['vector_size']
window = best_params['window']
min_count = best_params['min_count']
sg = best_params['sg']

# Word2Vec 모델 훈련
word2vec_model = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, sg=sg)
word2vec_model.build_vocab(tokenized_corpus)
word2vec_model.train(tokenized_corpus, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)


# Word2Vec 모델을 통해 패딩이 추가된 임베딩된 데이터를 PyTorch Tensor로 변환
embedded_data = torch.tensor(padded_indexed, dtype=torch.float32)

# LSTM 모델의 입력 차원 설정
input_dim = vector_size  # 임베딩 차원을 LSTM 입력 차원으로 설정

# LSTM 모델 인스턴스 생성
lstm_model = MyLSTM(input_dim=input_dim, hidden_dim=50, vocab_size=len(word2vec_model.wv.key_to_index), n_classes=10)

In [15]:
def train(model, data_loader, optimizer, criterion, device):
    model.train() # 모델을 학습모드로!
    trn_loss = 0
    for i, (label, text) in enumerate(data_loader):
        # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
        x = torch.LongTensor(text).to(device)
        y = torch.LongTensor(label).to(device)
        
        # Step 2. gradient 초기화
        optimizer.zero_grad()
        
        # Step 3. Forward Propagation
        y_pred_prob = model(x)
        
        # Step 4. Loss Calculation
        loss = criterion(y_pred_prob, y)
        
        # Step 5. Gradient Calculation (Backpropagation)
        loss.backward()
        
        # Step 6. Update Parameter (by Gradient Descent)
        optimizer.step()
        
        # Step 7. trn_loss 변수에 mini-batch loss를 누적해서 합산
        trn_loss += loss.item()
        
    # Step 8. 데이터 한 개당 평균 train loss
    avg_trn_loss = trn_loss / len(data_loader.dataset)
    return avg_trn_loss

In [16]:
def evaluate(model, data_loader, optimizer, criterion, device):
    model.eval() # 모델을 평가모드로!
    eval_loss = 0
    
    results_pred = []
    results_real = []
    with torch.no_grad(): # evaluate()함수에는 단순 forward propagation만 할 뿐, gradient 계산 필요 X.
        for i, (label, text) in enumerate(data_loader):
            # Step 1. mini-batch에서 x,y 데이터를 얻고, 원하는 device에 위치시키기
            x = torch.LongTensor(text).to(device)
            y = torch.LongTensor(label).to(device)

            # Step 2. Forward Propagation
            y_pred_prob = model(x)

            # Step 3. Loss Calculation
            loss = criterion(y_pred_prob, y)
            
            # Step 4. Predict label
            y_pred_label = torch.argmax(y_pred_prob, dim=1)
            
            # Step 5. Save real and predicte label
            results_pred.extend(y_pred_label.detach().cpu().numpy())
            results_real.extend(y.detach().cpu().numpy())
            
            # Step 6. eval_loss변수에 mini-batch loss를 누적해서 합산
            eval_loss += loss.item()

    # Step 7. 데이터 한 개당 평균 eval_loss와 accuracy구하기
    avg_eval_loss = eval_loss / len(data_loader.dataset)
    results_pred = np.array(results_pred)
    results_real = np.array(results_real)
    accuracy = np.sum(results_pred == results_real) / len(results_real)
    
    return avg_eval_loss, accuracy

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

### vocabsize

In [18]:
vocab = set(word for sentence in tokenized_corpus for word in sentence)
VOCAB_SIZE = len(vocab)


In [19]:
VOCAB_SIZE = len(vocab)
model = MyLSTM(input_dim=input_dim, hidden_dim=50, vocab_size=VOCAB_SIZE, n_classes=10)
model = model.to(device)
my_opt = optim.Adam(lstm_model.parameters(), lr=0.001)
best_val_loss = float('inf')
loss_func = nn.CrossEntropyLoss(reduction='sum')

In [20]:
N_EPOCHS = 10
LR = 0.001
BATCH_SIZE = 2**6

##### 모델링

In [None]:
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    trn_loss = train(model=lstm_model, 
                     data_loader=trn_loader, 
                     criterion=loss_func,
                     optimizer=my_opt, 
                     device=device)
    val_loss, accuracy = evaluate(model=lstm_model, 
                                  data_loader=val_loader, 
                                  criterion=loss_func,
                                  optimizer=my_opt, 
                                  device=device)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Val Loss: {val_loss:.3f} | Val Acc: {100*accuracy:.3f}% ')