In [12]:
import pandas as pd
from gensim.models import Word2Vec
import optuna
from sklearn.model_selection import train_test_split

class Embedding:
    """Train Word2Vec word vectors on a CSV text column, tuned with Optuna.

    Attributes:
        df: The DataFrame loaded from the CSV file.
        corpus: One token list per non-null row of the selected column,
            lower-cased and split on whitespace.
    """

    def __init__(self, corpus, column="comments"):
        """Load the CSV file and tokenize the text column.

        Args:
            corpus: Path (or any source accepted by ``pd.read_csv``) of the
                CSV file to load.
            column: Name of the column holding the text. Defaults to
                ``"comments"``.

        Raises:
            KeyError: If ``column`` is not present in the CSV file.
        """
        self.df = pd.read_csv(corpus)
        # Convert every non-null entry into a list of lower-cased tokens.
        self.corpus = [
            str(sentence).lower().split()
            for sentence in self.df[column]
            if pd.notnull(sentence)
        ]

    # TODO: either split the corpus into train/test/validation before
    # embedding, or embed first and split afterwards -> make this selectable
    # through an argparser later.
    def get_split_data(self, option):
        """Return the corpus as a ``(train, test, validation)`` tuple.

        Args:
            option: ``0`` keeps the whole corpus as the training set and
                returns ``None`` for the other two parts. Any other value
                splits the corpus with a fixed ``random_state`` of 42.

        Returns:
            A tuple ``(train_corpus, test_corpus, validation_corpus)``.
        """
        if option == 0:
            # Option 0: no split, the full corpus is used for training.
            return self.corpus, None, None

        # Otherwise: half of the corpus is kept for training; the remaining
        # half is divided 60/40 into test and validation (roughly 50/30/20
        # of the whole corpus).
        train_corpus, temp_corpus = train_test_split(
            self.corpus, test_size=0.5, random_state=42
        )
        test_corpus, validation_corpus = train_test_split(
            temp_corpus, test_size=0.4, random_state=42
        )
        return train_corpus, test_corpus, validation_corpus

    def _fit_word2vec(self, vector_size, window, min_count, sg, compute_loss=False):
        """Build the vocabulary from ``self.corpus`` and train a Word2Vec model."""
        model = Word2Vec(
            vector_size=vector_size, window=window, min_count=min_count, sg=sg
        )
        model.build_vocab(self.corpus)
        model.train(
            corpus_iterable=self.corpus,
            total_examples=model.corpus_count,
            epochs=model.epochs,
            compute_loss=compute_loss,
        )
        return model

    def objective(self, trial):
        """Optuna objective: train one model and return its training loss.

        Args:
            trial: The Optuna trial used to sample the hyperparameters.

        Returns:
            The value reported by ``get_latest_training_loss()`` after
            training on the full corpus.
        """
        # Search space for the hyperparameters.
        vector_size = trial.suggest_int("vector_size", 10, 100)
        window = trial.suggest_int("window", 3, 10)
        min_count = trial.suggest_int("min_count", 5, 30)
        sg = trial.suggest_categorical("sg", [0])

        model = self._fit_word2vec(
            vector_size, window, min_count, sg, compute_loss=True
        )
        # The objective is simply the training loss.
        # NOTE(review): the raw training loss is presumably not comparable
        # across min_count values (a smaller vocabulary means fewer training
        # examples) -- confirm this is the intended objective.
        return model.get_latest_training_loss()

    def get_embedding_vector(self, n_trials=100):
        """Tune the hyperparameters, retrain with the best ones, return the vectors.

        Args:
            n_trials: Number of Optuna trials to run. Defaults to 100.

        Returns:
            The ``wv`` attribute (word vectors) of the retrained model.
        """
        # Run the hyperparameter search, minimizing the objective.
        study = optuna.create_study(direction="minimize")
        study.optimize(self.objective, n_trials=n_trials)

        best_params = study.best_params
        print("Best parameters:", best_params)

        # Retrain on the full corpus with the best hyperparameters.
        best_model = self._fit_word2vec(
            vector_size=best_params["vector_size"],
            window=best_params["window"],
            min_count=best_params["min_count"],
            sg=best_params["sg"],
        )

        # Report the shape of the tuned embedding matrix.
        word_vectors = best_model.wv
        print("형태: ", word_vectors.vectors.shape)
        return word_vectors


if __name__ == "__main__":
    # Select the embedding option: 0 keeps the whole corpus for training,
    # any other value splits it into train/test/validation.
    
    embedding_option = 1
    embedding = Embedding("../tokenized_0.csv")
    # The split is computed here, but get_embedding_vector() below trains on
    # the full corpus stored on the instance, not on these splits.
    train_data, test_data, validation_data = embedding.get_split_data(embedding_option)
    # Run the Optuna search and retrain with the best hyperparameters.
    word_vectors = embedding.get_embedding_vector()


[I 2023-11-28 20:33:59,147] A new study created in memory with name: no-name-bcffabbb-33ee-4e19-8285-534005ddf841


[I 2023-11-28 20:33:59,243] Trial 0 finished with value: 65818.015625 and parameters: {'vector_size': 73, 'window': 5, 'min_count': 8, 'sg': 0}. Best is trial 0 with value: 65818.015625.
[I 2023-11-28 20:33:59,336] Trial 1 finished with value: 81003.75 and parameters: {'vector_size': 37, 'window': 8, 'min_count': 5, 'sg': 0}. Best is trial 0 with value: 65818.015625.
[I 2023-11-28 20:33:59,401] Trial 2 finished with value: 31747.177734375 and parameters: {'vector_size': 82, 'window': 8, 'min_count': 26, 'sg': 0}. Best is trial 2 with value: 31747.177734375.
[I 2023-11-28 20:33:59,481] Trial 3 finished with value: 36825.74609375 and parameters: {'vector_size': 89, 'window': 9, 'min_count': 19, 'sg': 0}. Best is trial 2 with value: 31747.177734375.
[I 2023-11-28 20:33:59,582] Trial 4 finished with value: 28909.48046875 and parameters: {'vector_size': 80, 'window': 9, 'min_count': 29, 'sg': 0}. Best is trial 4 with value: 28909.48046875.
[I 2023-11-28 20:33:59,713] Trial 5 finished with v

Best parameters: {'vector_size': 10, 'window': 9, 'min_count': 30, 'sg': 0}
형태:  (130, 10)


In [None]:
 def objective(self, trial):
        """Optuna objective: train one Word2Vec model and return its training loss.

        Hyperparameters are sampled from ``trial``; the model is trained on
        ``self.corpus`` with loss tracking enabled.
        """
        # Sample the hyperparameters (same names and ranges for every trial).
        sampled = {
            "vector_size": trial.suggest_int("vector_size", 10, 100),
            "window": trial.suggest_int("window", 3, 10),
            "min_count": trial.suggest_int("min_count", 5, 30),
            "sg": trial.suggest_categorical("sg", [0]),
        }

        # Define the model, build its vocabulary, then train it.
        w2v = Word2Vec(**sampled)
        w2v.build_vocab(self.corpus)
        w2v.train(
            corpus_iterable=self.corpus,
            total_examples=w2v.corpus_count,
            epochs=w2v.epochs,
            compute_loss=True,
        )

        # The objective value is simply the training loss.
        return w2v.get_latest_training_loss()

In [None]:
    def get_embedding_vector(self):
        """Search hyperparameters with Optuna, retrain, and return the word vectors.

        Runs 100 trials minimizing ``self.objective``, prints the best
        parameters, retrains a Word2Vec model on ``self.corpus`` with them,
        prints the shape of the resulting vector matrix and returns the
        model's ``wv`` attribute.
        """
        # Run the optimization.
        search = optuna.create_study(direction="minimize")
        search.optimize(self.objective, n_trials=100)

        # Report the best hyperparameters found.
        tuned = search.best_params
        print("Best parameters:", tuned)

        # Retrain a fresh model with the tuned hyperparameters.
        wanted = ("vector_size", "window", "min_count", "sg")
        final_model = Word2Vec(**{key: tuned[key] for key in wanted})
        final_model.build_vocab(self.corpus)
        final_model.train(
            self.corpus,
            total_examples=final_model.corpus_count,
            epochs=final_model.epochs,
        )

        # Expose the word vectors of the tuned model.
        vectors = final_model.wv
        print("형태: ", vectors.vectors.shape)
        return vectors


if __name__ == "__main__":
    # Load the tokenized CSV, then tune and train the embeddings on it.
    embedding = Embedding("./tokenized_0.csv")
    word_vectors = embedding.get_embedding_vector()