In [2]:
import optuna
from gensim.models import Word2Vec
import pandas as pd

In [3]:
import pandas as pd
from gensim.models import Word2Vec
import optuna

# 데이터 불러오기
df = pd.read_csv('./tokenized_0.csv')

# 텍스트를 토큰화하여 리스트로 변환
tokenized_corpus = [str(sentence).lower().split() for sentence in df['comments'] if pd.notnull(sentence)]

def objective(trial):
    # 하이퍼파라미터 탐색할 범위 지정
    size = trial.suggest_categorical('size', [50, 100])
    window = trial.suggest_categorical('window', [3, 5])
    min_count = trial.suggest_categorical('min_count', [1, 2])
    sg = trial.suggest_categorical('sg', [0, 1])

    # Word2Vec 모델 정의
    model = Word2Vec(size=size, window=window, min_count=min_count, sg=sg)

    # 모델 학습
    model.build_vocab(tokenized_corpus)
    model.train(tokenized_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # 목적 함수(여기선 단순히 학습 손실값 반환) 설정
    loss = model.get_latest_training_loss()

    return loss

# Optuna를 사용하여 최적화 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# 최적의 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적의 하이퍼파라미터로 모델 재훈련
best_params = study.best_params
best_model = Word2Vec(size=best_params['size'], window=best_params['window'], min_count=best_params['min_count'], sg=best_params['sg'])
best_model.build_vocab(tokenized_corpus)
best_model.train(tokenized_corpus, total_examples=best_model.corpus_count, epochs=best_model.epochs)

# 튜닝된 Word2Vec 모델에서 단어 벡터 출력
word_vectors = best_model.wv
# 예시: 단어 'apple'의 벡터 출력
print("Vector for 'apple':", word_vectors['apple'])


[I 2023-11-23 22:13:36,168] A new study created in memory with name: no-name-3f4a69a8-f7b3-4427-ac2a-8fc2a98ad39e


[I 2023-11-23 22:13:36,271] Trial 0 finished with value: 0.0 and parameters: {'vector_size': 100, 'window': 3, 'min_count': 2, 'sg': 0}. Best is trial 0 with value: 0.0.
[I 2023-11-23 22:13:36,421] Trial 1 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 5, 'min_count': 1, 'sg': 0}. Best is trial 0 with value: 0.0.
[I 2023-11-23 22:13:36,555] Trial 2 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 3, 'min_count': 1, 'sg': 0}. Best is trial 0 with value: 0.0.
[I 2023-11-23 22:13:36,698] Trial 3 finished with value: 0.0 and parameters: {'vector_size': 100, 'window': 5, 'min_count': 1, 'sg': 0}. Best is trial 0 with value: 0.0.
[I 2023-11-23 22:13:36,845] Trial 4 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 3, 'min_count': 1, 'sg': 0}. Best is trial 0 with value: 0.0.
[I 2023-11-23 22:13:36,992] Trial 5 finished with value: 0.0 and parameters: {'vector_size': 50, 'window': 3, 'min_count': 1, 'sg': 0}. Best is trial 0 w

Best parameters: {'vector_size': 100, 'window': 3, 'min_count': 2, 'sg': 0}
