In [18]:
import pandas as pd
from gensim.models import Word2Vec
import optuna

# 데이터 불러오기
df = pd.read_csv('./tokenized_0.csv')

# 텍스트를 토큰화하여 리스트로 변환
tokenized_corpus = [str(sentence).lower().split() for sentence in df['comments'] if pd.notnull(sentence)]

def objective(trial):
    # 하이퍼파라미터 탐색할 범위 지정
    vector_size = trial.suggest_int('vector_size', 10, 300)
    window = trial.suggest_int('window', 3, 10)
    min_count = trial.suggest_int('min_count', 5, 30)
    sg = trial.suggest_categorical('sg', [0])
    # Word2Vec 모델 정의
    model = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, sg=sg)
    # 모델 학습
    # print(tokenized_corpus)
    model.build_vocab(tokenized_corpus)
    model.train(corpus_iterable=tokenized_corpus, total_examples=model.corpus_count, epochs=model.epochs, compute_loss=True)
    # 목적 함수(여기선 단순히 학습 손실값 반환) 설정
    loss = model.get_latest_training_loss()
    return loss

# Optuna를 사용하여 최적화 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# 최적의 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적의 하이퍼파라미터로 모델 재훈련
best_params = study.best_params
best_model = Word2Vec(vector_size=best_params['vector_size'], window=best_params['window'], min_count=best_params['min_count'], sg=best_params['sg'])
best_model.build_vocab(tokenized_corpus)
best_model.train(tokenized_corpus, total_examples=best_model.corpus_count, epochs=best_model.epochs)

# 튜닝된 Word2Vec 모델에서 단어 벡터 출력
word_vectors = best_model.wv
print('형태', word_vectors.vectors.shape)
# 예시: 단어 'apple'의 벡터 출력
# print("Vector for 'apple':", word_vectors['apple'])


[32m[I 2023-11-28 00:42:51,472][0m A new study created in memory with name: no-name-74a062ee-cf94-47bc-a6c6-4c52932ea1e9[0m
[32m[I 2023-11-28 00:42:51,522][0m Trial 0 finished with value: 35003.12890625 and parameters: {'vector_size': 66, 'window': 4, 'min_count': 24, 'sg': 0}. Best is trial 0 with value: 35003.12890625.[0m
[32m[I 2023-11-28 00:42:51,581][0m Trial 1 finished with value: 61320.44140625 and parameters: {'vector_size': 282, 'window': 3, 'min_count': 11, 'sg': 0}. Best is trial 0 with value: 35003.12890625.[0m
[32m[I 2023-11-28 00:42:51,625][0m Trial 2 finished with value: 38205.3984375 and parameters: {'vector_size': 164, 'window': 7, 'min_count': 19, 'sg': 0}. Best is trial 0 with value: 35003.12890625.[0m
[32m[I 2023-11-28 00:42:51,678][0m Trial 3 finished with value: 45072.6640625 and parameters: {'vector_size': 164, 'window': 9, 'min_count': 15, 'sg': 0}. Best is trial 0 with value: 35003.12890625.[0m
[32m[I 2023-11-28 00:42:51,723][0m Trial 4 finishe

Best parameters: {'vector_size': 15, 'window': 9, 'min_count': 30, 'sg': 0}
(130, 15)
