In [1]:
import pandas as pd
from gensim.models import Word2Vec



### 데이터 불러오기

In [2]:
def load_data(filepath):
    """Read a ';'-delimited emotion corpus and tokenize its sentences for gensim.

    The file is parsed without a header row as two columns, ``sentence`` and
    ``emotion``; only ``sentence`` is used.

    Args:
        filepath: Path (or any other source ``pandas.read_csv`` accepts) of the
            ``sentence;emotion`` file.

    Returns:
        list[list[str]]: One whitespace-split token list per non-missing
        sentence, in file order.
    """
    data = pd.read_csv(
        filepath,
        delimiter=';',
        header=None,
        names=['sentence', 'emotion'],
        # Force text: an all-numeric sentence column would otherwise be parsed
        # as numbers, which have no ``split`` method.
        dtype={'sentence': str},
    )
    # An empty sentence field is parsed as NaN (a float); skip such rows
    # instead of failing on them during tokenization.
    sentences = data['sentence'].dropna()

    # ``str.split()`` without arguments already ignores leading and trailing
    # whitespace, so a separate ``rstrip()`` is unnecessary.
    return [text.split() for text in sentences]

In [3]:
# Tokenized training corpus: load_data returns one list of whitespace-separated
# tokens per sentence. The relative path is resolved against the current
# working directory.
input_data = load_data("emotions_train.txt")

### 모델 학습

In [4]:
# Train the word2vec model: create it, build the vocabulary from the corpus,
# then fit the embeddings. The value returned by train() is the last
# expression of the cell, so it is shown as the cell output.
w2v_model = Word2Vec(vector_size=300, window=2)
w2v_model.build_vocab(corpus_iterable=input_data)
w2v_model.train(
    corpus_iterable=input_data,
    total_examples=w2v_model.corpus_count,
    epochs=10,
)

(1907525, 3066610)

### 유사 단어 확인

In [6]:
# Words most similar to 'happy' according to the trained word vectors.
# The recorded output is a list of (word, similarity score) pairs.
similar_happy = w2v_model.wv.most_similar('happy')
print(similar_happy)

[('excited', 0.9074899554252625), ('thrilled', 0.871713399887085), ('pleased', 0.8471595048904419), ('determined', 0.8442872762680054), ('blessed', 0.8437955975532532), ('stubborn', 0.8403089046478271), ('grateful', 0.8387118577957153), ('thankful', 0.8339088559150696), ('alone', 0.823351263999939), ('afraid', 0.8222542405128479)]


In [7]:
# Words most similar to 'sad' according to the trained word vectors.
# The recorded output is a list of (word, similarity score) pairs.
similar_sad =  w2v_model.wv.most_similar('sad')
print(similar_sad)

[('scared', 0.9419350624084473), ('depressed', 0.9271098375320435), ('hopeless', 0.9239340424537659), ('angry', 0.9194381237030029), ('lonely', 0.915341317653656), ('bitchy', 0.9121831059455872), ('worthless', 0.9103984832763672), ('disappointed', 0.9100468158721924), ('unhappy', 0.9080536365509033), ('needy', 0.9021602869033813)]


### 유사도 확인

In [8]:
# Similarity between the embedding vectors of the words 'good' and 'bad'.
# NOTE(review): the recorded score is high (about 0.82) even though the words
# are antonyms, presumably because they appear in similar contexts; confirm
# before reading this score as "similar meaning".
similar_good_bad = w2v_model.wv.similarity('good', 'bad')

print(similar_good_bad)

0.8156189


In [9]:
# Similarity between the embedding vectors of the words 'sad' and 'lonely'.
# In the recorded run this equals the score listed for 'lonely' in the
# most_similar('sad') output.
similar_sad_lonely =  w2v_model.wv.similarity('sad', 'lonely')

print(similar_sad_lonely)

0.9153413


### 임베딩 벡터 확인

In [10]:
# Look up the embedding vector learned for 'happy'
# (the model was created with vector_size=300).
wv_happy = w2v_model.wv['happy']

print(wv_happy)

[-1.34485707e-01  1.55972883e-01 -6.02693744e-02  5.39667942e-02
 -2.10517183e-01 -1.51054278e-01 -5.70126772e-02  4.50399488e-01
 -2.78545506e-02 -1.56204417e-01  1.24205515e-01 -1.48717329e-01
  4.71505336e-03 -2.10759401e-01  1.98053736e-02 -1.17002130e-01
  3.76331061e-01 -2.02878788e-01 -3.93327735e-02 -1.95689183e-02
 -5.23693487e-02  7.33617991e-02  6.20256476e-02 -7.68440366e-02
  1.45906687e-01 -9.11755413e-02 -3.06494325e-01  2.03523606e-01
 -2.34626099e-01 -2.54533052e-01  1.17501937e-01  3.79053392e-02
  1.61961406e-01 -3.71327102e-01 -2.76991338e-01  4.24374938e-01
  3.06911409e-01 -3.36622715e-01 -1.00527316e-01  5.73755912e-02
 -1.74747199e-01  1.83987379e-01 -1.47187421e-02 -4.44820523e-02
  1.64843947e-01 -1.54928789e-02  1.43704414e-01  5.96615151e-02
  4.27752957e-02  2.12745607e-01  2.31992751e-01  2.68400550e-01
 -2.10796878e-01 -7.04367831e-02 -2.03192621e-01  1.71163380e-02
  4.16711681e-02  1.04675189e-01 -1.99326575e-01 -1.35764003e-01
 -1.61593005e-01  7.29124

### 전체 코드

In [None]:
import pandas as pd
from gensim.models import Word2Vec

def load_data(filepath):
    """Read a ';'-delimited emotion corpus and tokenize its sentences for gensim.

    The file is parsed without a header row as two columns, ``sentence`` and
    ``emotion``; only ``sentence`` is used.

    Args:
        filepath: Path (or any other source ``pandas.read_csv`` accepts) of the
            ``sentence;emotion`` file.

    Returns:
        list[list[str]]: One whitespace-split token list per non-missing
        sentence, in file order.
    """
    data = pd.read_csv(
        filepath,
        delimiter=';',
        header=None,
        names=['sentence', 'emotion'],
        # Force text: an all-numeric sentence column would otherwise be parsed
        # as numbers, which have no ``split`` method.
        dtype={'sentence': str},
    )
    # An empty sentence field is parsed as NaN (a float); skip such rows
    # instead of failing on them during tokenization.
    sentences = data['sentence'].dropna()

    # ``str.split()`` without arguments already ignores leading and trailing
    # whitespace, so a separate ``rstrip()`` is unnecessary.
    return [text.split() for text in sentences]

# Tokenized training corpus: load_data returns one list of whitespace-separated
# tokens per sentence. The relative path is resolved against the current
# working directory.
input_data = load_data("emotions_train.txt")

# Train the word2vec model: create it, build the vocabulary from the corpus,
# then fit the embeddings.
w2v_model = Word2Vec(vector_size=300, window=2)
w2v_model.build_vocab(corpus_iterable=input_data)
w2v_model.train(
    corpus_iterable=input_data,
    total_examples=w2v_model.corpus_count,
    epochs=10,
)

# Words most similar to 'happy' according to the trained word vectors.
similar_happy = w2v_model.wv.most_similar('happy')

print(similar_happy)

# Words most similar to 'sad' according to the trained word vectors.
similar_sad =  w2v_model.wv.most_similar('sad')
print(similar_sad)

# Similarity between the embedding vectors of the words 'good' and 'bad'.
similar_good_bad = w2v_model.wv.similarity('good', 'bad')

print(similar_good_bad)

# Similarity between the embedding vectors of the words 'sad' and 'lonely'.
similar_sad_lonely =  w2v_model.wv.similarity('sad', 'lonely')

print(similar_sad_lonely)

# Look up the embedding vector learned for 'happy'
# (the model was created with vector_size=300).
wv_happy = w2v_model.wv['happy']

print(wv_happy)
