In [2]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [3]:
# Input directory (note the trailing slash) and the file name of the
# cleaned training CSV that is loaded from it.
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Random seed and hold-out fraction. Presumably consumed by a later
# train/test split -- verify against the cells that follow.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [4]:
# Load the cleaned training set. os.path.join is used instead of string
# concatenation so the path stays valid even if DATA_IN_PATH is later
# written without its trailing slash.
train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))

In [7]:
# Pull the text column and the label column out of the frame as plain
# Python lists, in that order.
reviews, sentiments = (
    list(train_data[column]) for column in ('review', 'sentiment')
)

In [8]:
# Tokenize every review on whitespace; word2vec expects a list of
# token lists, one inner list per review.
sentences = [text.split() for text in reviews]

In [13]:
num_features = 300 # dimensionality of the word embedding vectors
min_word_count = 40 # frequency cut-off used to exclude rare words
num_workers = 4     # number of parallel workers used for training
context = 10        # size of the context window
downsampling = 1e-3 # downsampling ratio, intended to speed up training
                    # (0.001 is said to usually give good results)

In [14]:
import logging

# Configure the root logger so that word2vec training progress is shown
# at INFO level, formatted as "<time> : <level> : <message>".
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [15]:
from gensim.models import word2vec

# Train the word2vec embedding on the tokenized reviews.
# seed=RANDOM_SEED fixes the model's random initialisation so repeated runs
# start from the same state. NOTE: per the gensim documentation a fully
# deterministic run additionally requires workers=1 (and a fixed
# PYTHONHASHSEED), so with several workers results can still vary slightly.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    seed=RANDOM_SEED,
)

2022-11-07 16:53:24,592 : INFO : collecting all words and their counts
2022-11-07 16:53:24,593 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-07 16:53:24,799 : INFO : PROGRESS: at sentence #10000, processed 1261831 words, keeping 152318 word types
2022-11-07 16:53:25,016 : INFO : PROGRESS: at sentence #20000, processed 2510155 words, keeping 237994 word types
2022-11-07 16:53:25,131 : INFO : collected 275098 word types from a corpus of 3129782 raw words and 25000 sentences
2022-11-07 16:53:25,131 : INFO : Creating a fresh vocabulary
2022-11-07 16:53:25,229 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8869 unique words (3.2239420133915915%% of original 275098, drops 266229)', 'datetime': '2022-11-07T16:53:25.229417', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-11-07 16:53:25,230 : INFO : Word2Vec lif

In [16]:
# Display the vocabulary kept by the trained model as a list of tokens,
# in index order.
model.wv.index_to_key

['movie',
 'film',
 'one',
 'like',
 'would',
 'even',
 'good',
 'really',
 'see',
 '-',
 'get',
 'much',
 'story',
 'also',
 'time',
 'first',
 'great',
 'people',
 'could',
 'make',
 'made',
 'bad',
 'think',
 'many',
 'never',
 'two',
 'little',
 'way',
 'well',
 'watch',
 'best',
 'know',
 'seen',
 'love',
 'characters',
 'character',
 'movies',
 'ever',
 'still',
 'it.',
 'movie.',
 'films',
 'plot',
 'acting',
 'show',
 '"i',
 'go',
 'better',
 'say',
 'something',
 'makes',
 'film.',
 'watching',
 'back',
 'scene',
 'film,',
 'real',
 'find',
 'new',
 'movie,',
 "i'm",
 'actually',
 'scenes',
 'every',
 'life',
 'man',
 'going',
 'nothing',
 'look',
 'quite',
 'another',
 'lot',
 'old',
 '"this',
 'want',
 'end',
 'pretty',
 'thing',
 'seems',
 'got',
 '&',
 "can't",
 'take',
 'years',
 'part',
 'give',
 'actors',
 'young',
 'may',
 'us',
 "that's",
 'without',
 'things',
 'gets',
 'though',
 'big',
 'around',
 'thought',
 'almost',
 'it,',
 'director',
 'always',
 'saw',
 "i've

In [24]:
def get_features(words, model, num_features):
    """Average the word2vec vectors of one review's in-vocabulary tokens.

    Args:
        words: Iterable of tokens that make up a single review.
        model: Trained word2vec model. Only ``model.wv.index_to_key`` and
            ``model.wv[token]`` are used.
        num_features: Dimensionality of the embedding vectors, i.e. the
            vector size chosen when the model was trained.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all tokens found in the model's vocabulary. If no token
        is in the vocabulary (or ``words`` is empty) a float32 zero vector
        is returned instead of an all-NaN vector.
    """
    # Running sum of the matched word vectors.
    feature_vector = np.zeros((num_features,), dtype=np.float32)
    num_words = 0
    # A set gives constant-time membership checks against the vocabulary.
    index_to_key_set = set(model.wv.index_to_key)
    for w in words:
        if w in index_to_key_set:
            num_words += 1
            # Accumulate the vector of every token known to the model.
            feature_vector = np.add(feature_vector, model.wv[w])
    # Without this guard a review with no known token would be divided by
    # zero, producing NaNs (and a RuntimeWarning) that poison later steps.
    if num_words == 0:
        return feature_vector
    return np.divide(feature_vector, num_words)

In [25]:
def get_dataset(reviews, model, num_features, verbose=True):
    """Build the feature matrix for a collection of tokenized reviews.

    Args:
        reviews: Iterable of reviews, each one an iterable of tokens.
        model: Trained word2vec model, forwarded to ``get_features``.
        num_features: Dimensionality of the embedding vectors.
        verbose: When true (the default, matching the previous behaviour)
            print the per-review vectors, a separator line and the stacked
            matrix. Pass ``False`` for large inputs to avoid flooding the
            output with the whole dataset.

    Returns:
        A 2-D array with one row per review, each row being the vector
        returned by ``get_features``. For an empty ``reviews`` an array of
        shape ``(0, num_features)`` is returned.
    """
    dataset = [get_features(s, model, num_features) for s in reviews]
    if dataset:
        reviewFeatureVecs = np.stack(dataset)
    else:
        # np.stack raises ValueError on an empty list; return an empty
        # matrix with the right number of columns instead.
        reviewFeatureVecs = np.zeros((0, num_features), dtype=np.float32)
    if verbose:
        print(dataset)
        print("------------")
        print(reviewFeatureVecs)
    return reviewFeatureVecs

In [26]:
# Try the feature pipeline on the first tokenized review only; get_dataset
# prints the intermediate list and the stacked matrix.
test_data_vecs = get_dataset(sentences[:1], model, num_features)

[array([-8.82773697e-02,  2.17417136e-01, -2.59579830e-02,  7.32667670e-02,
       -2.66081616e-02, -1.97184503e-01, -8.88579339e-02,  1.01570457e-01,
       -1.34318173e-01, -1.71068937e-01, -4.68272902e-02, -9.62430090e-02,
       -7.96676129e-02, -3.98080200e-02,  2.81828456e-02, -1.81281924e-01,
        1.41212195e-01, -1.10254712e-01, -1.18992127e-01, -1.17746003e-01,
       -4.84001637e-02, -1.81146991e-02,  2.16895565e-01,  1.69422999e-02,
        1.63398981e-01, -1.32727429e-01, -3.87945145e-01, -6.05251193e-02,
       -5.07661179e-02, -9.18957964e-02,  1.76544562e-01, -1.63193360e-01,
        9.65613723e-02, -8.78242105e-02, -8.45573470e-02, -9.41260234e-02,
       -5.30423075e-02, -5.73751442e-02,  1.96397826e-02, -2.60585807e-02,
       -1.30722120e-01, -6.66299090e-02,  7.61836544e-02, -6.61454275e-02,
        3.68529744e-02,  7.01156706e-02,  1.62113711e-01,  4.52426895e-02,
        1.78937480e-01,  1.88850969e-01,  1.74959809e-01, -3.80175449e-02,
       -1.05224028e-01, 