In [26]:
import pandas as pd
import numpy as np
import os

In [2]:
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [3]:
sentences = []

for review in reviews : 
    sentences.append(review.split())

In [4]:
# 하이퍼파라미터
num_features = 300 # 워드 벡터 특징값 수
min_word_count = 40 # 단어에 대한 최소 빈도 수
num_workers = 4 # 프로세스 개수
context = 10 # 컨텍스트 윈도 크기
downsampling = 1e-3 # 다운 샘플링 비율

In [5]:
import logging

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
from gensim.models import word2vec

In [8]:
print('Training model...')
model = word2vec.Word2Vec(sentences, 
                         workers=num_workers,
                         vector_size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling)

2023-01-11 13:57:14,052 : INFO : collecting all words and their counts
2023-01-11 13:57:14,076 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2023-01-11 13:57:15,292 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2023-01-11 13:57:16,312 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2023-01-11 13:57:16,664 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2023-01-11 13:57:16,667 : INFO : Creating a fresh vocabulary
2023-01-11 13:57:16,822 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8160 unique words (11.02% of original 74065, drops 65905)', 'datetime': '2023-01-11T13:57:16.822635', 'gensim': '4.3.0', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-01-11 13:57:16,824 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 leaves 2627273 word corpus (87.92% of original 2988089, drops 360816)', 'datetime': '2023-01-11T13:57:16.824638', 'gensim': '4.3.0', 'python':

In [9]:
model_name = '300features_40minwords_10context'
model.save(model_name)

2023-01-11 13:57:46,896 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-01-11T13:57:46.894579', 'gensim': '4.3.0', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-01-11 13:57:46,900 : INFO : not storing attribute cum_table
2023-01-11 13:57:46,962 : INFO : saved 300features_40minwords_10context


In [13]:
# word2vec는 하나의 리뷰를 같은 형태의 입력값으로 만들어야한다.
# 리뷰마다 단어의 개수가 다르기 때문에 입력값을 하나의 형태로 만들어야 한다.
# 여기서는 문장에 있는 모든 단어의 벡터값에 대해 평균을 내어 리뷰 하나당 하나의 벡터로 만드는 방법을 사용한다.
def get_features(words, model, num_features) :
    # 출력 벡터 초기화
    feature_vector = np.zeros((num_features), dtype=np.float32)
    
    num_words = 0
    # 어휘사전 준비
    index2word_set = set(model.wv.index_to_key)
    
    for w in words :
        if w in index2word_set :
            num_words += 1
            #사전에 해당하는 단어에 대해 단어 벡터를 더함
            feature_vector = np.add(feature_vector, model.wv.get_index(w))
    
    # 문장의 단어 수만큼 나누어 단어 벡터의 평균값을 문장 벡터로 함
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [11]:
def get_dataset(reviews, model, num_features):
    dataset = []
    
    for s in reviews :
        dataset.append(get_features(s, model, num_features))
        
    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

In [16]:
test_data_vecs = get_dataset(sentences, model, num_features)

In [18]:
from sklearn.model_selection import train_test_split

X = test_data_vecs
y = np.array(sentiments)

RANDOM_SEED = 42
TEST_SPLIT = 0.2

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [19]:
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [20]:
print('Accuracy: %f'%lgs.score(X_eval, y_eval))

Accuracy: 0.503800


In [21]:
TEST_CLEAN_DATA = 'test_clean.csv'

test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)

test_review = list(test_data['review'])

In [22]:
test_data.head(5)

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""


In [23]:
test_sentences = list()
for review in test_review:
    test_sentences.append(review.split())

In [24]:
test_data_vecs = get_dataset(test_sentences, model, num_features)

In [27]:
DATA_OUT_PATH = './data_out/'

test_predicted = lgs.predict(test_data_vecs)

if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)
    
ids = list(test_data['id'])
answer_dataset = pd.DataFrame({'id': ids, 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)

In [28]:
model_name = "300features_40minwords_10context"
model.save(model_name)

2023-01-11 15:26:19,018 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-01-11T15:26:19.018795', 'gensim': '4.3.0', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-01-11 15:26:19,022 : INFO : not storing attribute cum_table
2023-01-11 15:26:19,082 : INFO : saved 300features_40minwords_10context
