In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv('data_in/train_clean.csv')
reviews = list(train_data['review'])
y = np.array(train_data['sentiment'])

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', max_features=5000)

train_data_features = vectorizer.fit_transform(reviews)

In [3]:
train_data_features # (25000, 5000) 크기의 행렬값이 됨 25000개의 데이터가 5000개의 특징을 가짐

<25000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1975124 stored elements in Compressed Sparse Row format>

# 학습데이터를 검증 데이터와 분리

In [4]:
from sklearn.model_selection import train_test_split
test_size = 0.2
random_seed = 42

train_input, eval_input, train_label, eval_label = train_test_split(train_data_features, y, test_size=test_size, random_state=random_seed)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 분류기에 100개의 의사결정 트리를 사용한다.
forest = RandomForestClassifier(n_estimators=100)

# 단어 묶음을 벡터화한 데이터와 정답 데이터를 가지고 학습을 시작한다.
forest.fit(train_input, train_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [6]:
print('Accuracy: {0:.1%}'.format(forest.score(eval_input, eval_label)))

Accuracy: 84.3%


# test data 제출하기

In [20]:
test_data = pd.read_csv('data_in/test_clean.csv')

In [25]:
test_reviews = list(test_data['review'])
ids = list(test_data['id'])

In [26]:
test_data_features = vectorizer.transform(test_reviews)

In [27]:
result = forest.predict(test_data_features)
output = pd.DataFrame(data={"id": ids, "sentiment": result})
output.to_csv('data_out/bag_of_words_model.csv', index=False, quoting=3)