## TF-IDF Feature

In [36]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [37]:
# Input directory prefix; data file names are appended to it when reading.
DATA_IN_PATH = './data_in/'
# Output directory prefix for files written by this notebook.
DATA_OUT_PATH = './data_out/'
# File name of the training CSV, located under DATA_IN_PATH.
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Seed passed to train_test_split so the train/eval split is reproducible.
RANDOM_SEED = 42
# Fraction of the samples held out for evaluation (used as test_size).
TEST_SPLIT = 0.2

In [38]:
# Read the training CSV from the input directory into a DataFrame.
train_data_path = os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA)
train_data = pd.read_csv(train_data_path)

In [39]:
# Pull the text column and the label column out of the frame as plain lists.
reviews, sentiments = (list(train_data[column]) for column in ('review', 'sentiment'))

In [40]:
# Turn each review into a TF-IDF weighted vector of character n-grams.
#   min_df       : terms whose document frequency falls below this threshold are dropped
#                  (0.0 keeps every term)
#   analyzer     : unit the n-grams are built from ("word" or "char")
#   sublinear_tf : apply sublinear term-frequency scaling, i.e. replace tf with 1 + log(tf)
#   ngram_range  : (min_n, max_n) n-gram sizes to extract; with analyzer="char" this is
#                  1 to 3 characters
#   max_features : upper bound on the vocabulary size (length of each vector)
# NOTE(review): the vectorizer is fitted on all reviews before the train/eval split done
# later in the notebook, so the IDF statistics also include the evaluation documents.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.0,
    sublinear_tf=True,
    max_features=5000,
)

X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [41]:
# Bare expression: the notebook echoes the sparse matrix summary (shape, dtype, stored elements).
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 20163057 stored elements in Compressed Sparse Row format>

In [42]:
# Vocabulary learned by the vectorizer, in column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method on
# older installations, and convert to a plain list so `features` has the same
# type (list of str) on every version.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
features



[' ',
 ' !',
 ' ! ',
 ' $',
 ' &',
 ' & ',
 " '",
 " 'c",
 " 's",
 " 't",
 ' (',
 ' (1',
 ' (\\',
 ' (a',
 ' (b',
 ' (c',
 ' (d',
 ' (e',
 ' (f',
 ' (g',
 ' (h',
 ' (i',
 ' (j',
 ' (l',
 ' (m',
 ' (n',
 ' (o',
 ' (p',
 ' (r',
 ' (s',
 ' (t',
 ' (w',
 ' *',
 ' **',
 ' ,',
 ' , ',
 ' -',
 ' - ',
 ' --',
 ' .',
 ' . ',
 ' ..',
 ' 1',
 ' 1 ',
 ' 10',
 ' 15',
 ' 19',
 ' 2',
 ' 2 ',
 ' 20',
 ' 3',
 ' 3 ',
 ' 30',
 ' 4',
 ' 4 ',
 ' 40',
 ' 5',
 ' 5 ',
 ' 50',
 ' 6',
 ' 7',
 ' 70',
 ' 8',
 ' 80',
 ' 9',
 ' 90',
 ' :',
 ' \\',
 ' \\"',
 ' `',
 ' a',
 ' ab',
 ' ac',
 ' ad',
 ' af',
 ' ag',
 ' ah',
 ' ai',
 ' al',
 ' am',
 ' an',
 ' ap',
 ' ar',
 ' as',
 ' at',
 ' au',
 ' av',
 ' aw',
 ' b',
 ' ba',
 ' be',
 ' bi',
 ' bl',
 ' bo',
 ' br',
 ' bu',
 ' by',
 ' c',
 ' ca',
 ' ce',
 ' ch',
 ' ci',
 ' cl',
 ' co',
 ' cr',
 ' cu',
 ' cy',
 ' d',
 ' da',
 ' de',
 ' di',
 ' do',
 ' dr',
 ' du',
 ' dv',
 ' dy',
 ' e',
 ' ea',
 ' ec',
 ' ed',
 ' ef',
 ' eg',
 ' ei',
 ' el',
 ' em',
 ' en',
 ' ep',
 ' eq',
 

In [43]:
# Hold out a TEST_SPLIT fraction of the samples for evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [44]:
# class_weight='balanced' weights each class inversely to its frequency in the training labels.
lgs = LogisticRegression(class_weight='balanced')
# Kept as the last expression of the cell: fit() returns the estimator, which the notebook echoes.
lgs.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced')

In [45]:
# Predicted labels for the held-out evaluation set.
# NOTE(review): `predicted` is not read by any cell visible in this notebook — confirm it is still needed.
predicted = lgs.predict(X_eval)

In [46]:
# Mean accuracy of the fitted classifier on the held-out evaluation split.
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.862800


In [48]:
# File name of the test CSV, located under DATA_IN_PATH.
TEST_CLEAN_DATA = 'test_clean.csv'

# Read the test set; this raises FileNotFoundError when the file is not present.
test_data_path = os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA)
test_data = pd.read_csv(test_data_path)

FileNotFoundError: [Errno 2] No such file or directory: './data_in/test_clean.csv'

In [None]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit),
# so the columns line up with the features the classifier was trained on.
testDataVecs = vectorizer.transform(test_data['review'])

In [None]:
# Predict a sentiment label for every vectorized test review and print the resulting array.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

In [None]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# os.path.exists() check, which could race with another process creating the directory.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test review: its id and the predicted sentiment label.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})

# quoting=3 is csv.QUOTE_NONE: fields are written without any added quote characters.
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)