### Text Data Loading

In [3]:
from sklearn.datasets import load_files

# Load only the labelled sentiment folders. load_files treats every
# subdirectory as a class, and the stock aclImdb archive also ships an
# unlabelled "unsup" folder under train/; filtering by category keeps this a
# two-class problem even if that folder has not been deleted by hand.
reviews_train = load_files("./data/aclImdb/train", categories=["neg", "pos"])

# .data holds the raw review contents, .target the integer index of the
# folder (class) each review was read from.
text_train, y_train = reviews_train.data, reviews_train.target

# Inspect the container type, the number of reviews, and one sample with its label.
print(type(reviews_train))
print("text_train의 타입: {}".format(type(text_train)))
print("text_train의 길이: {}".format(len(text_train)))
print("text_train[6]: \n{}".format(text_train[6]))
print("y_train[6]: \n{}".format(y_train[6]))

<class 'sklearn.utils.Bunch'>
text_train의 타입: <class 'list'>
text_train의 길이: 25000
text_train[6]: 
b"This movie has a special way of telling the story, at first i found it rather odd as it jumped through time and I had no idea whats happening.<br /><br />Anyway the story line was although simple, but still very real and touching. You met someone the first time, you fell in love completely, but broke up at last and promoted a deadly agony. Who hasn't go through this? but we will never forget this kind of pain in our life. <br /><br />I would say i am rather touched as two actor has shown great performance in showing the love between the characters. I just wish that the story could be a happy ending."
y_train[6]: 
1


#### text data 중에서 HTML 줄바꿈 태그(`<br />`) 삭제

In [4]:
# Show one raw review and its type before any cleaning.
sample_index = 5
print(text_train[sample_index], "\n")
print(type(text_train[sample_index]), "\n")

# The reviews are bytes objects, so the pattern and its replacement are bytes
# literals as well. Each HTML line-break tag becomes a single space.
cleaned_reviews = []
for review in text_train:
    cleaned_reviews.append(review.replace(b"<br />", b" "))
text_train = cleaned_reviews

# Same review again, now without the tags.
print(text_train[sample_index])

b"The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.<br /><br />Stargate SG1 is currently one of my favorite programs." 

<class 'bytes'> 

b"The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.  Stargate SG1 is currently one of my favorite programs."


#### class 별 샘플 수 확인

In [5]:
import numpy as np

# np.bincount tallies how many times each integer label occurs,
# giving one count per class index.
class_counts = np.bincount(y_train)
print("클래스별 샘플 수 (훈련 데이터): {}".format(class_counts))

클래스별 샘플 수 (훈련 데이터): [12500 12500]


#### full code

In [6]:
def _strip_br_tags(docs):
    """Return a new list of the bytes documents with each ``<br />`` tag replaced by a space."""
    return [doc.replace(b"<br />", b" ") for doc in docs]


# Load the data. load_files treats every subdirectory as a class, so restrict
# it to the labelled folders: the stock aclImdb archive also ships an
# unlabelled "unsup" folder under train/ that would otherwise become a third class.
reviews_train = load_files("./data/aclImdb/train", categories=["neg", "pos"])
reviews_test = load_files("./data/aclImdb/test", categories=["neg", "pos"])

# .data is the review text (bytes), .target is the index of the source folder.
text_train, y_train = reviews_train.data, reviews_train.target
text_test, y_test = reviews_test.data, reviews_test.target

# Remove the HTML line-break tags (these are markup, not newline characters).
text_train = _strip_br_tags(text_train)
text_test = _strip_br_tags(text_test)

# Sanity check: document counts and class balance for both splits.
print("학습 데이터의 문서 수: {}".format(len(text_train)))
print("테스트 데이터의 문서 수: {}".format(len(text_test)))
print("클래스별 샘플 수 (학습 데이터): {}".format(np.bincount(y_train)))
print("클래스별 샘플 수 (테스트 데이터): {}".format(np.bincount(y_test)))

학습 데이터의 문서 수: 25000
테스트 데이터의 문서 수: 25000
클래스별 샘플 수 (학습 데이터): [12500 12500]
클래스별 샘플 수 (테스트 데이터): [12500 12500]


#### 어휘사전 생성 및 확인
> CountVectorizer() : 문서 집합으로부터 단어의 수를 세어 카운트 행렬을 생성

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

# Create the vectorizer and learn the vocabulary in one expression:
# fit() tokenizes the strings, builds the vocabulary, and returns the
# fitted vectorizer.
vect = CountVectorizer().fit(bards_words)

# The learned vocabulary (token -> column index) lives in vocabulary_.
vocabulary = vect.vocabulary_
print("어휘 사전의 크기: {}".format(len(vocabulary)))
print("어휘 사전의 내용: \n{}".format(vocabulary))

어휘 사전의 크기: 13
어휘 사전의 내용: 
{'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


#### BOW 생성

In [16]:
# Fit a dedicated vectorizer on the toy sentences instead of reading the
# shared `vect` name: `vect` is rebound to a vectorizer fitted on the movie
# reviews elsewhere in this notebook, and running this cell after that would
# silently encode the two sentences against the wrong vocabulary.
bards_vect = CountVectorizer().fit(bards_words)
bag_of_words = bards_vect.transform(bards_words)

# The result is a sparse matrix (most entries are 0).
# repr() yields the matrix summary (shape, dtype, number of stored elements)
# rather than its contents.
print("BOW: {}".format(repr(bag_of_words)), "\n")
# Printing the sparse matrix itself lists only the non-zero (row, column) entries.
print(bag_of_words, "\n")

# Convert to a dense NumPy array to see the full count vector of each document.
print("BOW의 밀집 표현: \n{}".format(bag_of_words.toarray()))

BOW: <2x13 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format> 

  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (0, 9)	1
  (0, 10)	1
  (0, 12)	1
  (1, 0)	1
  (1, 1)	1
  (1, 3)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 11)	1
  (1, 12)	1 

BOW의 밀집 표현: 
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


#### 영화 리뷰에 대한 BOW

In [17]:
# Build the bag-of-words representation of the training reviews:
# learn the vocabulary from the corpus, then encode the same corpus with it.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)

# repr() shows the sparse-matrix summary instead of its contents.
print("X_train:\n{}".format(repr(X_train)))

X_train:
<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [18]:
# Collect the vocabulary terms (features) as a list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vect, "get_feature_names_out"):
    # The new API returns a NumPy array; .tolist() converts it to a plain list
    # of str so the printed slice looks the same as with the old API.
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print("특성 개수: {}".format(len(feature_names)))
print("처음 20개 특성:\n{}".format(feature_names[:20]))

특성 개수: 74849
처음 20개 특성:
['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']
