### Text Data Loading

In [2]:
from sklearn.datasets import load_files

reviews_train = load_files("./data/aclImdb/train")

text_train, y_train = reviews_train.data, reviews_train.target

print(type(reviews_train))
print("text_train의 타입: {}".format(type(text_train)))
print("text_train의 길이: {}".format(len(text_train)))
print("text_train[6]: \n{}".format(text_train[6]))
print("y_train[6]: \n{}".format(y_train[6]))

<class 'sklearn.utils.Bunch'>
text_train의 타입: <class 'list'>
text_train의 길이: 25000
text_train[6]: 
b"This movie has a special way of telling the story, at first i found it rather odd as it jumped through time and I had no idea whats happening.<br /><br />Anyway the story line was although simple, but still very real and touching. You met someone the first time, you fell in love completely, but broke up at last and promoted a deadly agony. Who hasn't go through this? but we will never forget this kind of pain in our life. <br /><br />I would say i am rather touched as two actor has shown great performance in showing the love between the characters. I just wish that the story could be a happy ending."
y_train[6]: 
1


#### text data 중에서 개행문자 삭제

In [3]:
print(text_train[5], "\n")
print(type(text_train[5]), "\n")

text_train = [doc.replace(b"<br />", b" ") for doc in text_train]
print(text_train[5])

b"The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.<br /><br />Stargate SG1 is currently one of my favorite programs." 

<class 'bytes'> 

b"The Movie was sub-par, but this Television Pilot delivers a great springboard into what has become a Sci-Fi fans Ideal program. The Actors deliver and the special effects (for a television series) are spectacular. Having an intelligent interesting script doesn't hurt either.  Stargate SG1 is currently one of my favorite programs."


#### class 별 샘플 수 확인

In [4]:
import numpy as np
print("클래스별 샘플 수 (훈련 데이터): {}".format(np.bincount(y_train)))

클래스별 샘플 수 (훈련 데이터): [12500 12500]


#### full code

In [5]:
# 데이터 로드
reviews_train = load_files("./data/aclImdb/train")
reviews_test = load_files("./data/aclImdb/test")

# data는 text파일, target은 폴더 번호
text_train, y_train = reviews_train.data, reviews_train.target
text_test, y_test = reviews_test.data, reviews_test.target

# data에서 개행문자 삭제
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

print("학습 데이터의 문서 수: {}".format(len(text_train)))
print("테스트 데이터의 문서 수: {}".format(len(text_test)))
print("클래스별 샘플 수 (학습 데이터): {}".format(np.bincount(y_train)))
print("클래스별 샘플 수 (테스트 데이터): {}".format(np.bincount(y_test)))

학습 데이터의 문서 수: 25000
테스트 데이터의 문서 수: 25000
클래스별 샘플 수 (학습 데이터): [12500 12500]
클래스별 샘플 수 (테스트 데이터): [12500 12500]


#### 어휘사전 생성 및 확인
> CountVactorizer() : 문서 집합으로부터 단어의 수를 세어 카운트 행렬을 생성

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

bards_words = ["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

# CountVectorizer 객체 생성
vect = CountVectorizer()

# 문자열 리스트를 토큰으로 분리하고, 어휘사전 구축
vect.fit(bards_words)

# 어휘사전은 vocabulary_ 속성에 저장됨
print("어휘 사전의 크기: {}".format(len(vect.vocabulary_)))
print("어휘 사전의 내용: \n{}".format(vect.vocabulary_))

어휘 사전의 크기: 13
어휘 사전의 내용: 
{'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


#### BOW 생성

In [7]:
bag_of_words = vect.transform(bards_words)

# 희소행렬로 표현(0이 많기 때문)
# repr : 숫자 -> 문자열
print("BOW: {}".format(repr(bag_of_words)), "\n")
print(bag_of_words, "\n")

# numpy로 변환해야 볼 수 있음. 각 document의 bow 표현
print("BOW의 밀집 표현: \n{}".format(bag_of_words.toarray()))

BOW: <2x13 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format> 

  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (0, 9)	1
  (0, 10)	1
  (0, 12)	1
  (1, 0)	1
  (1, 1)	1
  (1, 3)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 11)	1
  (1, 12)	1 

BOW의 밀집 표현: 
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


#### 영화 리뷰에 대한 BOW

In [8]:
# BOW 생성
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [9]:
# get_feature_names : 단어(특징)을 추출하여 리스트 구성
feature_names = vect.get_feature_names()
print("특성 개수: {}".format(len(feature_names)))
print("처음 20개 특성:\n{}".format(feature_names[:20]))

특성 개수: 74849
처음 20개 특성:
['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']


In [10]:
print("20010에서 20030까지 특성:\n{}".format(feature_names[20010:20030]))
print("매 2000번째 특성:\n{}".format(feature_names[::2000]))

20010에서 20030까지 특성:
['dratted', 'draub', 'draught', 'draughts', 'draughtswoman', 'draw', 'drawback', 'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl', 'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea']
매 2000번째 특성:
['00', 'aesir', 'aquarian', 'barking', 'blustering', 'bête', 'chicanery', 'condensing', 'cunning', 'detox', 'draper', 'enshrined', 'favorit', 'freezer', 'goldman', 'hasan', 'huitieme', 'intelligible', 'kantrowitz', 'lawful', 'maars', 'megalunged', 'mostey', 'norrland', 'padilla', 'pincher', 'promisingly', 'receptionist', 'rivals', 'schnaas', 'shunning', 'sparse', 'subset', 'temptations', 'treatises', 'unproven', 'walkman', 'xylophonist']


#### 불용어 제거

In [11]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("불용어 개수: {}".format(len(ENGLISH_STOP_WORDS)))
print("불용어 일부:\n{}".format(list(ENGLISH_STOP_WORDS)[:20]))

불용어 개수: 318
불용어 일부:
['among', 'cry', 'though', 'top', 'through', 'whereas', 'be', 'con', 'can', 'eight', 'if', 'keep', 'ltd', 'sixty', 'beyond', 'bottom', 'twenty', 'everything', 'seem', 'very']


In [12]:
# stop words="english"라고 지정하면 내장된 불용어를 사용함
vect = CountVectorizer(min_df=5, stop_words="english").fit(text_train)
X_train = vect.transform(text_train)
print("불용어가 제거된 X_train:\n{}".format(repr(X_train)))

불용어가 제거된 X_train:
<25000x26966 sparse matrix of type '<class 'numpy.int64'>'
	with 2149958 stored elements in Compressed Sparse Row format>


#### TF-IDF

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]

vect1 = CountVectorizer().fit(corpus)
tf = vect1.transform(corpus)

feature_names = vect1.get_feature_names()
print("Term:{}".format(feature_names[:]))
print(tf.toarray(), "\n")

vect2 = TfidfVectorizer().fit(corpus)
tfidf = vect2.transform(corpus)
print(tfidf.toarray())

Term:['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]] 

[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


#### 단어의 쌍으로 구성하는 n-gram 방식

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

bards_words = [
    'The fool doth think he is wise,',
    'but the wise man knows himself to be a fool'
]

# (1, 1) 최소, 최대 단어수
cv = CountVectorizer(ngram_range=(1, 1)).fit(bards_words)
print("어휘 사전 크기: {}".format(len(cv.vocabulary_)))
print("어휘 사전:\n{}".format(cv.get_feature_names()))
print("변환된 데이터 (밀집 배열):\n{}".format(cv.transform(bards_words).toarray()))

어휘 사전 크기: 13
어휘 사전:
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
변환된 데이터 (밀집 배열):
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [21]:
# 두 쌍의 단어로 구성
cv = CountVectorizer(ngram_range=(2,2)).fit(bards_words)
print("어휘 사전 크기: {}".format(len(cv.vocabulary_)))
print("어휘 사전:\n{}".format(cv.get_feature_names()))
print("변환된 데이터 (밀집 배열):\n{}".format(cv.transform(bards_words).toarray()))

어휘 사전 크기: 14
어휘 사전:
['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']
변환된 데이터 (밀집 배열):
[[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]]


In [22]:
# 1-gram부터 trigram까지 모두 포함
cv = CountVectorizer(ngram_range=(1, 3)).fit(bards_words)
print("어휘 사전 크기: {}".format(len(cv.vocabulary_)))
print("어휘 사전:\n{}".format(cv.get_feature_names()))
print("변환된 데이터 (밀집 배열):\n{}".format(cv.transform(bards_words).toarray()))

어휘 사전 크기: 39
어휘 사전:
['be', 'be fool', 'but', 'but the', 'but the wise', 'doth', 'doth think', 'doth think he', 'fool', 'fool doth', 'fool doth think', 'he', 'he is', 'he is wise', 'himself', 'himself to', 'himself to be', 'is', 'is wise', 'knows', 'knows himself', 'knows himself to', 'man', 'man knows', 'man knows himself', 'the', 'the fool', 'the fool doth', 'the wise', 'the wise man', 'think', 'think he', 'think he is', 'to', 'to be', 'to be fool', 'wise', 'wise man', 'wise man knows']
변환된 데이터 (밀집 배열):
[[0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0
  1 0 0]
 [1 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 1
  1 1 1]]


#### stemming test

In [24]:
# 단어별 stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# stemmer 객체 생성
ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for stem in example_words:
    print(ps.stem(stem)) # stem 함수로 단어별 stemming 실행

python
python
python
python
pythonli


In [26]:
# word_tokenize 에러 해결 부분
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sunghee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
# 문장 전체를 stemming
new_text = "It is important to be very pythonly while you are pythoning with python."

words = word_tokenize(new_text)
print(words)

for w in words:
    print(ps.stem(w))

['It', 'is', 'important', 'to', 'be', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.']
It
is
import
to
be
veri
pythonli
while
you
are
python
with
python
.


In [28]:
# 화면에 출력하지 않고 단어리스트로 유지
result = [ps.stem(w) for w in words]
print(result)

['It', 'is', 'import', 'to', 'be', 'veri', 'pythonli', 'while', 'you', 'are', 'python', 'with', 'python', '.']


#### stemming 기능을 추가한 BOW 생성

In [30]:
stemmer = PorterStemmer()

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

# CountVectorizer에 토큰을 생성하는 별도의 함수를 지정하고 함수 내에서 토큰화와 stemming을 실행
vect = CountVectorizer(tokenizer=tokenize, stop_words='english')
vect.fit(['This swimmer likes swimming.'])

sentence1 = vect.transform(['The swimmer likes swimming.'])
sentence2 = vect.transform(['The swimmer swim. .'])

print(vect.get_feature_names())
print(sentence1.toarray())
print(sentence2.toarray())

['.', 'like', 'swim', 'swimmer', 'thi']
[[1 1 1 1 0]]
[[2 0 1 1 0]]


#### full code

In [38]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import numpy as np

reviews_train = load_files("./data/aclImdb/train")
text_train, y_train = reviews_train.data, reviews_train.target

reviews_test = load_files('./data/aclImdb/test')
text_test, y_test = reviews_test.data, reviews_test.target
print("테스트 데이터의 문서 수: {}".format(len(text_test)))
print("클래스별 샘플 수 (테스트 데이터): {}".format(np.bincount(y_test)))

text_test = [doc.replace(b"<br />", b" ") for doc in text_test]
stemmer = PorterStemmer()

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

vect = CountVectorizer(tokenizer=tokenize, stop_words='english', token_pattern=u"(?u)\b\w\w+\b").fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

param_grid = {'C':[0.01, 0.1]}
grid = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5)
grid.fit(X_train, y_train)
print("최상의 크로스 밸리데이션 점수: {:.2f}".format(grid.best_score_))
print("최적의 매개변수: ", grid.best_params_)


X_test = vect.transform(text_test)
print("테스트 점수: {:.2f}".format(grid.score(X_test, y_test)))

테스트 데이터의 문서 수: 25000
클래스별 샘플 수 (테스트 데이터): [12500 12500]
X_train:
<25000x89234 sparse matrix of type '<class 'numpy.int64'>'
	with 2516865 stored elements in Compressed Sparse Row format>
최상의 크로스 밸리데이션 점수: 0.88
최적의 매개변수:  {'C': 0.1}
테스트 점수: 0.88
