## 1 Bag of Words(BoW)

#### 1.1 BoW 직접 구현 (KoNLPy Okt)

In [8]:
from konlpy.tag import Okt

# Shared Okt instance from KoNLPy; build_bag_of_words() below calls okt.morphs() on it.
okt = Okt()

def build_bag_of_words(document, tokenizer=None):
    """Build a vocabulary and a bag-of-words count vector for one document.

    Periods are stripped from the text, the text is split into tokens, and
    each distinct token receives an index in order of first appearance.

    Args:
        document: Text to vectorize.
        tokenizer: Optional callable that maps a string to an iterable of
            tokens. Defaults to ``okt.morphs`` of the module-level Okt
            instance, which is what this function always used before.

    Returns:
        A ``(word_to_index, bow)`` tuple. ``word_to_index`` maps each token
        to its index, and ``bow[i]`` is the number of occurrences of the
        token whose index is ``i``.
    """
    if tokenizer is None:
        # Looked up at call time so the default keeps using the shared Okt instance.
        tokenizer = okt.morphs

    document = document.replace('.', '')
    tokenized_document = tokenizer(document)

    word_to_index = {}
    bow = []

    for word in tokenized_document:
        index = word_to_index.get(word)
        if index is None:
            # First occurrence: the next free index equals the current
            # vocabulary size, and the count starts at 1.
            word_to_index[word] = len(bow)
            bow.append(1)
        else:
            # Repeated occurrence: increment the existing counter.
            bow[index] += 1

    return word_to_index, bow

# How many times does each word occur?

doc1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
vocab, bow = build_bag_of_words(doc1)
# One print call; the two output lines are the same as with separate calls.
print(f'vocabulary : {vocab}\nbag of words vector : {bow}')

vocabulary : {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
bag of words vector : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [12]:
# Vectorize the second sample sentence on its own (fresh vocabulary).
doc2 = '소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'
vocab, bow = build_bag_of_words(doc2)

print(f'vocabulary : {vocab}')
print(f'bag of words vector : {bow}')

vocabulary : {'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
bag of words vector : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Join both sentences into one document so that tokens occurring in both
# (e.g. '물가상승률') are accumulated in a single count vector.
doc3 = doc1 + ' ' + doc2
vocab, bow = build_bag_of_words(doc3)

print('vocabulary :', vocab)
print('bag of words vector :', bow)

### 1.2 BOW-CountVectorizer

In [14]:
# CountVectorizer splits text into word tokens without any morphological
# analysis, so it is geared toward English and is a poor fit for Korean.
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()

# Learn the vocabulary and record the per-word frequencies.
counts = vector.fit_transform(corpus)
print('bag of words vector :', counts.toarray())

# Word -> column index in the count vector.
print('vocabulary :', vector.vocabulary_)

bag of words vector : [[1 1 2 1 2 1]]
vocabulary : {'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


### 1.3 BOW-불용어 제거
- BOW 쓰는 이유: 단어의 등장 횟수를 세어 문서에서 중요한 단어를 찾아내기 위함. 단, 자주 등장하지만 의미가 거의 없는 단어(관사, 조사 등)는 불용어로 지정해 제거한다.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [16]:
# Apply a user-defined stop-word list.
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(
    stop_words=["the", "a", "an", "is", "not"],
)

counts = vect.fit_transform(text)
print('bag of words vector :', counts.toarray())
print('vocabulary :', vect.vocabulary_)

bag of words vector : [[1 1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


### 1.4 BOW-불용어 제거(CountVectorizer 포함된 불용어)

In [18]:
text = ["Family is not an important thing. It's everything."]
# The string "english" selects the stop-word list bundled with CountVectorizer.
vect = CountVectorizer(stop_words="english")

counts = vect.fit_transform(text)
print('bag of words vector :', counts.toarray())
print('vocabulary :', vect.vocabulary_)

bag of words vector : [[1 1 1]]
vocabulary : {'family': 0, 'important': 1, 'thing': 2}


### 1.5 BOW-불용어 제거(NLTK 포함된 불용어)

In [19]:
text = ["Family is not an important thing. It's everything."]
# Use NLTK's English stop-word list instead of the one bundled with scikit-learn.
stop_words = stopwords.words("english")
vect = CountVectorizer(stop_words=stop_words)

counts = vect.fit_transform(text)
print('bag of words vector :', counts.toarray())
print('vocabulary :', vect.vocabulary_)

bag of words vector : [[1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


## 2 문서 단어 행렬(Document-Term Matrix, DTM)
- 문제점 : 대부분의 값이 0인 희소(sparse) 행렬이 되고, 단어(term) 수만큼 열(차원)이 늘어난다

#### 2.0.1 데이터 다운로드

In [None]:
# On Colab, download with: !wget -c https://raw.githubusercontent.com/euphoris/datasets/master/imdb.xlsx

In [20]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py): started
  Building wheel for wget (setup.py): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=bf4b9bde4e237429116354d03b484e93f05abf0164015a498c56fba6235a7757
  Stored in directory: c:\users\virtue\appdata\local\pip\cache\wheels\04\5f\3e\46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [23]:
# Windows: download through the `wget` Python package.
import wget

url = 'https://raw.githubusercontent.com/euphoris/datasets/master/imdb.xlsx'
wget.download(url)

  0% [                                                                              ]     0 / 55772 14% [...........                                                                   ]  8192 / 55772 29% [......................                                                        ] 16384 / 55772 44% [..................................                                            ] 24576 / 55772 58% [.............................................                                 ] 32768 / 55772 73% [.........................................................                     ] 40960 / 55772 88% [....................................................................          ] 49152 / 55772100% [..............................................................................] 55772 / 55772

'imdb (1).xlsx'

#### 2.0.2 데이터 열기

In [27]:
import os

import pandas as pd

# The download above was saved as 'imdb (1).xlsx' because a file named
# 'imdb.xlsx' already existed. On a clean directory only 'imdb.xlsx' exists,
# so fall back to it instead of failing with FileNotFoundError.
xlsx_path = 'imdb (1).xlsx' if os.path.exists('imdb (1).xlsx') else 'imdb.xlsx'

# The first spreadsheet column holds the row index.
df = pd.read_excel(xlsx_path, index_col=0)

In [28]:
# Preview the first rows: each row has a `review` text and a `sentiment` label.
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


### 2.3 DTM 만들기 (코드에서는 변수명 `tdm` 사용)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# stop_words='english': drop English stop words (articles, prepositions, ...).
# max_features=500: keep at most the 500 most frequent words.
# The cap is a tunable; try other values and keep the one that works best.
cv = CountVectorizer(
    stop_words='english',
    max_features=500,
)

In [30]:
# Learn the vocabulary from the 'review' column and count word occurrences.
# Per the recorded output, the result is a 748x500 sparse matrix; 500 matches max_features.
tdm = cv.fit_transform(df['review']) # convert the review column into word counts
tdm

<748x500 sparse matrix of type '<class 'numpy.int64'>'
	with 3434 stored elements in Compressed Sparse Row format>

In [31]:
# Matrix dimensions as (rows, columns).
tdm.shape

(748, 500)

### 2.4 단어 목록

In [32]:
# The word list is stored on the vectorizer (cv), not on the matrix (tdm);
# each word is one feature, and the full list has 500 entries.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. It returns an ndarray, so .tolist()
# keeps the displayed result a plain list of strings.
cv.get_feature_names_out()[:10].tolist()



['10',
 '20',
 '90',
 'absolutely',
 'acted',
 'acting',
 'action',
 'actor',
 'actors',
 'actress']

In [33]:
# Confirm that the vocabulary holds 500 words. get_feature_names() was removed
# in scikit-learn 1.2, so use get_feature_names_out() (available since 1.0).
len(cv.get_feature_names_out())

500

### 2.5 단어별 총 빈도