## Bag of Words(BoW)

### Basic

In [1]:
from konlpy.tag import Okt

okt = Okt()

def build_bag_of_words(document):
    document = document.replace('.', '')
    tokenized_document = okt.morphs(document)
    
    word_to_index = {}
    bow = []
    
    for word in tokenized_document:        
        if word not in word_to_index.keys():  # 단어가 처음 나왔을 경우
            word_to_index[word] = len(word_to_index)  
            bow.insert(len(word_to_index) - 1, 1)
        else:                                 # 단어가 다시 나왔을 경우
            index = word_to_index.get(word)
            print(word, type(word_to_index))
            bow[index] = bow[index] + 1
                    
    return word_to_index, bow
    
    
doc1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
vocab, bow = build_bag_of_words(doc1)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

가 <class 'dict'>
물가상승률 <class 'dict'>
vocabulary : {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
bag of words vector : [1, 2, 1, 1, 2, 1, 1, 1, 1, 1]


In [2]:
doc2 = '소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'

vocab, bow = build_bag_of_words(doc2)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

을 <class 'dict'>
vocabulary : {'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
bag of words vector : [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]


In [3]:
doc3 = doc1 + ' ' + doc2
vocab, bow = build_bag_of_words(doc3)

print('vocabulary :', vocab)
print('bag of words vector :', bow)

가 <class 'dict'>
물가상승률 <class 'dict'>
소비자 <class 'dict'>
하는 <class 'dict'>
물가상승률 <class 'dict'>
을 <class 'dict'>
vocabulary : {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9, '는': 10, '주로': 11, '소비': 12, '상품': 13, '을': 14, '기준': 15, '으로': 16, '느낀다': 17}
bag of words vector : [1, 2, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]


### BOW-CountVectorizer 

In [4]:
# CountVectorizer : 띄어쓰기만으로 토큰화 진행
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()

# 빈도수 기록
print('bag of words vector :', vector.fit_transform(corpus).toarray()) 

# 단어의 인덱스
print('vocabulary :',vector.vocabulary_)

bag of words vector : [[1 1 2 1 2 1]]
vocabulary : {'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


### BOW-불용어 제거(사용자 정의 불용어)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [6]:
# 사용자정의 불용어 적용
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])

print('bag of words vector :',vect.fit_transform(text).toarray())
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


### BOW-불용어 제거(CountVectorizer 포함된 불용어)

In [7]:
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")

print('bag of words vector :',vect.fit_transform(text).toarray())
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1]]
vocabulary : {'family': 0, 'important': 1, 'thing': 2}


### BOW-불용어 제거(NLTK 포함된 불용어)

In [8]:
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words("english")

vect = CountVectorizer(stop_words=stop_words)
print('bag of words vector :',vect.fit_transform(text).toarray()) 
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


## 문서 단어 행렬(Document-Term Matrix, DTM)

### 데이터 다운로드

In [9]:
# colab : wget -c https://raw.githubusercontent.com/euphoris/datasets/master/imdb.xlsx

In [10]:
!pip install wget



In [11]:
# windows일 경우
import wget

wget.download('https://raw.githubusercontent.com/euphoris/datasets/master/imdb.xlsx')

100% [..............................................................................] 55772 / 55772

'imdb (1).xlsx'

### 데이터 열기

In [12]:
import pandas as pd

df = pd.read_excel('imdb.xlsx', index_col=0)

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [14]:
df.shape

(748, 2)

### TDM 만들기

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features : 빈도순으로 최대 500단어까지 포함
# stop_words='english' : 불용어(관사, 전치사등을 제거)
cv = CountVectorizer(max_features=500, stop_words='english')

In [16]:
tdm = cv.fit_transform(df['review'])
tdm

<748x500 sparse matrix of type '<class 'numpy.int64'>'
	with 3434 stored elements in Compressed Sparse Row format>

In [17]:
tdm.shape

(748, 500)

### 단어 목록

In [18]:
# 단어 목록을 볼때는 tdm이 아니라 cv에 저장되어 있음
cv.get_feature_names()[:10]



['10',
 '20',
 '90',
 'absolutely',
 'acted',
 'acting',
 'action',
 'actor',
 'actors',
 'actress']

In [19]:
len(cv.get_feature_names())

500

### 단어별 총 빈도

In [20]:
tdm.sum(axis=0) # 열별 단어 수 

matrix([[ 29,   3,   6,   9,   3,  43,   7,  10,  19,   3,   3,  10,   3,
           4,   3,   4,   9,   3,   3,   3,   6,   3,   4,   3,  13,   4,
           3,   5,   3,   8,   5,   3,  14,  71,   4,  11,   4,   6,   8,
          25,  18,  10,   5,  10,   4,   3,   4,  10,   3,   3,   6,   7,
           3,   4,  10,   5,   3,  18,   6,   8,  24,  35,   4,   3,   7,
           3,   4,   5,   8,  10,   6,   5,   5,   7,   4,   3,   3,   5,
           3,   3,   4,   4,   7,   4,   5,   3,   4,   6,   4,   3,   5,
           4,   3,   5,   4,   3,   9,   3,   4,   3,  13,  11,  22,   5,
           3,   9,   5,  12,   6,   3,   9,  12,  26,   4,  10,   3,   3,
           3,   3,   4,   4,   3,   6,   3,   9,  11,  11,   4,   5,   3,
           9,   4,   6,   3,   8,   3,   3,  16,   4,   7,   4,   4,   5,
           8,   5,   7,   5,   7,   5,   3,   3,  10,   5, 163,  24,   3,
           3,   5,   6,   4,   5,  19,   3,   9,   3,   7,   5,   3,   4,
           3,   6,   3,   7,   3,   6,

In [21]:
tdm.sum(axis=1) # 행별 단어 수 

matrix([[  5],
        [  5],
        [ 12],
        [  3],
        [  6],
        [  7],
        [  2],
        [  7],
        [  2],
        [  2],
        [  1],
        [  4],
        [  3],
        [  1],
        [  5],
        [  5],
        [  6],
        [ 10],
        [  3],
        [229],
        [  5],
        [  2],
        [  5],
        [  3],
        [  1],
        [  2],
        [  4],
        [  2],
        [  7],
        [  3],
        [  1],
        [  0],
        [  1],
        [  3],
        [  4],
        [  6],
        [  3],
        [  6],
        [ 11],
        [  3],
        [  4],
        [  3],
        [  2],
        [  0],
        [  0],
        [  6],
        [  4],
        [  3],
        [  1],
        [  8],
        [  5],
        [  5],
        [  2],
        [  2],
        [  4],
        [  5],
        [  5],
        [  1],
        [  2],
        [  4],
        [  3],
        [  4],
        [  0],
        [  1],
        [  2],
        [  3],
        [ 

In [11]:
word_count = pd.DataFrame({
    '단어': cv.get_feature_names(),
    '빈도': tdm.sum(axis=0).flat
})

NameError: name 'pd' is not defined

In [12]:
type(tdm.sum(axis=0))

NameError: name 'tdm' is not defined

In [13]:
word_count

NameError: name 'word_count' is not defined

In [14]:
word_count.sort_values('빈도', ascending=False).head()

NameError: name 'word_count' is not defined

### 단어 빈도 저장

In [15]:
word_count.to_csv('word_count.csv')

NameError: name 'word_count' is not defined

## 단어 구름(wordcloud)

### wordcloud 설치

In [None]:
#아나콘다를 이용할 경우:
!conda install -y -c conda-forge wordcloud

#맥 또는 리눅스에서는 pip 명령어로 설치
!pip install wordcloud

In [None]:
!pip install wordcloud

### 데이터 불러오기

In [28]:
import pandas as pd

word_count = pd.read_csv('word_count.csv', index_col=0)
word_count.head()

Unnamed: 0,단어,빈도
0,10,29
1,20,3
2,90,6
3,absolutely,9
4,acted,3


### 단어 구름

In [29]:
from wordcloud import WordCloud

#font_path: 글꼴의 경로
#max_words: 워드클라우드를 그릴 단어의 개수
#background_color: 배경색 설정
#width : 가로크기(픽셀 단위)
#height: 세로크기(픽셀 단위)
#배경은 흰색, 최대 100단어를 가로 400픽셀, 세로 300픽셀 크기로 단어 구름으로 그린다.
wc = WordCloud(background_color='white', max_words=100, width=400, height=300)