In [1]:
!pip install konlpy
!pip install nltk==3.5

Collecting nltk==3.5
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 1.5 MB/s eta 0:00:01
Collecting regex
  Downloading regex-2021.3.17-cp37-cp37m-macosx_10_9_x86_64.whl (285 kB)
[K     |████████████████████████████████| 285 kB 2.0 MB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434674 sha256=d78adb5c43e27174d0ec71454f02d8308595ce33ddce6d0889171659441ab112
  Stored in directory: /Users/taejinoh/Library/Caches/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266
Successfully built nltk
Installing collected packages: regex, nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.4.5
    Uninstalling nltk-3.4.5:
      Successfully uninstalled nltk-3.4.5
Successfully installed nltk-3.5 regex-2021.3.17


In [3]:
import nltk
# Fetch the NLTK data packages used later in this notebook:
# 'stopwords' is read via nltk.corpus.stopwords, 'punkt' is tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taejinoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/taejinoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# N-Gram

In [4]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [5]:
# Toy corpus: two already-tokenised "sentences" used by the N-gram examples below.
text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

In [7]:
# Bigrams of the first toy sentence; list() materialises the result for display.
list(bigrams(text[0]))

[('a', 'b'), ('b', 'c')]

In [8]:
# Trigrams (n=3) of the second toy sentence.
list(ngrams(text[1], n=3))

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

### 문장의 시작과 끝에 위치한 단어를 알 수 있도록 padding을 할 수 있다
- Bigram

In [12]:
from nltk.util import pad_sequence
# Pad the first toy sentence with explicit start/end symbols for n=2.
list(pad_sequence(text[0],
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=2)) 

['<s>', 'a', 'b', 'c', '</s>']

In [None]:
# Pad first, then slide a window of size 2 over the padded sentence so the
# sentence-initial and sentence-final words appear next to a boundary symbol.
padded_sent = list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<s>", 
                                pad_right=True, right_pad_symbol="</s>", n=2))
list(ngrams(padded_sent, n=2))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

- Trigram

In [None]:
# Pad the first toy sentence for trigrams (n=3).
# The original cell was missing the parenthesis that closes list(...).
list(pad_sequence(text[0],
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=3))

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [None]:
# Same as the bigram case, but padded and windowed with n=3.
padded_sent = list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<s>", 
                                pad_right=True, right_pad_symbol="</s>", n=3))
list(ngrams(padded_sent, n=3))

[('<s>', '<s>', 'a'),
 ('<s>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</s>'),
 ('c', '</s>', '</s>')]

- `pad_both_ends`를 이용해 한번에 처리 가능

In [None]:
from nltk.lm.preprocessing import pad_both_ends
# pad_both_ends replaces the explicit pad_sequence call used in the cells above.
list(bigrams(pad_both_ends(text[0], n=2)))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

- `everygrams`를 이용해 한번에 여러 n-gram을 생성할 수 있다

In [None]:
from nltk.util import everygrams
# NOTE: despite its name, this sequence is padded with n=3 (trigram padding).
padded_bigrams = list(pad_both_ends(text[1], n=3))
# All n-grams of length 1 through 3 over the padded sequence.
list(everygrams(padded_bigrams, max_len=3))

[('<s>',),
 ('<s>',),
 ('a',),
 ('c',),
 ('d',),
 ('c',),
 ('e',),
 ('f',),
 ('</s>',),
 ('</s>',),
 ('<s>', '<s>'),
 ('<s>', 'a'),
 ('a', 'c'),
 ('c', 'd'),
 ('d', 'c'),
 ('c', 'e'),
 ('e', 'f'),
 ('f', '</s>'),
 ('</s>', '</s>'),
 ('<s>', '<s>', 'a'),
 ('<s>', 'a', 'c'),
 ('a', 'c', 'd'),
 ('c', 'd', 'c'),
 ('d', 'c', 'e'),
 ('c', 'e', 'f'),
 ('e', 'f', '</s>'),
 ('f', '</s>', '</s>')]

**학습을 위해서는 코퍼스의 모든 단어를 포함하는 단어사전(Vocabulary)이 필요**

In [None]:
from nltk.lm.preprocessing import flatten
# Pad every sentence, then chain the padded sentences into one flat token list.
list(flatten(pad_both_ends(sent, n=2) for sent in text))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

`padded_everygram_pipeline`으로 모든 과정 한번에 수행

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# Order 2: `train` holds the per-sentence everygrams, `vocab` the flattened padded tokens.
train, vocab = padded_everygram_pipeline(2, text)

In [None]:
# Show the everygrams generated for each sentence, then the vocabulary tokens.
# NOTE(review): both objects appear to be one-shot iterators in NLTK; if so,
# re-running this cell without re-running the previous one shows nothing — confirm.
for ngramlize_sent in train:
    print(list(ngramlize_sent))
    print()
print('#############')
list(vocab)

[('<s>',), ('a',), ('b',), ('c',), ('</s>',), ('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

[('<s>',), ('a',), ('c',), ('d',), ('c',), ('e',), ('f',), ('</s>',), ('<s>', 'a'), ('a', 'c'), ('c', 'd'), ('d', 'c'), ('c', 'e'), ('e', 'f'), ('f', '</s>')]

#############


['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

## 실제 데이터로 N-Gram 모델 학습

### 데이터 가져오기 및 토큰화

In [None]:
import os
import requests
import io

# The gist URL is pinned to a specific revision, so the content is stable.
url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"

# Fail loudly on network problems: a timeout stops the cell hanging forever and
# raise_for_status() prevents an HTTP error page from being used as the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()

# NOTE: this rebinds `text` (previously the toy token lists) to the raw corpus string.
text = response.content.decode('utf8')

# Keep a local copy of the corpus next to the notebook.
with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
    fout.write(text)

In [None]:
from nltk import word_tokenize, sent_tokenize

# Split the raw text into sentences, tokenise each sentence into words,
# and lower-case every token.
tokenized_text = [
    [word.lower() for word in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Preview the first 500 characters of the raw corpus.
print(text[:500])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 


In [None]:
# First tokenised (lower-cased) sentence.
tokenized_text[0]

['language',
 'is',
 'never',
 ',',
 'ever',
 ',',
 'ever',
 ',',
 'random',
 'adam',
 'kilgarriff',
 'abstract',
 'language',
 'users',
 'never',
 'choose',
 'words',
 'randomly',
 ',',
 'and',
 'language',
 'is',
 'essentially',
 'non-random',
 '.']

In [None]:
# Prepare the training data for a 3-gram language model.
n = 3
# The second return value (called `vocab` in the toy example above) is passed
# to model.fit() as its second argument below.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

## 모델 학습

In [None]:
from nltk.lm import MLE
# Untrained model of order n (n = 3 from the previous cell).
model = MLE(n)

In [None]:
# Train on the everygrams; the second argument is the padded token stream.
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1391 items>


In [None]:
# Number of items in the fitted vocabulary.
len(model.vocab)

1391

In [None]:
# Look up a training sentence: every token is known, so it comes back unchanged.
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [None]:
# 'lah' never occurs in the corpus, so the lookup maps it to '<UNK>'.
print(model.vocab.lookup('language is never random lah .'.split()))

('language', 'is', 'never', 'random', '<UNK>', '.')


## 모델 사용

In [None]:
# Summary of the n-gram counter built during fit().
print(model.counts)

<NgramCounter with 3 ngram orders and 19611 ngrams>


In [None]:
model.counts['language']  # unigram count: Count('language')

25

In [None]:
model.counts[['language']]['is']  # count of 'is' given the context 'language'

11

In [None]:
model.counts[['language', 'is']]['never']  # count of 'never' given the context 'language is'

7

In [None]:
model.score('language')  # unigram probability P('language')

0.003691671588895452

In [None]:
model.score('is', 'language'.split())  # conditional probability P('is' | 'language')

0.44

In [None]:
model.score('never', 'language is'.split())  # conditional probability P('never' | 'language is')

0.6363636363636364

## 문장 생성

In [None]:
# Sample 20 tokens with a fixed seed; the raw output still contains '</s>' padding.
print(model.generate(20, random_seed=7))

['and', 'carroll', 'used', 'hypothesis', 'testing', 'has', 'been', 'used', ',', 'and', 'a', 'half', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Bound method used by generate_sent() to join a token list back into a string.
detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """Generate text from ``model`` and return it as one detokenized string.

    Args:
        model: Object exposing ``generate(num_words, random_seed=...)``.
        num_words: Maximum number of tokens to sample.
        random_seed: Seed forwarded to ``model.generate`` for reproducibility.

    Returns:
        The sampled tokens joined by ``detokenize``. Start symbols ('<s>') are
        dropped and the output is cut at the first end symbol ('</s>').
    """
    generated = model.generate(num_words, random_seed=random_seed)
    # NLTK returns a bare string (not a list) when asked for a single word;
    # wrap it so the loop below iterates over tokens, not characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            # Sentence-start padding carries no content.
            continue
        if token == '</s>':
            # Everything after the first end-of-sentence marker is padding.
            break
        content.append(token)
    return detokenize(content)

In [None]:
# Same seed as the raw sample above, now cleaned up and detokenized.
generate_sent(model, 20, random_seed=7)

'and carroll used hypothesis testing has been used, and a half.'

In [None]:
# A different seed and a longer token budget.
generate_sent(model, 28, random_seed=6)

'situation up symmetrically, a measure of salience, whereas in fact they were merely testing whether they had been selected at random, and the random in'

# Bag of Words

## Konlpy

In [None]:
from konlpy.tag import Okt
import re  
# Okt analyser instance, reused by the tokenisation cells below.
okt=Okt() 

In [None]:
# Sample Korean documents for the bag-of-words examples.
# text3 is text1 and text2 joined with a space.
text1 = '정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.'
text2 = '소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'
text3 = '정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다. 소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'

In [None]:
# Strip periods with a regular expression. The raw string avoids the
# invalid "\." escape sequence that a normal string literal would contain.
token = re.sub(r"\.", "", text1)
# Tokenise into morphemes with the Okt analyser.
token = okt.morphs(token)
print(token)

['정부', '가', '발표', '하는', '물가상승률', '과', '소비자', '가', '느끼는', '물가상승률', '은', '다르다']


In [None]:
# word2index maps each distinct token to its first-seen position;
# bow[i] holds the frequency of the token whose index is i.
word2index, bow = {}, []
for voca in token:
    if voca in word2index:
        # Known token: bump its frequency.
        index = word2index[voca]
        bow[index] += 1
    else:
        # New token: give it the next free index and start its count at 1.
        word2index[voca] = len(word2index)
        bow.append(1)

In [None]:
# Token -> index mapping in first-seen order.
print(word2index) 

{'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}


In [None]:
# Frequency of each token, positioned by its index in word2index.
print(bow)

[1, 2, 1, 1, 2, 1, 1, 1, 1, 1]


In [None]:
# Strip periods (raw-string pattern avoids the invalid "\." escape), then
# tokenise text2 into morphemes.
token = re.sub(r"\.", "", text2)
token = okt.morphs(token)

# Build the bag of words: word2index assigns indices in first-seen order and
# bow[i] counts the token whose index is i.
word2index = {}
bow = []
for voca in token:
    if voca in word2index:
        bow[word2index[voca]] += 1
    else:
        word2index[voca] = len(word2index)
        bow.append(1)
print(word2index)
print(bow)

{'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]


In [None]:
# Strip periods (raw-string pattern avoids the invalid "\." escape), then
# tokenise text3 into morphemes.
token = re.sub(r"\.", "", text3)
token = okt.morphs(token)

# Build the bag of words: word2index assigns indices in first-seen order and
# bow[i] counts the token whose index is i.
word2index = {}
bow = []
for voca in token:
    if voca in word2index:
        bow[word2index[voca]] += 1
    else:
        word2index[voca] = len(word2index)
        bow.append(1)
print(word2index)
print(bow)

{'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9, '는': 10, '주로': 11, '소비': 12, '상품': 13, '을': 14, '기준': 15, '으로': 16, '느낀다': 17}
[1, 2, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]


## Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray()) # frequency of each word
print(vector.vocabulary_) # index assigned to each word

[[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


## 불용어 제거한 BoW 만들기

### 직접 정의

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE: rebinds `text` to a one-document list for this example.
text=["Family is not an important thing. It's everything."]
# Stop words supplied explicitly as a list.
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])
print(vect.fit_transform(text).toarray()) 
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


### CountVectorizer 제공

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

text=["Family is not an important thing. It's everything."]
# Stop words selected by name: CountVectorizer's own "english" list.
vect = CountVectorizer(stop_words="english")
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


### NLTK 제공

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

text=["Family is not an important thing. It's everything."]
# Stop words taken from NLTK's English list and passed in as a plain list.
sw = stopwords.words("english")
vect = CountVectorizer(stop_words =sw)
print(vect.fit_transform(text).toarray()) 
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


# TF-IDF

## 직접 구현

In [None]:
import pandas as pd
from math import log

In [None]:
# Toy corpus: each document is a whitespace-delimited string.
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요',
]

# Vocabulary: the distinct whitespace-separated tokens, in sorted order.
vocab = sorted({word for document in docs for word in document.split()})

In [None]:
N = len(docs) # total number of documents

def _tokens(d):
    """Return the tokens of ``d``: whitespace-split if it is a string, else ``list(d)``."""
    return d.split() if isinstance(d, str) else list(d)


def tf(t, d):
    """Return the raw term frequency of term ``t`` in document ``d``.

    The document is tokenised before counting, so only whole-token matches
    count. Counting on the raw string would also match substrings (for
    example '과일' inside '과일이').

    Args:
        t: The term to count.
        d: A whitespace-delimited string or an already-tokenised sequence.

    Returns:
        The number of tokens in ``d`` equal to ``t``.
    """
    return _tokens(d).count(t)


def idf(t, documents=None):
    """Return the smoothed inverse document frequency ``log(N / (df + 1))``.

    Args:
        t: The term.
        documents: Corpus to compute the statistic over. Defaults to the
            notebook-level ``docs`` list, which keeps ``idf(t)`` working.

    Returns:
        ``log(N / (df + 1))`` where ``N`` is the number of documents and
        ``df`` the number of documents containing ``t`` as a whole token.

    Raises:
        ValueError: If the corpus is empty (``log(0)``).
    """
    if documents is None:
        documents = docs
    documents = list(documents)
    # Whole-token membership, consistent with tf().
    df = sum(t in _tokens(doc) for doc in documents)
    return log(len(documents) / (df + 1))


def tfidf(t, d, documents=None):
    """Return ``tf(t, d) * idf(t, documents)``.

    ``documents`` defaults to the notebook-level corpus, exactly as in ``idf``.
    """
    return tf(t, d) * idf(t, documents)

### TF

In [None]:
# Term-frequency matrix: one row per document, one column per vocabulary term.
result = [[tf(term, docs[i]) for term in vocab] for i in range(N)]

tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


### IDF

In [None]:
# One IDF value per vocabulary term, shown as a single-column table.
result = [idf(term) for term in vocab]

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


### 결과

In [None]:
# TF-IDF matrix: one row per document, one column per vocabulary term.
result = [[tfidf(term, docs[i]) for term in vocab] for i in range(N)]

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


## Scikit-Learn을 이용한 TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]
vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray()) # record the frequency of each word in the corpus
print(vector.vocabulary_) # show which index was assigned to each word

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]
# Fit on the corpus, then transform the same corpus into TF-IDF vectors.
tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


# Document Similarity

## Cosine similarity

In [None]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Args:
        A: 1-D array-like.
        B: 1-D array-like of the same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. If either vector has zero norm
        the similarity is defined as 0.0, rather than producing NaN together
        with a divide-by-zero RuntimeWarning.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector has no direction; treat it as dissimilar to everything.
        return 0.0
    return dot(A, B) / denominator

In [None]:
# Toy count vectors. doc3 is exactly 2 * doc2, so the two point in the same direction.
doc1=np.array([0,1,1,1])
doc2=np.array([1,0,1,1])
doc3=np.array([2,0,2,2])

In [None]:
print(cos_sim(doc1, doc2)) # document 1 vs document 2
print(cos_sim(doc1, doc3)) # document 1 vs document 3
print(cos_sim(doc2, doc3)) # document 2 vs document 3

0.6666666666666667
0.6666666666666667
1.0000000000000002


## Jaccard similarity

In [None]:
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"

# Tokenise both documents on whitespace.
tokenized_doc1, tokenized_doc2 = (document.split() for document in (doc1, doc2))

# One token list per line.
print(tokenized_doc1, tokenized_doc2, sep="\n")

['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']
['apple', 'banana', 'coupon', 'passport', 'love', 'you']


In [None]:
# Every distinct token that occurs in either document.
union = set(tokenized_doc1) | set(tokenized_doc2)
print(union)

{'you', 'coupon', 'apple', 'passport', 'like', 'holder', 'likey', 'love', 'banana', 'everyone', 'card', 'watch'}


In [None]:
# Tokens that occur in both documents.
intersection = set(tokenized_doc1) & set(tokenized_doc2)
print(intersection)

{'banana', 'apple'}


In [None]:
# Jaccard similarity = |intersection| / |union|.
print(len(intersection)/len(union))


0.16666666666666666


## Euclidean distance

In [None]:
import numpy as np
def dist(x, y):
    """Return the Euclidean distance between vectors ``x`` and ``y``.

    Args:
        x: Array-like of numbers (NumPy array, list or tuple).
        y: Array-like of numbers with a shape compatible with ``x``.

    Returns:
        ``sqrt(sum((x - y) ** 2))`` computed in floating point.
    """
    # Convert to float first: this accepts plain lists/tuples and prevents
    # unsigned-integer count vectors from wrapping around on subtraction.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.sqrt(np.sum((x - y) ** 2))

# Three candidate documents and one query, all as count vectors.
doc1 = np.array((2, 3, 0, 1))
doc2 = np.array((1, 2, 3, 1))
doc3 = np.array((2, 1, 2, 2))
docQ = np.array((1, 1, 0, 1))

# Distance from each candidate to the query, one value per line.
print(*(dist(candidate, docQ) for candidate in (doc1, doc2, doc3)), sep="\n")

2.23606797749979
3.1622776601683795
2.449489742783178
