### 1.1 표준 토큰화

In [1]:
from nltk.tokenize import TreebankWordTokenizer

# Tokenize one sample sentence with the Penn Treebank tokenizer.
# In the printed result the hyphenated word "Model-based" stays a single
# token while the contraction "don't" is split into "do" and "n't".
tokenizer = TreebankWordTokenizer()
text = "Model-based RL don't need a value function for the policy."
tokens = tokenizer.tokenize(text)
print(tokens)

['Model-based', 'RL', 'do', "n't", 'need', 'a', 'value', 'function', 'for', 'the', 'policy', '.']


In [2]:
from nltk.tokenize import word_tokenize

# Tokenize the same sentence (`text` is defined in the previous cell) with
# NLTK's word_tokenize helper so its result can be compared with the
# Treebank tokenizer's.
word_tokens = word_tokenize(text)
print(word_tokens)

['Model-based', 'RL', 'do', "n't", 'need', 'a', 'value', 'function', 'for', 'the', 'policy', '.']


### 1.2 어간 추출 및 표제어 추출

단어를 형태소 수준에서 분석하면 품사나 시제가 다른 단어라도 같은 형태로 토큰화할 수 있다.

In [3]:
from nltk.stem import PorterStemmer, LancasterStemmer

stem1 = PorterStemmer()
stem2 = LancasterStemmer()

# Four inflected forms of the same verb.
words = ['eat', 'ate', 'eaten', 'eating']

# Run the identical word list through each stemmer, one output line per
# stemmer (Porter first, then Lancaster), so the results can be compared.
for stemmer in (stem1, stem2):
  print(list(map(stemmer.stem, words)))

['eat', 'ate', 'eaten', 'eat']
['eat', 'at', 'eat', 'eat']


어간 추출(stemming)만으로는 위 4가지 단어가 서로 다른 형태로 남는다. 모두 같은 단어(eat)의 활용형이므로 이렇게 다르게 분류되면 안 된다.

In [4]:
import nltk
from nltk import WordNetLemmatizer

# Uncomment on first use if the WordNet data has not been downloaded yet.
# nltk.download('wordnet')
lemm = WordNetLemmatizer()
words = ['eat', 'ate', 'eaten', 'eating']

# pos='v' asks the lemmatizer to treat every word as a verb.
lemmas = [lemm.lemmatize(word, pos='v') for word in words]
print(lemmas)

['eat', 'eat', 'eat', 'eat']


전부 다 eat(동사)로 추출 완료

### 불용어 제거

In [5]:
import nltk
# Fetch the stop-word corpus; the log below shows NLTK skips the download
# when the package is already up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Preview the first five entries of the English stop-word list.
english_stopwords = stopwords.words('english')
print(english_stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\p\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import nltk
# The 'punkt' package is fetched before tokenizing the sentence below.
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

input_sentence = "We shoud all study hard for the exam."

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(input_sentence)

# Keep only the tokens that are not stop words.
# NOTE: the comparison is case-sensitive, so the capitalised token 'We'
# is kept even though the stop-word list contains 'we'.
result = [token for token in word_tokens if token not in stop_words]

print(word_tokens)
print(result)

['We', 'shoud', 'all', 'study', 'hard', 'for', 'the', 'exam', '.']
['We', 'shoud', 'study', 'hard', 'exam', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\p\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 정수 인코딩 및 Sorting

In [7]:
mylist = ['English', 'Math', 'Science']

# enumerate pairs every course with its 0-based position in the list,
# which serves as that course's integer code.
for n, name in enumerate(mylist):
  print(f"Course : {name}, Number : {n}")

Course : English, Number : 0
Course : Math, Number : 1
Course : Science, Number : 2


High-frequency sorting

In [9]:
# Word -> frequency table.
vocab = {'apple': 2, 'July': 6, 'piano': 4, 'cup': 8, 'orange': 1}

# Order the (word, frequency) pairs from most to least frequent.
vocab_sort = sorted(vocab.items(), key=lambda pair: pair[1], reverse=True)
print(vocab_sort)

# Give each word a 1-based rank: the most frequent word gets index 1.
word2inx = {
  token: rank
  for rank, (token, _freq) in enumerate(vocab_sort, start=1)
}
print(word2inx)

[('cup', 8), ('July', 6), ('piano', 4), ('apple', 2), ('orange', 1)]
{'cup': 1, 'July': 2, 'piano': 3, 'apple': 4, 'orange': 5}


In [12]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
text = "Model-based RL don't need a value function for the polish, "\
      "but some of Model-based RL algorithms do have a value function"

# Build a bag-of-words representation of `text`:
#   word2inx maps each distinct token to an index, assigned in first-seen order;
#   Bow[i] holds the number of occurrences of the token whose index is i.
token_text = tokenizer.tokenize(text)
word2inx = {}
Bow = []

for word in token_text:
  if word not in word2inx:
    # First occurrence: the next free index equals the current vocabulary
    # size, which is also len(Bow), so appending keeps both structures aligned.
    word2inx[word] = len(word2inx)
    Bow.append(1)
  else:
    # Repeated token: bump the count stored at its index.
    Bow[word2inx[word]] += 1

print(word2inx)
print(Bow)

{'Model-based': 0, 'RL': 1, 'do': 2, "n't": 3, 'need': 4, 'a': 5, 'value': 6, 'function': 7, 'for': 8, 'the': 9, 'polish': 10, ',': 11, 'but': 12, 'some': 13, 'of': 14, 'algorithms': 15, 'have': 16}
[2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### 유사도 분석

코사인 유사도

In [13]:
import numpy as np
def cos_sim(A, B):
  """Return the cosine similarity of vectors A and B.

  Computed as dot(A, B) / (||A|| * ||B||).

  Args:
    A: sequence or array of numbers.
    B: sequence or array of numbers with the same length as A.

  Returns:
    The cosine of the angle between A and B. If either vector has zero
    norm the angle is undefined; 0.0 is returned in that case instead of
    dividing by zero (which would give nan and a RuntimeWarning).
  """
  denom = np.linalg.norm(A) * np.linalg.norm(B)
  if denom == 0:
    # A zero vector has no direction, so report "no similarity".
    return 0.0
  return np.dot(A, B) / denom

# Toy vectors: a and b share no non-zero position, while c overlaps both.
a, b, c = [1, 0, 0, 1], [0, 1, 1, 0], [1, 1, 1, 1]

# Print the similarity of each pair on one line: (a, b), (a, c), (b, c).
pairs = [(a, b), (a, c), (b, c)]
print(*(cos_sim(u, v) for u, v in pairs))

0.0 0.7071067811865475 0.7071067811865475
