In [28]:
from nltk import sent_tokenize, word_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import nltk
import numpy as np
from scipy import sparse
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/bongeungu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
                You can see it out your window or on your television. \
                You feel it when you go to work, or go to church or pay your taxes.'
                
# 각각 문장으로 구성된 list 객체 반환
sentences = sent_tokenize(text = text_sample)

print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [6]:
sentence = sentences[0]

# 문장을 단어로 토큰화
words = word_tokenize(sentence)

print(type(words), len(words))
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [16]:
for i, sentence in enumerate(sentences):
    tokens = word_tokenize(sentence)
    
    print(tokens)
print(type(tokens), len(tokens))

['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']
['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.']
['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']
<class 'list'> 18


In [10]:
def tokenize_test(text):
    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    
    return word_tokens

In [11]:
word_tokens = tokenize_test(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


# Delete Stopwords

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bongeungu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
print(nltk.corpus.stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
stopwords = nltk.corpus.stopwords.words('english')
all_tokens=[]

for sentence in word_tokens:
    filtered_words=[]
    
    for word in sentence:
        word = word.lower()
        
        if word not in stopwords:
            filtered_words.append(word)
    all_tokens.append(filtered_words)

print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


# Stemming & Lemmatization

In [21]:
# Stemming

stemmer = LancasterStemmer()

print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked'))
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused'))
print(stemmer.stem('happier'), stemmer.stem('happiest'))
print(stemmer.stem('fancier'), stemmer.stem('fanciest'))

# Stemming의 경우 비교형, 최상급형으로 변형된 단어의 정확한 원형을 찾지 못함

work work work
amus amus amus
happy happiest
fant fanciest


In [23]:
# WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bongeungu/nltk_data...


True

In [25]:
lemma = WordNetLemmatizer()
# 추출할 원형 단어의 품사를 지정함(v = 동사, a = 형용사)
print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused','v'))
print(lemma.lemmatize('working', 'v'), lemma.lemmatize('works', 'v'), lemma.lemmatize('worked','v'))
print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))
print(lemma.lemmatize('fancier', 'a'), lemma.lemmatize('fanciest', 'a'))

amuse amuse amuse
work work work
happy happy
fancy fancy


# Bag of Words(BOW)

## 희소행렬 - COO 형식

In [43]:
# 기준 데이터
dense = np.array([[3,0,1],[0,2,0]])

In [41]:
data = np.array([3,1,2])

row_pos = np.array([0,0,1]) # value의 위치 (row 기준)
col_pos = np.array([0,2,1]) # value의 위치 (col 기준)

sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))

In [42]:
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

## 희소행렬 - CSR 형식

In [44]:
dense2 = np.array([[0,0,1,0,0,5],
            [1,4,0,3,2,5],
            [0,6,0,3,0,0],
            [2,0,0,0,0,0],
            [0,0,0,7,0,8],
            [1,0,0,0,0,0]])

In [47]:
# coo
data2 = np.array([1,5,1,4,3,2,5,6,3,2,7,8,1])

row_pos = np.array([0,0,1,1,1,1,1,2,2,3,4,4,5])
col_pos = np.array([2,5,0,1,3,4,5,1,3,0,3,5,0])

sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)))

In [49]:
# csr
row_pos_ind = np.array([0.2,7,9,10,12,13])

sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind))

In [51]:
print(sparse_coo.toarray())
print('-------------------')
print(sparse_csr.toarray())

[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
-------------------
[[ 1  4  1  3  2 10]
 [ 0  6  0  3  0  0]
 [ 2  0  0  0  0  0]
 [ 0  0  0  7  0  8]
 [ 1  0  0  0  0  0]]
