### NLTK (Natural Language Toolkit)
텍스트 전처리 및 탐색 코드를 보다 빠르고 간편하게 작성할 수 있게 도와주는 Python 라이브러리

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [3]:
# Sample English sentences that the following cells tokenize and filter.
test_sentences = [
    "i have looked forward to seeing this since i first saw it amoungst her work",
    "this is a superb movie suitable for all but the very youngest",
    "i first saw this movie when I was a little kid and fell in love with it at once",
    "i am sooo tired but the show must go on",
]

In [4]:
# English stopwords.
# The corpus reader is imported under an alias so that binding the word list
# to the name `stopwords` does not overwrite the reader itself; this keeps the
# cell safe to execute more than once.
from nltk.corpus import stopwords as stopwords_corpus

stopwords = stopwords_corpus.words('english')

print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Extend the base stopword list with extra custom keywords and keep the
# combined list under its own name (the base list is left unchanged).
new_keywords = ['noone', 'sooo', 'thereafter', 'beyond', 'amoungst', 'among']
updated_stopwords = list(stopwords)
updated_stopwords.extend(new_keywords)

print(updated_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Tokenize every sentence in test_sentences, drop the tokens found in
# updated_stopwords, and store one token list per sentence in tokenized_word.
# The stemming cell reads `tokenized_word`, so the result must be bound to
# that name (it was previously stored only as `tokenize_list`).
# Matching is case-sensitive, so an uppercase 'I' is not filtered out.
stop_set = set(updated_stopwords)  # build once: O(1) membership per token

tokenized_word = [
    [token for token in word_tokenize(sentence) if token not in stop_set]
    for sentence in test_sentences
]
# Keep the earlier name available for any code that still refers to it.
tokenize_list = tokenized_word

print(tokenized_word)

[['looked', 'forward', 'seeing', 'since', 'first', 'saw', 'work'], ['superb', 'movie', 'suitable', 'youngest'], ['first', 'saw', 'movie', 'I', 'little', 'kid', 'fell', 'love'], ['tired', 'show', 'must', 'go']]


In [8]:
# Stemming exercise: apply the Porter stemmer to the first sentence only
# (tokenized_word[0]), one token at a time.
stemmer = PorterStemmer()
stemmed_sent = [stemmer.stem(word) for word in tokenized_word[0]]

print(stemmed_sent)

['look', 'forward', 'see', 'sinc', 'first', 'saw', 'work']


---

### 전체 코드

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Sample English sentences that the rest of this script tokenizes and filters.
test_sentences = [
    "i have looked forward to seeing this since i first saw it amoungst her work",
    "this is a superb movie suitable for all but the very youngest",
    "i first saw this movie when I was a little kid and fell in love with it at once",
    "i am sooo tired but the show must go on",
]

# Store the English stopword list.
# The corpus reader is imported under an alias so that binding the word list
# to the name `stopwords` does not overwrite the reader itself; this keeps the
# code safe to execute more than once in the same session.
from nltk.corpus import stopwords as stopwords_corpus

stopwords = stopwords_corpus.words('english')

print(stopwords)

# Add custom keywords to the stopwords and store the combined list separately
# (the base list is left unchanged).
new_keywords = ['noone', 'sooo', 'thereafter', 'beyond', 'amoungst', 'among']
updated_stopwords = list(stopwords)
updated_stopwords.extend(new_keywords)

print(updated_stopwords)

# Filter each sentence of test_sentences against the updated stopword list
# and collect the surviving tokens in tokenized_word.
# Every sentence yields its own list of tokens.
tokenized_word = [
    [token for token in word_tokenize(sentence) if token not in updated_stopwords]
    for sentence in test_sentences
]

print(tokenized_word)

# Stemming exercise: apply the Porter stemmer to the first sentence only
# (tokenized_word[0]), one token at a time.
stemmer = PorterStemmer()
stemmed_sent = [stemmer.stem(word) for word in tokenized_word[0]]

print(stemmed_sent)

---

### 문서 유사도

In [9]:
import nltk
# Fetch the 'punkt' tokenizer data before word_tokenize is used.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sooyeon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### sentence

In [10]:
# Two short English passages whose token overlap is measured afterwards.
document_1 = 'South Korea is a country in East Asia, constituting the southern part of the Korean Peninsula, and sharing a land border with North Korea. 25 million people, around half of the countrys population of more than 51 million people, live in the Seoul Capital Area, the fifth-largest metropolitan area in the world.'
document_2 = 'North Korea is a country in East Asia constituting the northern part of the Korean Peninsula. The country is bordered to the north by China and by Russia along the Amnok and Tumen rivers, and to the south by South Korea, with the heavily fortified Korean Demilitarized Zone (DMZ) separating the two.'

# Tokenize both passages with the same word tokenizer.
word_tokens_document_1, word_tokens_document_2 = (
    word_tokenize(text) for text in (document_1, document_2)
)

### Jaccard Similarity
두 집합 $A, B$에서 교집합 $A \cap B$와 합집합 $A \cup B$를 구하여 그 둘의 크기를 비교

Jaccard similarity: $J(D_1, D_2) = \frac{|D_1 \cap D_2|}{|D_1 \cup D_2|}$

In [11]:
# Union: every distinct token that occurs in either document.
union_tokens = set(word_tokens_document_1) | set(word_tokens_document_2)

In [12]:
# Intersection: distinct tokens that occur in both documents.
intersection_tokens = set(word_tokens_document_1) & set(word_tokens_document_2)

In [13]:
# Jaccard similarity: size of the intersection divided by size of the union.
shared_count = len(intersection_tokens)
total_count = len(union_tokens)
jaccard_similarity = shared_count / total_count
print(jaccard_similarity)

0.30158730158730157


문서 내 단어의 개수를 신경쓰지 않는다는 문제점이 존재함<br>
단어가 여러 번 사용되었음에도 이를 무시하고 단어의 존재여부만을 가지고 문서 유사도를 계산

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Small example corpus: five short documents to be vectorized.
corpus = [
    'This is the first document. This is very important.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    'This is Sparta'
]

In [17]:
# Build a bag-of-words count matrix for the corpus.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed vocabulary a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())
print()
# Dense view of the sparse matrix, printed alongside the vocabulary above.
print(X.toarray())

['and', 'document', 'first', 'important', 'is', 'one', 'second', 'sparta', 'the', 'third', 'this', 'very']

[[0 1 1 1 2 0 0 0 1 0 2 1]
 [0 2 0 0 1 0 1 0 1 0 1 0]
 [1 0 0 0 1 1 0 0 1 1 1 0]
 [0 1 1 0 1 0 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 1 0 0 1 0]]


---

### TF-IDF
#### Term Frequency - Inverse Document Frequency

In [18]:
# Example corpus for the TF-IDF section, printed one document per line.
corpus = [
    'This is the first document. This is very important.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    'This is Sparta'
]
print(*corpus, sep="\n")

This is the first document. This is very important.
This document is the second document.
And this is the third one.
Is this the first document?
This is Sparta


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Example corpus to be converted into TF-IDF vectors.
corpus = [
    'This is the first document. This is very important.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    'This is Sparta'
]
# Fit the TF-IDF vectorizer on the corpus and transform it in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed vocabulary a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())
print()
# Dense view of the sparse TF-IDF matrix, printed alongside the vocabulary above.
print(X.toarray())

['and', 'document', 'first', 'important', 'is', 'one', 'second', 'sparta', 'the', 'third', 'this', 'very']

[[0.         0.29275244 0.35267539 0.43713206 0.41659154 0.
  0.         0.         0.24627258 0.         0.41659154 0.43713206]
 [0.         0.70933829 0.         0.         0.25235002 0.
  0.52958485 0.         0.29835887 0.         0.25235002 0.        ]
 [0.51492278 0.         0.         0.         0.24536346 0.51492278
  0.         0.         0.29009851 0.51492278 0.24536346 0.        ]
 [0.         0.48961805 0.58983706 0.         0.34836727 0.
  0.         0.         0.41188214 0.         0.34836727 0.        ]
 [0.         0.         0.         0.         0.39515588 0.
  0.         0.829279   0.         0.         0.39515588 0.        ]]


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the vectors of the first and second documents,
# flattened from the returned 2-D result into a 1-D array before printing.
pairwise_scores = cosine_similarity(X[0], X[1])
cos_sim = pairwise_scores.flatten()
print("Similarity between first and second documents: {}".format(cos_sim))

Similarity between first and second documents: [0.49139188]
