<a href="https://colab.research.google.com/github/PythonToGo/nlp_movie_review/blob/main/NLP_test_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Processing

In [1]:
# Package Install
# !pip install nltk
# !pip install kss
# !pip install konlpy



## 1-1 Tokoenization
Tokenization : separate sentences as Token(Token is smallest meaningful unit)

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')      # Download necessary data
print()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Separating Paragraph
text = "The sun sets behind the mountains, painting the sky in shades of orange and pink. A gentle breeze whispers through the leaves, bringing a sense of calm to the bustling city. Under the starlit sky, the world seems to pause, reflecting on the day's events and dreaming of tomorrow."

print("paragraph: ", text)
print("<Sentences>")

n=0
for sent_i in sent_tokenize(text):
  print("{0}th sentence: {1}".format(n, sent_i))
  n += 1


paragraph:  The sun sets behind the mountains, painting the sky in shades of orange and pink. A gentle breeze whispers through the leaves, bringing a sense of calm to the bustling city. Under the starlit sky, the world seems to pause, reflecting on the day's events and dreaming of tomorrow.
<Sentences>
0th sentence: The sun sets behind the mountains, painting the sky in shades of orange and pink.
1th sentence: A gentle breeze whispers through the leaves, bringing a sense of calm to the bustling city.
2th sentence: Under the starlit sky, the world seems to pause, reflecting on the day's events and dreaming of tomorrow.


In [4]:
# Separating Words
sentence = "The sun sets behind the mountains, painting the sky in shades of orange and pink."
print("sentence : ", sentence)
print("words : ", word_tokenize(sentence))


sentence :  The sun sets behind the mountains, painting the sky in shades of orange and pink.
words :  ['The', 'sun', 'sets', 'behind', 'the', 'mountains', ',', 'painting', 'the', 'sky', 'in', 'shades', 'of', 'orange', 'and', 'pink', '.']


## 1-2 KSS (Korean Sentence Splitter)

In [5]:
import kss

text = "산 너머로 해가 지면서 하늘은 오렌지와 핑크색으로 물들어 간다. 나뭇잎 사이로 부는 산들바람이 분주한 도시에 평온함을 가져다준다. 별이 빛나는 밤하늘 아래에서 세상은 잠시 멈추어 오늘의 일을 되돌아보고 내일을 꿈꾼다."
print("Paragraph: ", text)
print("Sentence: ", kss.split_sentences(text))    # Split as sencentences
print("Sentence: ", kss.split_morphemes(text))    # Split as morphemes

For your information, Kss also supports mecab backend.
We recommend you to install mecab or konlpy.tag.Mecab for faster execution of Kss.
Please refer to following web sites for details:
- mecab: https://github.com/hyunwoongko/python-mecab-kor
- konlpy.tag.Mecab: https://konlpy.org/en/latest/api/konlpy.tag/#mecab-class



Paragraph:  산 너머로 해가 지면서 하늘은 오렌지와 핑크색으로 물들어 간다. 나뭇잎 사이로 부는 산들바람이 분주한 도시에 평온함을 가져다준다. 별이 빛나는 밤하늘 아래에서 세상은 잠시 멈추어 오늘의 일을 되돌아보고 내일을 꿈꾼다.
Sentence:  ['산 너머로 해가 지면서 하늘은 오렌지와 핑크색으로 물들어 간다.', '나뭇잎 사이로 부는 산들바람이 분주한 도시에 평온함을 가져다준다.', '별이 빛나는 밤하늘 아래에서 세상은 잠시 멈추어 오늘의 일을 되돌아보고 내일을 꿈꾼다.']
Sentence:  [('산', 'NNG'), ('너머', 'NNG'), ('로', 'JKB'), ('해', 'NNG'), ('가', 'JKS'), ('지', 'VV'), ('면서', 'EC'), ('하늘', 'NNG'), ('은', 'JX'), ('오렌지', 'NNG'), ('와', 'JC'), ('핑크', 'NNG'), ('색', 'NNG'), ('으로', 'JKB'), ('물들', 'VV'), ('어', 'EC'), ('간다', 'VX+EF'), ('.', 'SF'), ('나뭇잎', 'NNG'), ('사이', 'NNG'), ('로', 'JKB'), ('부', 'VV'), ('는', 'ETM'), ('산들바람', 'NNG'), ('이', 'JKS'), ('분주', 'NNG'), ('한', 'XSA+ETM'), ('도시', 'NNG'), ('에', 'JKB'), ('평온', 'NNG'), ('함', 'XSA+ETN'), ('을', 'JKO'), ('가져다준다', 'VV+EF'), ('.', 'SF'), ('별', 'NNG'), ('이', 'JKS'), ('빛나', 'VV'), ('는', 'ETM'), ('밤하늘', 'NNG'), ('아래', 'NNG'), ('에서', 'JKB'), ('세상', 'NNG'), ('은', 'JX'), ('잠시', 'MAG'), ('멈추', 'VV'), ('어', 'EC'), ('오늘', 'NNG'), ('의', 'JKG'), ('일', 'NN

## 1-3. POS Tagging
POS Tagging: The task of classifying what part of speech each word is

In [6]:
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
print()

text = "The sun sets behind the mountains, painting the sky in shades of orange and pink. A gentle breeze whispers through the leaves, bringing a sense of calm to the bustling city. Under the starlit sky, the world seems to pause, reflecting on the day's events and dreaming of tomorrow."
tokenized_sentence = word_tokenize(text)

print("Sentence: ", text)
print("Words: ", tokenized_sentence)
print("Parts: ", pos_tag(tokenized_sentence))


Sentence:  The sun sets behind the mountains, painting the sky in shades of orange and pink. A gentle breeze whispers through the leaves, bringing a sense of calm to the bustling city. Under the starlit sky, the world seems to pause, reflecting on the day's events and dreaming of tomorrow.
Words:  ['The', 'sun', 'sets', 'behind', 'the', 'mountains', ',', 'painting', 'the', 'sky', 'in', 'shades', 'of', 'orange', 'and', 'pink', '.', 'A', 'gentle', 'breeze', 'whispers', 'through', 'the', 'leaves', ',', 'bringing', 'a', 'sense', 'of', 'calm', 'to', 'the', 'bustling', 'city', '.', 'Under', 'the', 'starlit', 'sky', ',', 'the', 'world', 'seems', 'to', 'pause', ',', 'reflecting', 'on', 'the', 'day', "'s", 'events', 'and', 'dreaming', 'of', 'tomorrow', '.']
Morphemes:  [('The', 'DT'), ('sun', 'NN'), ('sets', 'NNS'), ('behind', 'IN'), ('the', 'DT'), ('mountains', 'NNS'), (',', ','), ('painting', 'VBG'), ('the', 'DT'), ('sky', 'NN'), ('in', 'IN'), ('shades', 'NNS'), ('of', 'IN'), ('orange', 'NN'

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

okt = Okt()
kkma = Kkma()
kor_text = "산 너머로 해가 지면서 하늘은 오렌지와 핑크색으로 물들어 간다."
print()

print("Sentence: ", kor_text)
print()
print("Analyse Morpheme with okt: ", okt.morphs(kor_text))
print("Parts tagging with okt: ", okt.pos(kor_text))
print("Extract Noun with okt: ", okt.nouns(kor_text))
print()

print("Analyse Morpheme with kkma: ", kkma.morphs(kor_text))
print("Parts tagging with kkma: ", kkma.pos(kor_text))
print("Extract Noun with kkma: ", kkma.nouns(kor_text))
print("")


Sentence:  산 너머로 해가 지면서 하늘은 오렌지와 핑크색으로 물들어 간다.

Analyse Morpheme with okt:  ['산', '너머', '로', '해', '가', '지면', '서', '하늘', '은', '오렌지', '와', '핑크색', '으로', '물들어', '간다', '.']
Parts tagging with okt:  [('산', 'Noun'), ('너머', 'Noun'), ('로', 'Josa'), ('해', 'Noun'), ('가', 'Josa'), ('지면', 'Noun'), ('서', 'Josa'), ('하늘', 'Noun'), ('은', 'Josa'), ('오렌지', 'Noun'), ('와', 'Josa'), ('핑크색', 'Noun'), ('으로', 'Josa'), ('물들어', 'Verb'), ('간다', 'Noun'), ('.', 'Punctuation')]
Extract Noun with okt:  ['산', '너머', '해', '지면', '하늘', '오렌지', '핑크색', '간다']

Analyse Morpheme with kkma:  ['산', '너머', '로', '해', '가', '지', '면서', '하늘', '은', '오렌지', '와', '핑크색', '으로', '물들', '어', '갈', 'ㄴ다', '.']
Parts tagging with kkma:  [('산', 'NNG'), ('너머', 'NNG'), ('로', 'JKM'), ('해', 'NNG'), ('가', 'JKS'), ('지', 'VV'), ('면서', 'ECE'), ('하늘', 'NNG'), ('은', 'JX'), ('오렌지', 'NNG'), ('와', 'JKM'), ('핑크색', 'NNG'), ('으로', 'JKM'), ('물들', 'VV'), ('어', 'ECD'), ('갈', 'VV'), ('ㄴ다', 'EFN'), ('.', 'SF')]
Extract Noun with kkma:  ['산', '너머', '해', '해가지', '가지', '하늘'

## 1-4 Lemmatization and Stemming
Lemmatization: The base dictionary form of a word (the root of the word)


In [24]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

words = ['has','running','walked', 'easily','caring','flies','happier']

# Extract Lemmatization
print("Words: ", words)
print("Lemmatization : " , [lemmatizer.lemmatize(word) for word in words])
print()

# Words:  ['has', 'running', 'walked', 'easily', 'caring', 'flies', 'happier']
# Lemmatization :  ['ha', 'running', 'walked', 'easily', 'caring', 'fly', 'happier']

# Add Information of Parts
print("Information for 'has': ", lemmatizer.lemmatize('has', 'v'))
print("Information for 'running': ", lemmatizer.lemmatize('running', 'v'))
print("Information for 'walked': ", lemmatizer.lemmatize('walked', 'v'))
print("Information for 'easily': ", lemmatizer.lemmatize('easily', 'r'))
print("Information for 'caring': ", lemmatizer.lemmatize('caring', 'v'))
print("Information for 'flies': ", lemmatizer.lemmatize('flies', 'v'))
print("Information for 'happier': ", lemmatizer.lemmatize('happier', 'a'))

Words:  ['has', 'running', 'walked', 'easily', 'caring', 'flies', 'happier']
Lemmatization :  ['ha', 'running', 'walked', 'easily', 'caring', 'fly', 'happier']

Information for 'has':  have
Information for 'running':  run
Information for 'walked':  walk


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Common POS Tags
Part of Speech (POS) tags are used to classify words into their grammatical categories, which helps in understanding their roles within sentences. The following are some common POS tags along with their meanings:

#### 1. Noun (N): A person, place, thing, or idea

- NN: Singular noun
- NNS: Plural nouns
- NNP: Proper noun, singular
- NNPS: Proper noun, plural

#### 2. Pronoun (PRP): A word that takes the place of a noun

- PRP: Personal pronoun
- PRP$: Possessive pronoun

#### 3. Verb (V): Expresses action or being
- VB: Base form
- VBD: Past tense
- VBG: Gerund or present participle
- VBN: Past participle
- VBP: Present tense, non-3rd person singular
- VBZ: Present tense, 3rd person singular

#### 4. Adjective (A): Describes a noun
- JJ: Adjective
- JJR: Comparative adjective
- JJS: Superlative adjective

#### 5. Adverb (R): Modifies a verb, an adjective, or another adverb.
- RB: Adverb
- RBR: Comparative adverb
- RBS: Superlative adverb

#### 6. Preposition (IN): Shows the relationship of a noun or pronoun to another word.
#### 7. Conjunction (CC): Connects words, phrases, or clauses.
#### 8. Determiner (DT): Introduces a noun.
#### 9. Interjection (UH): Expresses emotion.
#### 10. Modal (MD): Expresses necessity or possibility.


#### Stem: The core part of a word that carries its meaning.

In [31]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()

sentence = "The sun sets behind the mountains, painting the sky in shades of orange and pink."
tokenized_sentence = word_tokenize(sentence)[ :16]

# Extracting Stem
print("Words: ", tokenized_sentence)
print("Extract with Porter: ", [porter.stem(word) for word in tokenized_sentence])
print("Extract with Lancaster: ", [lancaster.stem(word) for word in tokenized_sentence])


Words:  ['The', 'sun', 'sets', 'behind', 'the', 'mountains', ',', 'painting', 'the', 'sky', 'in', 'shades', 'of', 'orange', 'and', 'pink']
Extract with Porter:  ['the', 'sun', 'set', 'behind', 'the', 'mountain', ',', 'paint', 'the', 'sky', 'in', 'shade', 'of', 'orang', 'and', 'pink']
Extract with Lancaster:  ['the', 'sun', 'set', 'behind', 'the', 'mountain', ',', 'paint', 'the', 'sky', 'in', 'shad', 'of', 'orang', 'and', 'pink']


### 1-5 Stop word
A word that carries little meaning.

In [35]:
from nltk.corpus import stopwords
nltk.download('stopwords')
print()

stop_words_list = stopwords.words('english')
print("the number of nltk stopwords: ", len(stop_words_list))
print("Example of stopwords: ", stop_words_list[:5])


the number of nltk stopwords:  179
Example of stopwords:  ['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# Setting manuelly stopwords
example = "이 문장에서 불용어를 제외하면 무엇이 남을까요?"
stop_words = "이 에서 를 하면"

# Remove stop_words
stop_words = stop_words.split(' ')
word_tokens = okt.morphs(example)

result = [word for word in word_tokens if not word in stop_words]

print("before: ", word_tokens)
print("after: ", result)

before:  ['이', '문장', '에서', '불', '용어', '를', '제외', '하면', '무엇', '이', '남을까요', '?']
after:  ['문장', '불', '용어', '제외', '무엇', '남을까요', '?']


### 1-6. Regular Expressions

In [40]:
import re

# a random character
r_1 = re.compile("a.c")
print("abc: ", r_1.search("abc"))     # any only one character(b) between a and c, it should be ok
print("azc: ", r_1.search("azc"))     # any only one character(z) between a and c, it should be ok
print("abbc: ", r_1.search("abbc"))   # any two character between a and c, it should be None
print("acd: ", r_1.search("acd"))     # others, it should be None

abc:  <re.Match object; span=(0, 3), match='abc'>
azc:  <re.Match object; span=(0, 3), match='azc'>
abbc:  None
acd:  None


In [42]:
# If character exists or not
r_2 = re.compile("ab?c")
print("abc: ", r_2.search("abc"))     # any only one character(b) between a and c , it should be ok
print("ac: ", r_2.search("ac"))       # no character exists, it should be ok
print("ab: ", r_2.search("ab"))       # others, it should be None

abc:  <re.Match object; span=(0, 3), match='abc'>
ac:  <re.Match object; span=(0, 2), match='ac'>
ab:  None


In [44]:
# If more than 0 character exists
r_3 = re.compile("ab*c")
print("ac: ", r_3.search("ac"))       # no character exists, it should be ok
print("abc: ", r_3.search("abc"))     # any only one character(b) between a and c , it should be ok
print("ab: ", r_3.search("ab"))       # others, it should be None

ac:  <re.Match object; span=(0, 2), match='ac'>
abc:  <re.Match object; span=(0, 3), match='abc'>
ab:  None


In [47]:
# If more than 1 character exists
r_4 = re.compile("ab+c")
print("ac: ", r_4.search("ac"))       # no character exists, it should be None
print("abc: ", r_4.search("abc"))     # any only one character(b) between a and c , it should be ok
print("ab: ", r_4.search("ab"))       # others, it should be None

ac:  None
abc:  <re.Match object; span=(0, 3), match='abc'>
ab:  None


In [48]:
# Start with a specific string
r_5 = re.compile("^a")
print("a: ", r_5.search("a"))         # start with 'a', it should be ok
print("abc: ", r_5.search("abc"))     # start with 'a', it should be ok
print("ba: ", r_5.search("ba"))       # not start with 'a'. it should be None

a:  <re.Match object; span=(0, 1), match='a'>
abc:  <re.Match object; span=(0, 1), match='a'>
ba:  None


In [49]:
# Repeat a certain number of times
r_6 = re.compile("ab{2}c")
print("a: ", r_6.search("abc"))         # it should be None
print("abbc: ", r_6.search("abbc"))     # repeat b twice, it should be ok
print("ba: ", r_6.search("babbc"))      # it should be None

a:  None
abbc:  <re.Match object; span=(0, 4), match='abbc'>
ba:  <re.Match object; span=(1, 5), match='abbc'>


In [52]:
# Repeat a certain range of times
r_7 = re.compile("ab{2,3}c")
print("abc: ", r_7.search("abc"))         # only one 'b', it should be None
print("abbc: ", r_7.search("abbc"))     # repeat b twice, it should be ok
print("abbbc: ", r_7.search("abbbc"))   # repeat b three times, it should be ok
print("abbbbc: ", r_7.search("abbbbc"))   # repeat b four times, it should be None
print()

r_7_a = re.compile("ab{2,}c")
print("abc: ", r_7_a.search("abc"))         # only one 'b', it should be None
print("abbc: ", r_7_a.search("abbc"))     # repeat b twice, it should be ok
print("abbbc: ", r_7_a.search("abbbc"))   # repeat b three times, it should be ok
print("abbbbc: ", r_7_a.search("abbbbc"))   # repeat b four times, it should be ok
print()

r_7_b = re.compile("ab{,2}c")
print("abc: ", r_7_b.search("abc"))         # only one 'b', it should be ok
print("abbc: ", r_7_b.search("abbc"))     # repeat b twice, it should be ok
print("abbbc: ", r_7_b.search("abbbc"))   # repeat b three times, it should be None
print("abbbbc: ", r_7_b.search("abbbbc"))   # repeat b four times, it should be None

a:  None
abbc:  <re.Match object; span=(0, 4), match='abbc'>
abbbc:  <re.Match object; span=(0, 5), match='abbbc'>
abbbbc:  None

a:  None
abbc:  <re.Match object; span=(0, 4), match='abbc'>
abbbc:  <re.Match object; span=(0, 5), match='abbbc'>
abbbbc:  <re.Match object; span=(0, 6), match='abbbbc'>

a:  <re.Match object; span=(0, 3), match='abc'>
abbc:  <re.Match object; span=(0, 4), match='abbc'>
abbbc:  None
abbbbc:  None


In [56]:
# Include a specific string
r_8 = re.compile("[abc]")   # a or b or c
print("aim: ", r_8.search("aim"))         # there exists 'a', it should be ok
print("cat: ", r_8.search("cat"))         # it has 'a' and 'c', it should be ok
print("banana: ", r_8.search("banana"))   # it has 'b' and 'a', it should be ok
print("run: ", r_8.search("run"))         # no 'a', 'b' or 'c', it should be None
print()

# Include specific strings with a certain ranges
r_8_a = re.compile("[a-y]")   # a to y
print("aim: ", r_8_a.search("aim"))         # it should be ok
print("z: ", r_8_a.search("z"))             # it has only 'z', it should be None
print("yz: ", r_8_a.search("yz"))           # it has 'y', it should be ok
print("!: ", r_8_a.search("!"))             # it has no character, it should be None
print("run!: ", r_8_a.search("run!"))       # it has character, it should be ok

aim:  <re.Match object; span=(0, 1), match='a'>
cat:  <re.Match object; span=(0, 1), match='c'>
banana:  <re.Match object; span=(0, 1), match='b'>
run:  None

aim:  <re.Match object; span=(0, 1), match='a'>
z:  None
yz:  <re.Match object; span=(0, 1), match='y'>
!:  None
run!:  <re.Match object; span=(0, 1), match='r'>


In [60]:
# Exclude specific strings
r_9 = re.compile("[^abc]")   # exclude a, b and c
print("abc: ", r_9.search("abc"))         # there exists only 'a', 'b' and 'c', it should be None
print("ca: ", r_9.search("ca"))           # it has 'a' and 'c', it should be None
print("banana: ", r_9.search("banana"))   # it has 'n', it should be ok
print("run: ", r_9.search("run"))         # no 'a', 'b' or 'c', it should be ok
print()

# Exclude specific strings with a certain range
r_9_a = re.compile("[^a-y]")   # exclude a to y
print("abc: ", r_9_a.search("abc"))         # there exists only 'a', 'b' and 'c', it should be None
print("y: ", r_9_a.search("y"))             # it has 'a' and 'y', it should be None
print("z: ", r_9_a.search("z"))             # it has no 'a' to 'y', it should be ok
print("yz: ", r_9_a.search("yz"))           # it has 'z', it should be ok

abc:  None
ca:  None
banana:  <re.Match object; span=(2, 3), match='n'>
run:  <re.Match object; span=(0, 1), match='r'>



In [63]:
# Include one of a specific string
r_10 = re.compile("[a|ox]")   # Include a or ox
print("a: ", r_10.search("a"))              # there exists only 'a', it should be ok
print("ox: ", r_10.search("ox"))            # there exsits only 'ox', it should be ok
print("aox: ", r_10.search("aox"))          # it should be ok
print("oxygen: ", r_10.search("oxygen"))    # it has 'ox', it should be ok
print("run: ", r_10.search("run"))          # it has neither 'a' nor 'ox', it should be None

a:  <re.Match object; span=(0, 1), match='a'>
ox:  <re.Match object; span=(0, 1), match='o'>
aox:  <re.Match object; span=(0, 1), match='a'>
oxygen:  <re.Match object; span=(0, 1), match='o'>
run:  None
