In [1]:
# Import các thư viện cần thiết
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [23]:
# --- Tải các tài nguyên cần thiết của NLTK (chỉ cần chạy một lần) ---
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt') # Cần cho tokenizer, có thể đã được tải trước đó
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tienc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tienc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tienc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\tienc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

# 1. Ví dụ về Stemming với Porter Stemmer ---


In [2]:
print("--- Stemming Examples ---")
porter = PorterStemmer()

--- Stemming Examples ---


In [3]:
words_to_stem = ["walking", "walked", "walks", "ran", "running", "bosses", "replacement", "unnecessary", "berry", "mice", "was", "is", "better"]

for word in words_to_stem:
    print(f"Stemming '{word}': {porter.stem(word)}")

print("\nStemming a sentence:")
sentence_for_stemming = "Lemmatization is more sophisticated than stemming."
tokens_for_stemming = sentence_for_stemming.split() # Chia câu thành các từ đơn giản

stemmed_sentence = [porter.stem(token) for token in tokens_for_stemming]
print(" ".join(stemmed_sentence))

Stemming 'walking': walk
Stemming 'walked': walk
Stemming 'walks': walk
Stemming 'ran': ran
Stemming 'running': run
Stemming 'bosses': boss
Stemming 'replacement': replac
Stemming 'unnecessary': unnecessari
Stemming 'berry': berri
Stemming 'mice': mice
Stemming 'was': wa
Stemming 'is': is
Stemming 'better': better

Stemming a sentence:
lemmat is more sophist than stemming.


# 2. Ví dụ về Lemmatization với WordNetLemmatizer ---

In [6]:
print("\n--- Lemmatization Examples ---")
lemmatizer = WordNetLemmatizer()

# Ví dụ cơ bản không có POS tag (mặc định là danh từ)
print(f"Lemmatizing 'walking' (default): {lemmatizer.lemmatize('walking')}")
print(f"Lemmatizing 'going' (default): {lemmatizer.lemmatize('going')}")
print(f"Lemmatizing 'mice' (default): {lemmatizer.lemmatize('mice')}") # Correctly lemmatizes irregular plural


# Ví dụ với POS tag (động từ)
print(f"Lemmatizing 'walking' (verb): {lemmatizer.lemmatize('walking', pos=wordnet.VERB)}")
print(f"Lemmatizing 'going' (verb): {lemmatizer.lemmatize('going', pos=wordnet.VERB)}")
print(f"Lemmatizing 'ran' (verb): {lemmatizer.lemmatize('ran', pos=wordnet.VERB)}") # Lemmatizes past tense

# Ví dụ so sánh Stemming và Lemmatization cho cùng từ
print(f"\nStemming 'was': {porter.stem('was')}")
print(f"Lemmatizing 'was' (verb): {lemmatizer.lemmatize('was', pos=wordnet.VERB)}") # Lemmatizes 'was' to 'be'

print(f"Stemming 'is': {porter.stem('is')}")
print(f"Lemmatizing 'is' (verb): {lemmatizer.lemmatize('is', pos=wordnet.VERB)}") # Lemmatizes 'is' to 'be'

print(f"Stemming 'better': {porter.stem('better')}")
print(f"Lemmatizing 'better' (adjective): {lemmatizer.lemmatize('better', pos=wordnet.ADJ)}") # Lemmatizes 'better' to 'good'


--- Lemmatization Examples ---
Lemmatizing 'walking' (default): walking
Lemmatizing 'going' (default): going
Lemmatizing 'mice' (default): mouse
Lemmatizing 'walking' (verb): walk
Lemmatizing 'going' (verb): go
Lemmatizing 'ran' (verb): run

Stemming 'was': wa
Lemmatizing 'was' (verb): be
Stemming 'is': is
Lemmatizing 'is' (verb): be
Stemming 'better': better
Lemmatizing 'better' (adjective): good


# 3. Ánh xạ POS tag của NLTK sang WordNet POS tag ---
    - NLTK's pos_tag trả về các tag theo chuẩn Treebank
    - WordNetLemmatizer sử dụng các tag của WordNet (n, v, a, r, s)

In [7]:
def get_wordnet_pos(treebank_tag):
    """
    Ánh xạ Treebank POS tags sang WordNet POS tags
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Mặc định là danh từ nếu không xác định được
        return wordnet.NOUN

# 4. Ví dụ Lemmatization với POS tagging cho cả câu ---

In [24]:
print("\n--- Lemmatization Sentences with POS Tags ---")

sentence1 = "Donald Trump has a devoted following"
# Tokenize và POS tag câu
words_and_tags1 = nltk.pos_tag(nltk.word_tokenize(sentence1))
print(f"\nOriginal words and NLTK POS tags: {words_and_tags1}")

# Lemmatize từng từ sử dụng POS tag đã ánh xạ
lemmatized_sentence1 = [
    lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    for word, tag in words_and_tags1
]
print(f"Lemmatized sentence: {' '.join(lemmatized_sentence1)}")


sentence2 = "The cat was following the bird as it flew by"
# Tokenize và POS tag câu
words_and_tags2 = nltk.pos_tag(nltk.word_tokenize(sentence2))
print(f"\nOriginal words and NLTK POS tags: {words_and_tags2}")

# Lemmatize từng từ sử dụng POS tag đã ánh xạ
lemmatized_sentence2 = [
    lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    for word, tag in words_and_tags2
]
print(f"Lemmatized sentence: {' '.join(lemmatized_sentence2)}")


--- Lemmatization Sentences with POS Tags ---

Original words and NLTK POS tags: [('Donald', 'NNP'), ('Trump', 'NNP'), ('has', 'VBZ'), ('a', 'DT'), ('devoted', 'VBN'), ('following', 'NN')]
Lemmatized sentence: Donald Trump have a devote following

Original words and NLTK POS tags: [('The', 'DT'), ('cat', 'NN'), ('was', 'VBD'), ('following', 'VBG'), ('the', 'DT'), ('bird', 'NN'), ('as', 'IN'), ('it', 'PRP'), ('flew', 'VBD'), ('by', 'IN')]
Lemmatized sentence: The cat be follow the bird a it fly by


In [19]:
print(nltk.__version__)
nltk.download()

3.9.1
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True