In [None]:

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')


## 1. Tokenization – Sentence and Word

In [None]:

from nltk.tokenize import sent_tokenize, word_tokenize

text = "Natural Language Processing is amazing. It powers AI applications."
print("Sentences:", sent_tokenize(text))
print("Words:", word_tokenize(text))


## 2. Stemming – Porter Stemmer

In [None]:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["connected", "connecting", "connection"]
print([stemmer.stem(w) for w in words])


## 3. Lemmatization – With POS

In [None]:

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

sentence = "The leaves are falling off the tree and flying around."
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
print(lemmas)


## 4. Stopword Removal – English

In [None]:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
text = "This is a demonstration of how to remove stopwords."
filtered = [word for word in word_tokenize(text) if word.lower() not in stop_words]
print(filtered)


## 5. Part-of-Speech (POS) Tagging

In [None]:

from nltk import pos_tag

sentence = "John is writing a book about artificial intelligence."
tokens = word_tokenize(sentence)
print(pos_tag(tokens))


## 6. Extract Named Entities as Tuples

In [1]:

from nltk.tree import Tree

named_entities = []
for subtree in entities:
    if isinstance(subtree, Tree):
        entity_name = " ".join([token for token, pos in subtree.leaves()])
        entity_type = subtree.label()
        named_entities.append((entity_name, entity_type))

print(named_entities)


NameError: name 'entities' is not defined

### Summary Table

| Step                 | Function               | Method                            | Sample Output                    |
| -------------------- | ---------------------- | --------------------------------- | -------------------------------- |
| **Tokenization**     | Sentence/word split    | `sent_tokenize`, `word_tokenize`  | `["Hello", "world"]`             |
| **Stemming**         | Word root via rules    | `PorterStemmer().stem()`          | `"connected"` → `"connect"`      |
| **Lemmatization**    | Base form via WordNet  | `WordNetLemmatizer().lemmatize()` | `"running"` → `"run"` (with POS) |
| **Stopword Removal** | Remove low-value words | `word not in stopwords.words()`   | `"This is NLP"` → `"['NLP']"`    |
| **POS Tagging**      | Grammatical label      | `pos_tag()`                       | `("running", "VBG")`             |
