# Tokenization

In [19]:
# Walk a short sample corpus through NLTK's basic text pipeline:
# sentence splitting, word tokenization, POS tagging and named-entity chunking.
from nltk import ne_chunk, pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize

sample_corpus = """
Natural Language Processing (NLP) is a fascinating field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a valuable way. What a great technology!
"""

# Step 1 - sentences. The corpus literal begins with a newline, and that
# newline stays attached to the first sentence returned by sent_tokenize.
sentences = sent_tokenize(sample_corpus)
print("Sentence Tokenization:", sentences, sep="\n")

# Step 2 - word-level tokens (punctuation marks come back as separate tokens).
words = word_tokenize(sample_corpus)
print("\nWord Tokenization:", words, sep="\n")

# Step 3 - attach a part-of-speech tag to every token.
pos_tags = pos_tag(words)
print("\nPOS Tagging:", pos_tags, sep="\n")

# Step 4 - group the tagged tokens into a tree of named-entity chunks.
# Note that ne_chunk performs named-entity chunking, not a full syntactic parse.
tree = ne_chunk(pos_tags)
print("\nTree Tokenization (Named Entity Chunking):", tree, sep="\n")

Sentence Tokenization:
['\nNatural Language Processing (NLP) is a fascinating field of artificial intelligence that focuses on the interaction between computers and humans through natural language.', 'The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a valuable way.', 'What a great technology!']

Word Tokenization:
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.', 'The', 'ultimate', 'objective', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', 'in', 'a', 'valuable', 'way', '.', 'What', 'a', 'great', 'technology', '!']

POS Tagging:
[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ...]  (output truncated)

# Stemming

In [21]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, RegexpStemmer

# Inflected and derived forms used to compare the four stemmers side by side.
words = ["running", "runs", "ran", "easily", "fairly", "happiness"]

# Porter algorithm
porter = PorterStemmer()
porter_stems = list(map(porter.stem, words))
print("Porter Stemmer:", porter_stems)

# Lancaster algorithm
lancaster = LancasterStemmer()
lancaster_stems = list(map(lancaster.stem, words))
print("Lancaster Stemmer:", lancaster_stems)

# Snowball algorithm, configured for English
snowball = SnowballStemmer("english")
snowball_stems = list(map(snowball.stem, words))
print("Snowball Stemmer:", snowball_stems)

# Regular-expression stemmer: removes whichever of the listed suffixes
# matches at the end of the word (e.g. "running" -> "runn").
suffix_pattern = '(ing|ed|ly|ness|s)$'
stemmer = RegexpStemmer(suffix_pattern)
regex_stems = list(map(stemmer.stem, words))
print("RegexpStemmer Output:", regex_stems)

Porter Stemmer: ['run', 'run', 'ran', 'easili', 'fairli', 'happi']
Lancaster Stemmer: ['run', 'run', 'ran', 'easy', 'fair', 'happy']
Snowball Stemmer: ['run', 'run', 'ran', 'easili', 'fair', 'happi']
RegexpStemmer Output: ['runn', 'run', 'ran', 'easi', 'fair', 'happi']


# Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

# WordNet-backed lemmatizer shared by the rest of this cell
lemmatizer = WordNetLemmatizer()

# Words to lemmatize: a mix of verb forms, a comparative adjective and plurals
words = [
    "running",
    "better",
    "cats",
    "geese",
    "was",
    "studies",
]

# Translate a Treebank-style POS tag into the POS constant WordNet expects
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching ``treebank_tag``.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``R`` -> adverb. Every other tag, including the ``N``
    noun tags, maps to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Nouns and anything unrecognised share the noun default.
    return wordnet.NOUN

# Tag the word list. NOTE: the words are tagged as a bare list with no
# sentence around them, so some tags can be off - in the recorded run below
# 'geese' is tagged VBP and therefore comes back unchanged.
pos_tags = pos_tag(words)

# Convert each tag to its WordNet equivalent, then lemmatize word by word.
wordnet_tags = [get_wordnet_pos(tag) for _, tag in pos_tags]
lemmatized_words = [
    lemmatizer.lemmatize(token, pos=wn_tag)
    for (token, _), wn_tag in zip(pos_tags, wordnet_tags)
]

print("Original Words: ", words)
print("POS Tags: ", pos_tags)
print("Lemmatized Words: ", lemmatized_words)

Original Words:  ['running', 'better', 'cats', 'geese', 'was', 'studies']
POS Tags:  [('running', 'VBG'), ('better', 'JJR'), ('cats', 'NNS'), ('geese', 'VBP'), ('was', 'VBD'), ('studies', 'NNS')]
Lemmatized Words:  ['run', 'good', 'cat', 'geese', 'be', 'study']


# Stopwords

In [23]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sentence used to demonstrate stopword filtering
text = "This is a simple example showing how to remove stopwords from a sentence using NLTK."

# Break the sentence into tokens; the final '.' becomes its own token
words = word_tokenize(text)

# English stopword list, held in a set so each membership test is cheap
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercase form is not a stopword. Comparing in
# lowercase lets capitalised stopwords such as "This" be dropped as well;
# surviving tokens keep their original casing.
filtered_words = [
    token
    for token in words
    if token.lower() not in stop_words
]

print("Original Words:", words)
print("After Stopword Removal:", filtered_words)

Original Words: ['This', 'is', 'a', 'simple', 'example', 'showing', 'how', 'to', 'remove', 'stopwords', 'from', 'a', 'sentence', 'using', 'NLTK', '.']
After Stopword Removal: ['simple', 'example', 'showing', 'remove', 'stopwords', 'sentence', 'using', 'NLTK', '.']
