**Remove punctuation and stop words from a paragraph using nltk.**

In [2]:
!pip install nltk



In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Sample paragraph
paragraph = "This is a sample paragraph. It contains punctuation, and stop words like 'is', 'a', and 'the'."

# Tokenize the paragraph into words
words = word_tokenize(paragraph)

# Convert to lowercase
words = [word.lower() for word in words]

# Remove punctuation
words = [word for word in words if word not in string.punctuation]

# Remove stop words
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

# Output result
print(filtered_words)

['sample', 'paragraph', 'contains', 'punctuation', 'stop', 'words', 'like', "'is", "'the"]


**Perform stemming and lemmatization on user-input text.**

In [5]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # For lemmatizer
nltk.download('averaged_perceptron_tagger')  # Optional: for better lemmatization

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [6]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to map NLTK POS tags to WordNet tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

# Get user input
text = input("Enter a sentence: ")

# Tokenize
tokens = word_tokenize(text)

# POS tagging
pos_tags = nltk.pos_tag(tokens)

# Perform stemming and lemmatization
print("\nWord\tStem\tLemma")
print("-" * 30)
for word, tag in pos_tags:
    stem = stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag))
    print(f"{word}\t{stem}\t{lemma}")

Enter a sentence: This is a Dr SAket class

Word	Stem	Lemma
------------------------------
This	thi	This
is	is	be
a	a	a
Dr	dr	Dr
SAket	saket	SAket
class	class	class


**Apply POS (Part of Speech) tagging using NLTK on a given sentence.**

**Common POS Tags (Short Guide)**  
Tag         Meaning    
NN  	Noun, singular  
NNS 	Noun, plural  
VB  	Verb, base form  
VBZ	    Verb, 3rd person  
VBD	    Verb, past tense  
JJ	    Adjective  
RB	    Adverb  
DT	    Determiner  
IN	    Preposition  
PRP	    Personal pronoun  

In [7]:
import nltk
from nltk.tokenize import word_tokenize

# Download necessary resources (only once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Input sentence
sentence = input("Enter a sentence: ")

# Tokenize the sentence
tokens = word_tokenize(sentence)

# Perform POS tagging
pos_tags = nltk.pos_tag(tokens)

# Output
print("\nPOS Tags:")
for word, tag in pos_tags:
    print(f"{word} --> {tag}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Enter a sentence: This is Artificial INtelligence Lab

POS Tags:
This --> DT
is --> VBZ
Artificial --> JJ
INtelligence --> NNP
Lab --> NNP


**Build a simple text classifier using NLTK e.g. classify messages as spam / ham**

In [1]:
data = [
    ("Free money!!!", "spam"),
    ("Hi, how are you?", "ham"),
    ("Win a free iPhone now", "spam"),
    ("Are we still meeting tomorrow?", "ham"),
    ("Limited offer! Call now!", "spam"),
    ("Don't forget our lunch meeting", "ham"),
]

In [2]:
from nltk.tokenize import word_tokenize
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy


# Create feature extractor
def extract_features(text):
    words = word_tokenize(text.lower())
    return {word: True for word in words}

# Apply feature extractor
feature_set = [(extract_features(text), label) for (text, label) in data]

# Split into train and test
train_set = feature_set[:4]
test_set = feature_set[4:]

# Train classifier
classifier = NaiveBayesClassifier.train(train_set)

print("Accuracy:", accuracy(classifier, test_set))
classifier.show_most_informative_features()


msg = "Congratulations! You have won a free ticket"
features = extract_features(msg)
print("Prediction:", classifier.classify(features))

Accuracy: 1.0
Most Informative Features
                       ! = None              ham : spam   =      1.7 : 1.0
                       , = None             spam : ham    =      1.7 : 1.0
                       a = None              ham : spam   =      1.7 : 1.0
                      hi = None             spam : ham    =      1.7 : 1.0
                     how = None             spam : ham    =      1.7 : 1.0
                  iphone = None              ham : spam   =      1.7 : 1.0
                 meeting = None             spam : ham    =      1.7 : 1.0
                   money = None              ham : spam   =      1.7 : 1.0
                     now = None              ham : spam   =      1.7 : 1.0
                   still = None             spam : ham    =      1.7 : 1.0
Prediction: spam
