<a href="https://colab.research.google.com/github/Satyavyshnavi41/11239A041_DST_Lab/blob/main/NLP_programs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Lemmatization**

In [6]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
word = input("Enter a word: ")

# custom rule
if word == "cls":
    print("Lemmatized form: cl")
else:
    print("Lemmatized form:", lemmatizer.lemmatize(word))

Enter a word: cls
Lemmatized form: cl


# **Tokenization**

In [4]:
import nltk


nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
text = "Hello world, this is a simple tokenization example."
tokens = word_tokenize(text.lower())
tokens = [word for word in tokens if word.isalpha()]
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['hello', 'world', 'this', 'is', 'a', 'simple', 'tokenization', 'example']


# **Stemming**

In [None]:
inltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
text = "Hello world, this is a simple tokenization example."
tokens = word_tokenize(text.lower())
tokens = [word for word in tokens if word.isalpha()]
print(tokens)

# **Morphology**

In [7]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = ['running', 'runs', 'easily', 'fairies']

print("Original words:", words)

lemmas = [lemmatizer.lemmatize(w) for w in words]

print("Lemmatized words:", lemmas)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Original words: ['running', 'runs', 'easily', 'fairies']
Lemmatized words: ['running', 'run', 'easily', 'fairy']


# **Spelling correction**

In [8]:
# Step 1: Install TextBlob (only once in Colab)
!pip install textblob

# Step 2: Import and download data
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 3: Define text with a spelling mistake
text = "I havv a spelling error"

# Step 4: Create TextBlob object
blob = TextBlob(text)

# Step 5: Correct the text
corrected_text = blob.correct()

# Step 6: Print output
print("original:", text)
print("corrected:", corrected_text)

original: I havv a spelling error
corrected: I have a spelling error


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# **Deduction**

In [9]:
# Simple Deduction Program

# Step 1: Define premises and hypothesis
premises = ['rain -> wet', 'rain']
hypothesis = 'wet'

# Step 2: Create a simple logic check
# If we have both "rain -> wet" and "rain", then we can deduce "wet"
def can_deduce(premises, hypothesis):
    if 'rain -> wet' in premises and 'rain' in premises and hypothesis == 'wet':
        return True
    else:
        return False

# Step 3: Run the deduction
result = can_deduce(premises, hypothesis)

# Step 4: Print the results
print("premises:", tuple(premises))
print("hypothesis:", hypothesis)
print("can we deduce it?", result)

premises: ('rain -> wet', 'rain')
hypothesis: wet
can we deduce it? True


# **Normalization**

In [10]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

text = "A!!! HELLO,,, WORLD!! This IS PythOn..."

text = text.lower()
text = re.sub(r'[^a-z\s]', '', text)
text = ' '.join(text.split())

stop_words = set(stopwords.words('english'))
normalized_words = [word for word in text.split() if word not in stop_words]

normalized_text = ' '.join(normalized_words)

print("Normalized text:", normalized_text)


Normalized text: hello world python


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **N-gram (Unigram)**

In [11]:
def unigram(text):
    words = text.split()
    return words

text = "This is a simple example"
unigrams = unigram(text)
print("Unigrams:", unigrams)


Unigrams: ['This', 'is', 'a', 'simple', 'example']


# **Bigram**

In [12]:
def bigram(text):
    words = text.split()
    bigrams = []
    for i in range(len(words) - 1):
        bigrams.append((words[i], words[i+1]))
    return bigrams
text = "This is a simple example"
bigrams = bigram(text)
print("Bigrams:", bigrams)

Bigrams: [('This', 'is'), ('is', 'a'), ('a', 'simple'), ('simple', 'example')]


# **Trigram**

In [13]:
def trigram(text):
    words = text.split()
    trigrams = []
    for i in range(len(words) - 2):
        trigrams.append((words[i], words[i+1], words[i+2]))
    return trigrams
text = "This is a simple example"
trigrams = trigram(text)
print("Trigrams:", trigrams)

Trigrams: [('This', 'is', 'a'), ('is', 'a', 'simple'), ('a', 'simple', 'example')]


# **Smoothing**

In [14]:
from collections import Counter

text = "I love eating ice cream I love chocolate ice cream"
words = text.split()
V = len(set(words))

bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
bigram_counts = Counter(bigrams)
unigram_counts = Counter(words)

def prob(w2, w1):
    return (bigram_counts[(w1, w2)] + 1) / (unigram_counts[w1] + V)

pairs = [('ice', 'cream'), ('I', 'love'), ('love', 'chocolate')]
for w1, w2 in pairs:
    print(f"P({w2}|{w1}) = {prob(w2, w1):.4f}")


P(cream|ice) = 0.3750
P(love|I) = 0.3750
P(chocolate|love) = 0.2500


# **POS Tagging**

In [15]:
import nltk
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

text = "I love learning Natural Language Processing using Python"
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

print("Tokens:", tokens)
print("POS Tags:", tagged)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Tokens: ['I', 'love', 'learning', 'Natural', 'Language', 'Processing', 'using', 'Python']
POS Tags: [('I', 'PRP'), ('love', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('using', 'VBG'), ('Python', 'NNP')]


# **HMM Tagging**

In [16]:
import nltk
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

text = "I love learning Natural Language Processing using Python"
tokens = word_tokenize(text)
tags = pos_tag(tokens)

print("Tokens:", tokens)
print("POS Tags:", tags)

Tokens: ['I', 'love', 'learning', 'Natural', 'Language', 'Processing', 'using', 'Python']
POS Tags: [('I', 'PRP'), ('love', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('using', 'VBG'), ('Python', 'NNP')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


# **Brill POS Tagger**

In [17]:
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm

nltk.download('treebank')
nltk.download('universal_tagset')

train_data = treebank.tagged_sents(tagset='universal')

train_size = int(len(train_data) * 0.9)
train_sents = train_data[:train_size]
test_sents = train_data[train_size:]

trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(train_sents)

text = "I love learning Natural Language Processing".split()
tags = hmm_tagger.tag(text)

print("Sentence:", text)
print("HMM POS Tags:", tags)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
  O[i, k] = self._output_logprob(si, self._symbols[k])


Sentence: ['I', 'love', 'learning', 'Natural', 'Language', 'Processing']
HMM POS Tags: [('I', 'PRON'), ('love', 'NOUN'), ('learning', 'NOUN'), ('Natural', 'NOUN'), ('Language', 'NOUN'), ('Processing', 'NOUN')]


  O[i, k] = self._output_logprob(si, self._symbols[k])
