**Lemmatization**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet resources used below; quiet=True is passed to each download.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Prompt for a single word and lemmatize it. Only the word is passed to
# lemmatize(); no part-of-speech argument is supplied.
lemmatizer = WordNetLemmatizer()
word = input("Enter a word: ")
print("Lemmatized word:", lemmatizer.lemmatize(word))


Enter a word: bank
Lemmatized word: bank


**Normalization**

In [None]:
import re

def normalize_text(text):
    """Return ``text`` lower-cased with whitespace collapsed and the ends trimmed.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string in which every run of whitespace has been
        replaced by a single space and leading/trailing whitespace removed.
    """
    collapsed = re.sub(r'\s+', ' ', text.lower())
    return collapsed.strip()

# Sample sentence for the demo; the normalized form is printed after the label.
text = "Hello World!! This is Python"
normalized = normalize_text(text)
print("Normalized text:", normalized)

Normalized text: hello world!! this is python


**Tokenization**

In [None]:
text = "Hello World This is simple tokenization example"
tokens = str.split(text)  # no separator given: splits on runs of whitespace
print(tokens)

['Hello', 'World', 'This', 'is', 'simple', 'tokenization', 'example']


**Stemming**

In [None]:
import nltk
import warnings
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Install an "ignore" filter so warnings are not shown.
warnings.filterwarnings("ignore")

# Tokenizer resources fetched before word_tokenize is called.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

text = "Cars running faster than the other animals"
words = word_tokenize(text)

# Apply the Porter stemmer to every token, keeping the token order.
ps = PorterStemmer()
stemmed = list(map(ps.stem, words))

print(stemmed)


['car', 'run', 'faster', 'than', 'the', 'other', 'anim']


**Morphology**

In [None]:
import spacy, warnings
# Install an "ignore" filter so warnings are not shown.
warnings.filterwarnings("ignore")

# Load the spaCy pipeline named "en_core_web_sm".
nlp = spacy.load("en_core_web_sm")

text = "The cats are running quickly"
doc = nlp(text)

# One line per token: text → lemma_ | pos_ | morph.
for token in doc:
    fields = (token.text, "→", token.lemma_, "|", token.pos_, "|", token.morph)
    print(*fields)


The → the | DET | Definite=Def|PronType=Art
cats → cat | NOUN | Number=Plur
are → be | AUX | Mood=Ind|Tense=Pres|VerbForm=Fin
running → run | VERB | Aspect=Prog|Tense=Pres|VerbForm=Part
quickly → quickly | ADV | 


**Spelling Correction**

In [None]:

!pip install textblob -q

from textblob import TextBlob

text = "I lik to lern naturall langauge procesing"
blob = TextBlob(text)

print(blob.correct())


I like to learn natural language processing


**Deduction**

In [None]:
import nltk
from nltk.sem import Expression
from nltk.inference import ResolutionProver

# Shorthand for turning a formula string into an Expression.
read_expr = Expression.fromstring

# Knowledge base: one fact about Socrates and one universally quantified rule.
premises = ('man(Socrates)', 'all x (man(x) -> mortal(x))')
kb = [read_expr(formula) for formula in premises]

# Statement the prover is asked to derive from the knowledge base.
goal = read_expr('mortal(Socrates)')

prover = ResolutionProver()
print(prover.prove(goal, kb))


True


**Unigram**

In [None]:
text = "I love natural language processing and I love coding"

words = text.split()

# Start every distinct word at zero (keys keep first-seen order), then tally.
freq = dict.fromkeys(words, 0)
for word in words:
    freq[word] += 1

print(freq)


{'I': 2, 'love': 2, 'natural': 1, 'language': 1, 'processing': 1, 'and': 1, 'coding': 1}


**Bigram**

In [None]:
text = "I love natural language processing and I love coding"

words = text.split()

# Pair each word with its successor; zip stops when the shifted list runs out.
bigrams = list(zip(words, words[1:]))

# Tally each pair; keys stay in first-seen order.
freq = {}
for bigram in bigrams:
    freq.setdefault(bigram, 0)
    freq[bigram] += 1

print(freq)


{('I', 'love'): 2, ('love', 'natural'): 1, ('natural', 'language'): 1, ('language', 'processing'): 1, ('processing', 'and'): 1, ('and', 'I'): 1, ('love', 'coding'): 1}


**Trigram**

In [None]:
text = "I love natural language processing and I love coding"

words = text.split()

# Every window of three consecutive words, in order of appearance.
trigrams = list(zip(words, words[1:], words[2:]))

# Start every distinct triple at zero (first-seen order), then tally.
freq = dict.fromkeys(trigrams, 0)
for trigram in trigrams:
    freq[trigram] += 1

print(freq)


{('I', 'love', 'natural'): 1, ('love', 'natural', 'language'): 1, ('natural', 'language', 'processing'): 1, ('language', 'processing', 'and'): 1, ('processing', 'and', 'I'): 1, ('and', 'I', 'love'): 1, ('I', 'love', 'coding'): 1}


**N-gram Smoothing**

In [None]:
from collections import Counter

text = "I love NLP I love machine learning"
words = text.split()

# Vocabulary size: the number of distinct words in the text.
V = len(set(words))

# Occurrence counts for single words and for adjacent word pairs.
unigrams = Counter(words)
bigrams = Counter(zip(words, words[1:]))

def laplace_prob(w1, w2):
    """Return the add-one (Laplace) smoothed estimate of P(w2 | w1).

    Reads the module-level ``bigrams`` and ``unigrams`` count tables and the
    vocabulary size ``V``.

    Args:
        w1: The conditioning (preceding) word.
        w2: The word whose conditional probability is estimated.

    Returns:
        (count of the (w1, w2) pair + 1) / (count of w1 + V).
    """
    smoothed_pair_count = bigrams[(w1, w2)] + 1
    smoothed_context_count = unigrams[w1] + V
    return smoothed_pair_count / smoothed_context_count

# (label, preceding word, following word) for each probability to report.
queries = [
    ("P(love | I) =", "I", "love"),
    ("P(NLP | love) =", "love", "NLP"),
    ("P(machine | NLP) =", "NLP", "machine"),
    ("P(learning | machine) =", "machine", "learning"),
    ("P(unknown | NLP) =", "NLP", "unknown"),  # unseen word
]
for label, previous_word, next_word in queries:
    print(label, laplace_prob(previous_word, next_word))


P(love | I) = 0.42857142857142855
P(NLP | love) = 0.2857142857142857
P(machine | NLP) = 0.16666666666666666
P(learning | machine) = 0.3333333333333333
P(unknown | NLP) = 0.16666666666666666


**POS Tagging**

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Fetch 'punkt_tab' together with
# 'punkt' (the stemming cell downloads both) so this cell does not depend on an
# earlier cell having downloaded it; the third resource is the tagger model.
for resource in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

text = "I love learning NLP"
words = nltk.word_tokenize(text)

# Print the (token, tag) pairs returned by nltk.pos_tag.
print(nltk.pos_tag(words))


[('I', 'PRP'), ('love', 'VBP'), ('learning', 'VBG'), ('NLP', 'NNP')]


**HMM**

In [None]:
import nltk
from nltk.tag import hmm
import warnings

# Install an "ignore" filter so warnings are not shown.
warnings.filterwarnings("ignore")

# Two tiny hand-tagged sentences; both follow the same tag sequence.
tag_sequence = ('PRONOUN', 'VERB', 'NOUN')
train_data = [
    list(zip(sentence_words, tag_sequence))
    for sentence_words in (('I', 'love', 'dogs'), ('You', 'love', 'cats'))
]

# Fit the HMM tagger on the labelled sentences.
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)

# Tag a sentence made only of words that occur in train_data.
sentence = ['I', 'love', 'cats']
print(tagger.tag(sentence))


[('I', 'PRONOUN'), ('love', 'VERB'), ('cats', 'NOUN')]


**Brill POS Tagger**

In [None]:
import nltk
from nltk.tag import brill, brill_trainer, UnigramTagger

# Corpus and tagset mapping used for training; quiet=True matches the other
# cells and keeps the download log out of the cell output.
nltk.download('treebank', quiet=True)
nltk.download('universal_tagset', quiet=True)

# First 3000 tagged sentences of the treebank sample, mapped to the universal tagset.
data = nltk.corpus.treebank.tagged_sents(tagset='universal')[:3000]

# Without a backoff, the unigram baseline tags words it never saw in training
# as None (this happened for 'love' and 'NLP'). Back off to
# DefaultTagger('NOUN') so every token receives a tag.
uni = UnigramTagger(data, backoff=nltk.DefaultTagger('NOUN'))

# Learn Brill transformation rules on top of the baseline, using the fntbl37 templates.
tagger = brill_trainer.BrillTaggerTrainer(uni, brill.fntbl37()).train(data)

print(tagger.tag("I love learning NLP".split()))


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[('I', 'PRON'), ('love', None), ('learning', 'NOUN'), ('NLP', None)]
