<a href="https://colab.research.google.com/github/Nithyaasri/NLP/blob/main/NLP_RECORD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **TOKENIZATION**

In [2]:
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
text = "Hello, how are you?"
tokens = encoding.encode(text)
decoded_text = encoding.decode(tokens)
print("Original Text:", text)
print("Tokens:", tokens)
print("Decoded Text:", decoded_text)
print("Number of Tokens:", len(tokens))


Original Text: Hello, how are you?
Tokens: [9906, 11, 1268, 527, 499, 30]
Decoded Text: Hello, how are you?
Number of Tokens: 6


# **NORMALIZATION**

In [3]:
import re
text = "Cats are running faster than the cat. Dogs were barking loudly."
normalized = [re.sub(r'[^a-zA-Z]', '', word.lower()) for word in text.split()]
print("Normalization:", normalized)


Normalization: ['cats', 'are', 'running', 'faster', 'than', 'the', 'cat', 'dogs', 'were', 'barking', 'loudly']


# **STEMMING**

In [4]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

words = ['cats', 'running', 'faster', 'barking']
stems = [stemmer.stem(w) for w in words]
print("Stemming:", stems)


Stemming: ['cat', 'run', 'faster', 'bark']


# **LEMMATIZATION**

In [5]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
words = ['cats', 'running', 'better', 'barking']
lemmas = [lemmatizer.lemmatize(w) for w in words]
print("Lemmatization:", lemmas)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Lemmatization: ['cat', 'running', 'better', 'barking']


# **MORPHOLOGY**

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ['running', 'dogs', 'better', 'easily']

print("Word     | Stem     | Lemma")
for w in words:
    print(f"{w:9} | {stemmer.stem(w):9} | {lemmatizer.lemmatize(w):9}")


Word     | Stem     | Lemma
running   | run       | running  
dogs      | dog       | dog      
better    | better    | better   
easily    | easili    | easily   


# **N-GRAMS (Unigram, Bigram, Trigram)**

In [9]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")

from nltk.tokenize import word_tokenize
from nltk import ngrams

text = "Dogs bark loudly at night"
tokens = word_tokenize(text.lower())

unigrams = list(ngrams(tokens, 1))
bigrams  = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))
print(unigrams)
print(bigrams)
print(trigrams)


[('dogs',), ('bark',), ('loudly',), ('at',), ('night',)]
[('dogs', 'bark'), ('bark', 'loudly'), ('loudly', 'at'), ('at', 'night')]
[('dogs', 'bark', 'loudly'), ('bark', 'loudly', 'at'), ('loudly', 'at', 'night')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# **N-GRAM SMOOTHING (Laplace / Add-One Smoothing)**

In [11]:
def ngram_smooth(text, n=2):
    words = text.split()
    N = len(words) - n + 1
    ngrams = [tuple(words[i:i+n]) for i in range(N)]
    V = len(set(ngrams))
    for ng in sorted(set(ngrams)):
        count = ngrams.count(ng)
        print(ng, "->", round((count+1)/(N+V), 4))

text = "I love NLP and I love Python"
ngram_smooth(text, 2)


('I', 'love') -> 0.2727
('NLP', 'and') -> 0.1818
('and', 'I') -> 0.1818
('love', 'NLP') -> 0.1818
('love', 'Python') -> 0.1818


# **PART OF SPEECH TAGGING (POS)**

In [12]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Input sentence
text = "The quick brown fox jumps over the lazy dog."

# Process the text
doc = nlp(text)

# Print word and its POS tag
for token in doc:
    print(token.text, "→", token.pos_)


The → DET
quick → ADJ
brown → ADJ
fox → NOUN
jumps → VERB
over → ADP
the → DET
lazy → ADJ
dog → NOUN
. → PUNCT


# **HIDDEN MARKEV MODEL**

In [13]:
import warnings
from nltk.tag import hmm

warnings.filterwarnings("ignore")

train = [[('The','DT'),('dog','NN'),('barks','VBZ')],
         [('A','DT'),('cat','NN'),('meows','VBZ')]]

tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train)
print(tagger.tag(['A','dog','meows']))


[('A', 'DT'), ('dog', 'NN'), ('meows', 'VBZ')]


# **BENDING POST TAGGING**

In [14]:

import nltk
from nltk.tag import brill, brill_trainer

# Training data
data = [[('The','DT'),('dog','NN'),('barks','VBZ')],
        [('A','DT'),('cat','NN'),('meows','VBZ')]]

# Base tagger
base = nltk.UnigramTagger(data)

# Train Brill tagger
tagger = brill_trainer.BrillTaggerTrainer(base, brill.fntbl37()).train(data)

# Test
print(tagger.tag(['The','cat','barks']))

[('The', 'DT'), ('cat', 'NN'), ('barks', 'VBZ')]


# **SPELLING CORRECTION**

In [15]:
from textblob import TextBlob

# Sample text with spelling errors
text = "I havv goood speling."

# Create a TextBlob object
blob = TextBlob(text)

# Correct the spelling
corrected_text = blob.correct()

# Output the corrected text
print("Original:", text)
print("Corrected:", corrected_text)

Original: I havv goood speling.
Corrected: I have good spelling.


# **DEDUCTION**

In [16]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

ps = PorterStemmer()

def stem(text):
    return [ps.stem(w) for w in word_tokenize(text.lower())]

def deduce(p, h):
    return "entailment" if h in p else "no entailment"

print("Stems:", stem("running runner runs easily fairer"))
print("Deduction:", deduce("All men are mortal Socrates is a man", "Socrates is mortal"))


Stems: ['run', 'runner', 'run', 'easili', 'fairer']
Deduction: no entailment
