<a href="https://colab.research.google.com/github/Rakshithbodakuntla/segmentation_tokenization/blob/main/Segmentation_and_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:


!pip install nltk -q
import nltk

# Fetch every NLTK data package the pipeline below relies on.
# nltk.download() reports already-installed packages as up to date, so
# re-running this cell is harmless.
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' tables
# rather than the pickled 'punkt' package; without this download
# word_tokenize() raises LookupError on a fresh runtime.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
# Same situation for the POS tagger: newer releases look it up under the
# language-suffixed name.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd

# Demo inputs: each sentence uses one word ("duck" / "book") twice.
sentences = [
    "The duck will duck under the table.",
    "I will book a room to read a book.",
]

# Shared helpers, built once and reused by process_sentence().
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = {*stopwords.words('english')}

def penn_to_wn(tag):
    """Translate a POS tag into the WordNet POS constant used for lemmatizing.

    The tag is matched by prefix: ``J`` -> ``wn.ADJ``, ``V`` -> ``wn.VERB``,
    ``N`` -> ``wn.NOUN``, ``R`` -> ``wn.ADV``. Any tag that matches none of
    these prefixes falls back to ``wn.NOUN``.
    """
    prefix_table = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unknown tag families are treated as nouns.
    return wn.NOUN

def process_sentence(sentence):
    """Build a per-token comparison table of stems and lemmas for ``sentence``.

    The sentence is tokenized and POS-tagged; tokens whose lowercase form is
    in ``stop_words`` are skipped. Nothing else is filtered, so punctuation
    tokens that are not stop words get a row too.

    Returns:
        A ``pandas.DataFrame`` with columns ``token`` (original casing),
        ``stem`` and ``lemma`` (both computed from the lowercased token),
        ``POS`` (the tagger's tag) and ``comment``.
    """
    rows = []
    for token, tag in pos_tag(word_tokenize(sentence)):
        lowered = token.lower()
        if lowered in stop_words:
            continue

        # The lemmatizer needs a WordNet POS; the stemmer is POS-agnostic.
        lemma = lemmatizer.lemmatize(lowered, pos=penn_to_wn(tag))
        stem = stemmer.stem(lowered)

        notes = []
        if lowered in ("duck", "book"):
            notes.append("Ambiguous (NOUN/VERB) – context disambiguates.")
        if stem != lemma:
            notes.append("Lemma keeps valid form.")

        rows.append([token, stem, lemma, tag, " ".join(notes)])

    return pd.DataFrame(
        rows, columns=["token", "stem", "lemma", "POS", "comment"]
    )

# Print a header and render the comparison table for each demo sentence.
# NOTE(review): `display` is not imported in this cell; presumably it is the
# helper that IPython/Jupyter injects, so this loop is expected to run inside
# a notebook - confirm before running as a plain script.
for s in sentences:
    print(f"\n=== Sentence: {s} ===")
    display(process_sentence(s))

# Closing takeaways printed after the tables.
print("""
Summary:
• PorterStemmer is faster but produces non-words (e.g., 'table' → 'tabl').
• Lemmatizer uses POS + WordNet, keeping valid forms.
• Lemmatization improves POS/NER features at slight speed cost.
""")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



=== Sentence: The duck will duck under the table. ===


Unnamed: 0,token,stem,lemma,POS,comment
0,duck,duck,duck,NN,Ambiguous (NOUN/VERB) – context disambiguates.
1,duck,duck,duck,VB,Ambiguous (NOUN/VERB) – context disambiguates.
2,table,tabl,table,NN,Lemma keeps valid form.
3,.,.,.,.,



=== Sentence: I will book a room to read a book. ===


Unnamed: 0,token,stem,lemma,POS,comment
0,book,book,book,NN,Ambiguous (NOUN/VERB) – context disambiguates.
1,room,room,room,NN,
2,read,read,read,VB,
3,book,book,book,NN,Ambiguous (NOUN/VERB) – context disambiguates.
4,.,.,.,.,



Summary:
• PorterStemmer is faster but produces non-words (e.g., 'table' → 'tabl').
• Lemmatizer uses POS + WordNet, keeping valid forms.
• Lemmatization improves POS/NER features at slight speed cost.

