<a href="https://colab.research.google.com/github/RamcharanChandragiri/NATURAL-LANGUAGE-PROCESSING/blob/main/NLP_ASSIGNMENT_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import spacy
import pandas as pd

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer


In [None]:
# Download the NLTK tokenizer resources used by sent_tokenize / word_tokenize.
# 'punkt' is the classic Punkt model; newer NLTK releases look the tokenizer
# tables up under 'punkt_tab' instead, so fetch both here to keep every
# tokenization cell working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Download and install spaCy's small English pipeline (en_core_web_sm) by
# running spaCy's download command in a shell. This cell only installs the
# package; the pipeline object is created separately with spacy.load(...).
# spaCy's own output may ask for a kernel/runtime restart after installation.
!python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Instantiate the English pipeline once; the lemmatization cells call `nlp(...)`.
nlp = spacy.load(name="en_core_web_sm")


In [None]:
# Short verse excerpts keyed by author name. Each value is a single string;
# adjacent literals are concatenated by Python, one clause per fragment.
texts = {
    "Emily Dickinson": (
        "Because I could not stop for Death – "
        "He kindly stopped for me – "
        "The Carriage held but just Ourselves – "
        "And Immortality."
    ),
    "William Shakespeare": (
        "To be, or not to be, "
        "that is the question: "
        "Whether 'tis nobler in the mind to suffer."
    ),
    "Robert Frost": (
        "Two roads diverged in a yellow wood, "
        "And sorry I could not travel both."
    ),
}


In [None]:
# Show every passage under a heading line carrying its author's name.
for author in texts:
    text = texts[author]
    print(f"\n{author}:\n{text}")



Emily Dickinson:
Because I could not stop for Death – He kindly stopped for me – The Carriage held but just Ourselves – And Immortality.

William Shakespeare:
To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer.

Robert Frost:
Two roads diverged in a yellow wood, And sorry I could not travel both.


In [None]:
# Fetch the Punkt tables that sent_tokenize looks up on current NLTK releases.
# `nltk` is already imported in the notebook's import cell, so it is not
# re-imported here.
nltk.download('punkt_tab')

# Split each passage into sentences and list them one per line.
for author, text in texts.items():
    sentences = sent_tokenize(text)
    print(f"\n{author} - Sentences:")
    for s in sentences:
        print("-", s)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Emily Dickinson - Sentences:
- Because I could not stop for Death – He kindly stopped for me – The Carriage held but just Ourselves – And Immortality.

William Shakespeare - Sentences:
- To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer.

Robert Frost - Sentences:
- Two roads diverged in a yellow wood, And sorry I could not travel both.


In [None]:
# Break each passage into NLTK word/punctuation tokens and show the token list
# on the line after its heading.
for author in texts:
    text = texts[author]
    words = word_tokenize(text)
    print(f"\n{author} - Words:", words, sep="\n")



Emily Dickinson - Words:
['Because', 'I', 'could', 'not', 'stop', 'for', 'Death', '–', 'He', 'kindly', 'stopped', 'for', 'me', '–', 'The', 'Carriage', 'held', 'but', 'just', 'Ourselves', '–', 'And', 'Immortality', '.']

William Shakespeare - Words:
['To', 'be', ',', 'or', 'not', 'to', 'be', ',', 'that', 'is', 'the', 'question', ':', 'Whether', "'t", 'is', 'nobler', 'in', 'the', 'mind', 'to', 'suffer', '.']

Robert Frost - Words:
['Two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood', ',', 'And', 'sorry', 'I', 'could', 'not', 'travel', 'both', '.']


In [None]:
# Porter stemmer instance, shared with the comparison cell further down.
stemmer = PorterStemmer()

for author in texts:
    text = texts[author]
    words = word_tokenize(text)
    # Keep purely alphabetic tokens (this drops punctuation), then stem each one.
    alphabetic = filter(str.isalpha, words)
    stems = list(map(stemmer.stem, alphabetic))
    print(f"\n{author} - Stemmed Words:", stems, sep="\n")



Emily Dickinson - Stemmed Words:
['becaus', 'i', 'could', 'not', 'stop', 'for', 'death', 'he', 'kindli', 'stop', 'for', 'me', 'the', 'carriag', 'held', 'but', 'just', 'ourselv', 'and', 'immort']

William Shakespeare - Stemmed Words:
['to', 'be', 'or', 'not', 'to', 'be', 'that', 'is', 'the', 'question', 'whether', 'is', 'nobler', 'in', 'the', 'mind', 'to', 'suffer']

Robert Frost - Stemmed Words:
['two', 'road', 'diverg', 'in', 'a', 'yellow', 'wood', 'and', 'sorri', 'i', 'could', 'not', 'travel', 'both']


In [None]:
# Run each passage through the spaCy pipeline and collect the lemma of every
# alphabetic token, in document order.
for author in texts:
    text = texts[author]
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if not token.is_alpha:
            continue  # skip punctuation and other non-alphabetic tokens
        lemmas.append(token.lemma_)
    print(f"\n{author} - Lemmatized Words:", lemmas, sep="\n")



Emily Dickinson - Lemmatized Words:
['because', 'I', 'could', 'not', 'stop', 'for', 'death', 'he', 'kindly', 'stop', 'for', 'I', 'the', 'Carriage', 'hold', 'but', 'just', 'ourselves', 'and', 'immortality']

William Shakespeare - Lemmatized Words:
['to', 'be', 'or', 'not', 'to', 'be', 'that', 'be', 'the', 'question', 'whether', 'tis', 'nobler', 'in', 'the', 'mind', 'to', 'suffer']

Robert Frost - Lemmatized Words:
['two', 'road', 'diverge', 'in', 'a', 'yellow', 'wood', 'and', 'sorry', 'I', 'could', 'not', 'travel', 'both']


In [None]:
# Side-by-side comparison of stemming vs. lemmatization for a single passage.
sample_text = texts["Emily Dickinson"]

# NLTK tokenization kept as a notebook-level name because the original cell
# defined it. TODO confirm whether any later cell still reads `words`.
words = word_tokenize(sample_text)

# Build every column from ONE token stream. The original mixed NLTK tokens
# (Original/Stemmed) with spaCy tokens (Lemmatized); the two tokenizers can
# split text differently, which either makes pd.DataFrame raise ValueError
# (unequal column lengths) or silently pairs a word with another word's lemma.
alpha_tokens = [token for token in nlp(sample_text) if token.is_alpha]

comparison = pd.DataFrame({
    "Original": [token.text for token in alpha_tokens],
    "Stemmed": [stemmer.stem(token.text) for token in alpha_tokens],
    "Lemmatized": [token.lemma_ for token in alpha_tokens],
})

comparison


Unnamed: 0,Original,Stemmed,Lemmatized
0,Because,becaus,because
1,I,i,I
2,could,could,could
3,not,not,not
4,stop,stop,stop
5,for,for,for
6,Death,death,death
7,He,he,he
8,kindly,kindli,kindly
9,stopped,stop,stop
