In [None]:

!pip install nltk spacy

import nltk
# Download required resources
nltk.download('punkt')
nltk.download('punkt_tab')   # Fix for tokenization error
nltk.download('stopwords')
nltk.download('wordnet')

import spacy
spacy.cli.download("en_core_web_sm")

# Sample documents that the rest of the script preprocesses.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial Intelligence and Natural Language Processing are changing the world!",
    "NLTK and SpaCy are popular libraries for text preprocessing in Python."
]

# Echo the untouched input, one bullet per document.
print("Original Corpus:")
for sentence in corpus:
    print(f"- {sentence}")

from nltk.tokenize import word_tokenize

# Lower-case every document first, then split it into tokens with NLTK.
tokenized_corpus = list(map(word_tokenize, map(str.lower, corpus)))

print("\nTokenized Corpus:")
for token_list in tokenized_corpus:
    print(token_list)


from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))


def _is_content_word(word):
    """Return True for an alphabetic token that is not an English stop word."""
    return word.isalpha() and word not in stop_words


# Keep only the content words of each tokenized document.
filtered_corpus = [list(filter(_is_content_word, token_list))
                   for token_list in tokenized_corpus]

print("\nAfter Stop Word Removal:")
for kept_words in filtered_corpus:
    print(kept_words)

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Replace each remaining word with its Porter stem, document by document.
stemmed_corpus = [list(map(stemmer.stem, kept_words)) for kept_words in filtered_corpus]

print("\nAfter Stemming:")
for stems in stemmed_corpus:
    print(stems)

# Load the spaCy English model used for lemmatization.
nlp = spacy.load("en_core_web_sm")

lemmatized_corpus = []
for doc in corpus:
    # The text is lower-cased before parsing, as in the tokenization step.
    spacy_doc = nlp(doc.lower())
    lemmas = []
    for token in spacy_doc:
        # Drop non-alphabetic tokens and anything in the stop-word set.
        if not token.is_alpha or token.text in stop_words:
            continue
        lemmas.append(token.lemma_)
    lemmatized_corpus.append(lemmas)

print("\nAfter Lemmatization:")
for tokens in lemmatized_corpus:
    print(tokens)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Original Corpus:
- The quick brown fox jumps over the lazy dog.
- Artificial Intelligence and Natural Language Processing are changing the world!
- NLTK and SpaCy are popular libraries for text preprocessing in Python.

Tokenized Corpus:
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
['artificial', 'intelligence', 'and', 'natural', 'language', 'processing', 'are', 'changing', 'the', 'world', '!']
['nltk', 'and', 'spacy', 'are', 'popular', 'libraries', 'for', 'text', 'preprocessing', 'in', 'python', '.']

After Stop Word Removal:
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
['artificial', 'intelli