In [4]:
# Install required libraries (if not already installed)
# !pip install nltk spacy
# !python -m spacy download en_core_web_sm

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Sample Raw Text
raw_text = """
Nike, Inc. is an American multinational corporation that is engaged in the design, development, manufacturing,
and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services. The company is
headquartered near Beaverton, Oregon, in the Portland metropolitan area. It is one of the world's largest
suppliers of athletic shoes and apparel and a major manufacturer of sports equipment. Nike sponsors many high-profile
athletes and sports teams around the world, and its brand is recognized globally. The company was founded on
January 25, 1964, as Blue Ribbon Sports and officially became Nike, Inc. in 1971. Nike's mission statement is
“To bring inspiration and innovation to every athlete in the world.”
"""

# ---------------- NLTK Preprocessing ----------------

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download punkt_tab explicitly

# 1. Tokenization
sentences = sent_tokenize(raw_text)
words = word_tokenize(raw_text)

# 2. Convert to lowercase
words = [word.lower() for word in words]

# 3. Remove punctuation
words = [word for word in words if word.isalnum()]

# 4. Remove stopwords
stop_words = set(stopwords.words('english'))
words = [word for word in words if word not in stop_words]

# 5. Stemming
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in words]

# 6. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("NLTK Tokenized Words:", words)
print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)

# ---------------- spaCy Preprocessing ----------------

doc = nlp(raw_text)

# Tokenization, Lowercasing, Removing stopwords and punctuation
spacy_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]

# Named Entity Recognition (Optional)
entities = [(ent.text, ent.label_) for ent in doc.ents]

print("spaCy Processed Tokens:", spacy_tokens)
print("Named Entities:", entities)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK Tokenized Words: ['nike', 'american', 'multinational', 'corporation', 'engaged', 'design', 'development', 'manufacturing', 'worldwide', 'marketing', 'sales', 'footwear', 'apparel', 'equipment', 'accessories', 'services', 'company', 'headquartered', 'near', 'beaverton', 'oregon', 'portland', 'metropolitan', 'area', 'one', 'world', 'largest', 'suppliers', 'athletic', 'shoes', 'apparel', 'major', 'manufacturer', 'sports', 'equipment', 'nike', 'sponsors', 'many', 'athletes', 'sports', 'teams', 'around', 'world', 'brand', 'recognized', 'globally', 'company', 'founded', 'january', '25', '1964', 'blue', 'ribbon', 'sports', 'officially', 'became', 'nike', '1971', 'nike', 'mission', 'statement', 'bring', 'inspiration', 'innovation', 'every', 'athlete', 'world']
Stemmed Words: ['nike', 'american', 'multin', 'corpor', 'engag', 'design', 'develop', 'manufactur', 'worldwid', 'market', 'sale', 'footwear', 'apparel', 'equip', 'accessori', 'servic', 'compani', 'headquart', 'near', 'beaverton', 'o