# NLP Interview Prep: NLTK vs spaCy
Side-by-side comparison notebook

## Setup

In [1]:

# Install required libraries (run once, then leave commented out).
# Prefer %pip over !pip so the packages land in the running kernel's environment.
# %pip install nltk spacy
# !python -m spacy download en_core_web_sm

In [2]:
import nltk

# NLTK data packages this notebook fetches; each name is passed verbatim to
# nltk.download() in the loop below.
resources = [
    'punkt', 'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker', 'words',
]

# One download call per package, in list order.
for r in resources:
    nltk.download(r)

[nltk_data] Downloading package punkt to C:\Users\Rohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Rohan/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Rohan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Rohan/nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Rohan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Rohan/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Sample Text

In [3]:

# Sample sentence reused by the NLTK and spaCy cells that follow.
text = (
    "Natural Language Processing is a subfield of Artificial Intelligence "
    "that helps machines understand human language"
)
print(text)


Natural Language Processing is a subfield of Artificial Intelligence that helps machines understand human language


## Tokenization

In [4]:
# NLTK tokenization: word_tokenize turns the sample sentence into a token list.
from nltk.tokenize import word_tokenize

tokens_nltk = word_tokenize(text)  # reused by the NLTK stopword and POS cells
print("NLTK Tokens:", tokens_nltk)


NLTK Tokens: ['Natural', 'Language', 'Processing', 'is', 'a', 'subfield', 'of', 'Artificial', 'Intelligence', 'that', 'helps', 'machines', 'understand', 'human', 'language']


In [None]:
# spaCy tokenization: run the sample sentence through the en_core_web_sm
# pipeline and collect the text of every token.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)  # parsed document, reused by the later spaCy cells
tokens_spacy = [tok.text for tok in doc]
print("spaCy Tokens:", tokens_spacy)


Document:  Natural Language Processing is a subfield of Artificial Intelligence that helps machines understand human language
spaCy Tokens: ['Natural', 'Language', 'Processing', 'is', 'a', 'subfield', 'of', 'Artificial', 'Intelligence', 'that', 'helps', 'machines', 'understand', 'human', 'language']


## Stopwords Removal

In [6]:
# NLTK stopword removal: drop every token whose lowercase form appears in
# NLTK's English stopword list (original casing of kept tokens is preserved).
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_nltk = [
    word
    for word in tokens_nltk
    if word.lower() not in stop_words
]
print("NLTK Filtered:", filtered_nltk)


NLTK Filtered: ['Natural', 'Language', 'Processing', 'subfield', 'Artificial', 'Intelligence', 'helps', 'machines', 'understand', 'human', 'language']


In [7]:
# spaCy stopword removal: keep the text of each token whose is_stop flag is False.
filtered_spacy = [tok.text for tok in doc if not tok.is_stop]
print("spaCy Filtered:", filtered_spacy)


spaCy Filtered: ['Natural', 'Language', 'Processing', 'subfield', 'Artificial', 'Intelligence', 'helps', 'machines', 'understand', 'human', 'language']


## Stemming (NLTK only)

In [8]:

# Porter stemming of the stopword-filtered NLTK tokens.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_nltk))
print("Stemmed Words:", stemmed)


Stemmed Words: ['natur', 'languag', 'process', 'subfield', 'artifici', 'intellig', 'help', 'machin', 'understand', 'human', 'languag']


## Lemmatization

In [9]:

# NLTK lemmatization of the stopword-filtered tokens.
# NOTE(review): lemmatize() is called with the word only (no POS argument) —
# confirm the lemmatizer's default part of speech is what this demo intends.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_nltk = list(map(lemmatizer.lemmatize, filtered_nltk))
print("NLTK Lemmas:", lemmatized_nltk)


NLTK Lemmas: ['Natural', 'Language', 'Processing', 'subfield', 'Artificial', 'Intelligence', 'help', 'machine', 'understand', 'human', 'language']


In [10]:

# spaCy lemmatization: take lemma_ from every token that is not a stopword.
lemmatized_spacy = [tok.lemma_ for tok in doc if not tok.is_stop]
print("spaCy Lemmas:", lemmatized_spacy)


spaCy Lemmas: ['Natural', 'Language', 'processing', 'subfield', 'Artificial', 'Intelligence', 'help', 'machine', 'understand', 'human', 'language']


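**POS tag cheat sheet** — approximate mapping from NLTK's Penn Treebank tags (`pos_tag`) to spaCy's coarse-grained tags (`token.pos_`):

| **NLTK Tag** | **spaCy Tag** | **English Meaning**                | **Example**   |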
| ------------ | ------------- | ---------------------------------- | ------------- |
| NN / NNS     | NOUN          | Common noun (singular / plural)    | dog, dogs     |
| NNP / NNPS   | PROPN         | Proper noun (name of person/place) | India, Rohan  |
| VB*          | VERB / AUX    | Verb forms (auxiliaries → AUX)     | run, eat, is  |
| MD           | AUX           | Auxiliary / modal verb             | can, should   |
| JJ*          | ADJ           | Adjective (describes noun)         | beautiful     |
| RB*          | ADV           | Adverb (describes verb/adj)        | quickly       |
| PRP          | PRON          | Pronoun                            | he, she, they |
| DT           | DET           | Determiner                         | the, a, this  |
| IN           | ADP           | Preposition / postposition         | in, on, at    |
| CC           | CCONJ         | Coordinating conjunction           | and, but      |
| CD           | NUM           | Number                             | one, 10       |
| UH           | INTJ          | Interjection                       | wow, oh       |


## POS Tagging

In [11]:
# NLTK POS tagging over the full (unfiltered) token list.
from nltk import pos_tag

pos_nltk = pos_tag(tokens_nltk)
print("NLTK POS:", pos_nltk)


NLTK POS: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('subfield', 'NN'), ('of', 'IN'), ('Artificial', 'NNP'), ('Intelligence', 'NNP'), ('that', 'WDT'), ('helps', 'VBZ'), ('machines', 'NNS'), ('understand', 'JJ'), ('human', 'JJ'), ('language', 'NN')]


In [12]:

# spaCy POS tagging: pair each token's text with its pos_ label.
pos_spacy = [(tok.text, tok.pos_) for tok in doc]
print("spaCy POS:", pos_spacy)


spaCy POS: [('Natural', 'PROPN'), ('Language', 'PROPN'), ('Processing', 'NOUN'), ('is', 'AUX'), ('a', 'DET'), ('subfield', 'NOUN'), ('of', 'ADP'), ('Artificial', 'PROPN'), ('Intelligence', 'PROPN'), ('that', 'PRON'), ('helps', 'VERB'), ('machines', 'NOUN'), ('understand', 'VERB'), ('human', 'ADJ'), ('language', 'NOUN')]


## Named Entity Recognition

In [13]:

# spaCy NER: iterate over the entities attached to the parsed doc and print
# one per line as "<entity text> <label>".
for ent in doc.ents:
    print(ent.text, ent.label_)


Natural Language Processing ORG
Artificial Intelligence ORG
