In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer
import nltk

In [None]:
# Load the small English spaCy pipeline, fetching it on first use.
MODEL_NAME = "en_core_web_sm"

try:
    nlp = spacy.load(MODEL_NAME)
except OSError:
    # The OSError is treated as "model package not installed": download it
    # once, then retry the load.
    print(f"Downloading '{MODEL_NAME}' model. Please wait...")
    spacy.cli.download(MODEL_NAME)
    nlp = spacy.load(MODEL_NAME)



In [None]:
# Create a single NLTK PorterStemmer instance; the processing loop further
# down calls stemmer.stem() on each token's text.
stemmer = PorterStemmer()

In [None]:
# --- INPUT TEXT ---
# The sentence every later cell processes; echoed in quotes, followed by a
# blank line.
text = "Jack and Jill went up the hill"
print("--- Original Text ---", f"'{text}'", "", sep="\n")

--- Original Text ---
'Jack and Jill went up the hill'



In [None]:
# Run the loaded spaCy pipeline on the input string. The resulting `doc` is
# iterated token by token in the processing loop below, which reads each
# token's text, lemma_ and is_stop / is_punct / is_space flags.
doc = nlp(text)

In [None]:
# Four independent, empty accumulators filled by the processing loop:
#   tokens       - raw token text
#   clean_tokens - lemmas of tokens that survive stop-word filtering
#   lemmas       - lemma of every token
#   stems        - Porter stems
tokens, clean_tokens, lemmas, stems = [], [], [], []

In [None]:
# --- 2. Tokenization, Lemmatization, Stop Word Removal, and Stemming ---
print("--- Step-by-Step Processing ---")

# Start from fresh lists so re-running this cell does not append a second
# copy of every entry to whatever a previous run left behind.
tokens, clean_tokens, lemmas, stems = [], [], [], []

for token in doc:
    # 1. Tokenization: each item yielded by the spaCy doc is one token.
    tokens.append(token.text)

    # 2. Stemming (NLTK PorterStemmer on the raw token text).
    #    Done for EVERY token, not only the non-stop-words, so that `stems`
    #    stays the same length as `tokens` and zip(tokens, stems) pairs each
    #    token with its own stem.
    stems.append(stemmer.stem(token.text))

    # 3. Lemmatization (spaCy's token.lemma_).
    lemma = token.lemma_
    lemmas.append(lemma)

    # 4. Stop-word removal: drop stop words, punctuation and whitespace
    #    tokens. The flags are read straight from spaCy; no explicit
    #    lowercasing is performed here.
    is_stop = token.is_stop or token.is_punct or token.is_space
    if not is_stop:
        # The "clean" list stores the lemmatized form of each kept token.
        clean_tokens.append(lemma)

--- Step-by-Step Processing ---


In [None]:
# --- 5. Display Results ---

## Tokenization
# Section heading (preceded by a blank line), the raw token list, then a
# closing rule -- emitted with one print call, one item per line.
print("\n## Tokenization", tokens, "------------------------------", sep="\n")




## Tokenization
['Jack', 'and', 'Jill', 'went', 'up', 'the', 'hill']
------------------------------


In [None]:
## Lemmatization (Token + Lemma)
print("\n## Lemmatization (Token & Lemma)")

# Size both columns to the longest entry (never narrower than the original
# width of 7) so tokens longer than seven characters keep the table aligned.
col_width = max([7] + [len(s) for s in tokens + lemmas])

print(f"{'Token':<{col_width}} | {'Lemma':<{col_width}}")
print(f"{'-' * col_width} | {'-' * col_width}")
for tok, lem in zip(tokens, lemmas):
    print(f"{tok:<{col_width}} | {lem:<{col_width}}")

# Closing rule printed once after the table, as in the tokenization and
# stop-word cells, instead of after every row.
print("------------------------------")




## Lemmatization (Token & Lemma)
Token   | Lemma  
------- | -------
Jack    | Jack   
------------------------------
and     | and    
------------------------------
Jill    | Jill   
------------------------------
went    | go     
------------------------------
up      | up     
------------------------------
the     | the    
------------------------------
hill    | hill   
------------------------------


In [None]:
## Stop Word Removal (Clean Tokens)
# Heading, the lemmas that survived stop-word/punctuation/whitespace
# filtering, and a closing rule -- one print call, one item per line.
print(
    "\n## Stop Word Removal & Final Clean List (Lemmatized)",
    clean_tokens,
    "------------------------------",
    sep="\n",
)




## Stop Word Removal & Final Clean List (Lemmatized)
['Jack', 'Jill', 'go', 'hill']
------------------------------


In [None]:
## Stemming (Token + Stem)
print("\n## Stemming (Token & Stem - using NLTK)")
# Note: Stemming is generally less accurate than lemmatization because it only
# strips suffixes, e.g. 'went' -> 'go' (lemma) vs 'went' (stem).

# Size both columns to the longest entry (never narrower than the original
# width of 7) so tokens longer than seven characters keep the table aligned.
col_width = max([7] + [len(s) for s in tokens + stems])

print(f"{'Token':<{col_width}} | {'Stem':<{col_width}}")
print(f"{'-' * col_width} | {'-' * col_width}")
# zip stops at the shorter list, so rows only line up when `stems` holds one
# entry per token.
for tok, stem in zip(tokens, stems):
    print(f"{tok:<{col_width}} | {stem:<{col_width}}")

# Closing rule printed once after the table, as in the tokenization and
# stop-word cells, instead of after every row.
print("------------------------------")


## Stemming (Token & Stem - using NLTK)
Token   | Stem   
------- | -------
Jack    | jack   
------------------------------
and     | jill   
------------------------------
Jill    | went   
------------------------------
went    | hill   
------------------------------
