In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Download required NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng') # Added to fix the LookupError for eng version

text = "John enjoys playing football while Mary loves reading books in the library."

# 1. Tokenization
tokens = word_tokenize(text)

# 2. Remove stopwords
stop_words = set(stopwords.words("english"))
filtered_tokens = [t for t in tokens if t.lower() not in stop_words]

# 3. POS tagging (before lemmatization, to use correct POS)
tagged = pos_tag(filtered_tokens)

# Helper: convert POS tag to WordNet POS
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'   # adjective
    elif treebank_tag.startswith('V'):
        return 'v'   # verb
    elif treebank_tag.startswith('N'):
        return 'n'   # noun
    elif treebank_tag.startswith('R'):
        return 'r'   # adverb
    else:
        return None

lemmatizer = WordNetLemmatizer()

# 4. Lemmatize + keep only verbs and nouns
final_tokens = []
for word, tag in tagged:
    wn_pos = get_wordnet_pos(tag)
    if wn_pos in ['n', 'v']:   # keep only nouns & verbs
        lemma = lemmatizer.lemmatize(word, wn_pos)
        final_tokens.append(lemma)

print("Final tokens:", final_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Final tokens: ['John', 'enjoy', 'play', 'football', 'Mary', 'love', 'read', 'book']


In [5]:
import spacy

# Load spaCy English model with NER capability
nlp = spacy.load("en_core_web_sm")

text = "Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch."

# Process text
doc = nlp(text)

print("=== Named Entities ===")
for ent in doc.ents:
    print(f"{ent.text}  -->  {ent.label_}")

# Pronoun ambiguity detection
pronouns = {"he", "she", "they", "him", "her", "them"}

tokens_lower = [token.text.lower() for token in doc]

if any(p in tokens_lower for p in pronouns):
    print("\nWarning: Possible pronoun ambiguity detected!")


=== Named Entities ===
Chris  -->  PERSON
Alex  -->  PERSON
Apple  -->  ORG
California  -->  GPE
iPhone  -->  ORG

