In [None]:
# -*- coding: utf-8 -*-
"""Untitled9.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1NFhRV8TqKwY443qwR_FT8N4iMoHeb5rY
"""

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as english_stopwords

# --- NOTE ---
# This implementation uses spaCy, which is more robust in environments
# where NLTK data downloads fail. It requires the 'en_core_web_sm' model.
# If you run into an OSError, you might need to run:
# python -m spacy download en_core_web_sm
# ------------

# Load the small English model. The model is installed separately from the
# spaCy package; when it is missing, spacy.load() fails with OSError.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("ERROR: spaCy model 'en_core_web_sm' not found.")
    print("Please install it by running: python -m spacy download en_core_web_sm")
    # Abort with a non-zero status so a calling shell/CI job sees the failure.
    # The bare exit() helper is injected by the `site` module for interactive
    # sessions and, called without arguments, reports success (status 0).
    raise SystemExit(1)

# Q1. Input Text
input_text = "John enjoys playing football while Mary loves reading books in the library."

# Running the pipeline tokenizes the text; the loop below then reads each
# token's POS tag (token.pos_) and lemma (token.lemma_).
doc = nlp(input_text)

# Universal POS tags to keep. Proper nouns carry the separate tag "PROPN", so
# names such as "John" and "Mary" are filtered out; add "PROPN" to this set if
# proper nouns should be retained.
REQUIRED_POS = {"VERB", "NOUN"}


def _is_content_word(token):
    """Return True if *token* survives the stop-word and POS filters.

    A token is kept when it is neither punctuation nor whitespace, its
    lowercased text is not in spaCy's English stop-word list, and its
    Universal POS tag is one of REQUIRED_POS.
    """
    if token.is_punct or token.is_space:
        return False
    if token.text.lower() in english_stopwords:
        return False
    return token.pos_ in REQUIRED_POS


# Step 1: raw token texts, kept only so they can be displayed.
all_tokens = [token.text for token in doc]

# Steps 2-4: drop stop-words/punctuation, keep verbs and nouns, and store the
# lowercased lemma of every surviving token (document order is preserved).
final_tokens = [token.lemma_.lower() for token in doc if _is_content_word(token)]


print(f"Input Text: {input_text}\n")
print(f"1. Tokens (Raw): {all_tokens}")
print("2. & 4. Tokens filtered by stop-words/punctuation and POS (Nouns/Verbs):")
print("   (Intermediate step not strictly printed, but filtered results are shown below)")

print("\n--- Final Result ---")
# 'playing' becomes 'play', 'reading' becomes 'read', 'books' becomes 'book'.
print(f"Final Tokens (Lemmatized Nouns and Verbs Only): {final_tokens}")

# Lemmas are lowercased and PROPN tokens are excluded, so "John" and "Mary"
# never appear in the result. The exact contents depend on the tags the loaded
# model assigns; a recorded run with en_core_web_sm printed:
#   ['enjoy', 'play', 'football', 'read', 'book', 'library']
# ('love' is absent there, presumably because that model did not tag 'loves'
# as VERB or NOUN -- verify against the model version in use.)

import spacy

# --- NOTE ---
# You need to install spaCy and download a model if you haven't already:
# pip install spacy
# python -m spacy download en_core_web_sm
# ------------

# Load a pre-trained spaCy model for English. The model is installed
# separately from the spaCy package; when it is missing, spacy.load() fails
# with OSError.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Error: The spaCy model 'en_core_web_sm' is not found.")
    print("Please run: 'python -m spacy download en_core_web_sm'")
    # Abort with a non-zero status so a calling shell/CI job sees the failure.
    # The bare exit() helper is injected by the `site` module for interactive
    # sessions and, called without arguments, reports success (status 0).
    raise SystemExit(1)

# Q2. Input Text
input_text = "Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch."

# Run the pipeline; the recognized entity spans are read from doc.ents below
doc = nlp(input_text)

# 1. Perform Named Entity Recognition (NER)
print(f"Input Text: {input_text}\n")
print("--- Named Entity Recognition (NER) Results ---")

# An empty entity collection means the model recognized nothing
entities = doc.ents
entities_found = bool(entities)

if entities_found:
    for entity in entities:
        label = entity.label_
        # spacy.explain() turns the short label into a readable description
        description = spacy.explain(label)
        print(f"Entity: {entity.text}, Label: {label} (e.g., {description})")
else:
    print("No named entities were detected.")


# 2. Disambiguation Prompt Check
# Pronouns whose presence triggers the warning (matched case-insensitively)
pronouns_to_check = {"he", "she", "they"}

# any() stops scanning at the first token whose lowercased text is a target
# pronoun, so the rest of the document is not examined once one is found
ambiguity_detected = any(
    word.text.lower() in pronouns_to_check for word in doc
)

print("\n--- Pronoun Ambiguity Check ---")
if not ambiguity_detected:
    print('No target pronouns ("he", "she", "they") detected.')
else:
    # Print the required warning message
    print('Warning: Possible pronoun ambiguity detected!')

Input Text: John enjoys playing football while Mary loves reading books in the library.

1. Tokens (Raw): ['John', 'enjoys', 'playing', 'football', 'while', 'Mary', 'loves', 'reading', 'books', 'in', 'the', 'library', '.']
2. & 4. Tokens filtered by stop-words/punctuation and POS (Nouns/Verbs):
   (Intermediate step not strictly printed, but filtered results are shown below)

--- Final Result ---
Final Tokens (Lemmatized Nouns and Verbs Only): ['enjoy', 'play', 'football', 'read', 'book', 'library']
Input Text: Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch.

--- Named Entity Recognition (NER) Results ---
Entity: Chris, Label: PERSON (e.g., People, including fictional)
Entity: Alex, Label: PERSON (e.g., People, including fictional)
Entity: Apple, Label: ORG (e.g., Companies, agencies, institutions, etc.)
Entity: California, Label: GPE (e.g., Countries, cities, states)
Entity: iPhone, Label: ORG (e.g., Companies, agencies, institutions, etc.)