In [5]:
# Install required NLP libraries in Google Colab
!pip install nltk spacy

# Download spaCy English model
!python -m spacy download en_core_web_sm

# Download NLTK resources
import nltk
import nltk
nltk.download('punkt_tab')

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# Required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

def get_wordnet_pos(tag):
    if tag.startswith("N"):
        return "n"
    if tag.startswith("V"):
        return "v"
    return None

def preprocess_text(text):

    print("===== Q1 Output Steps =====")

    # 1. SEGMENT INTO TOKENS
    tokens = word_tokenize(text)
    print("1. Segmented Tokens:")
    print(tokens, "\n")

    # 2. REMOVE STOPWORDS
    stop_words = set(stopwords.words("english"))
    filtered = [w for w in tokens if w.lower() not in stop_words]
    print("2. After Stopword Removal:")
    print(filtered, "\n")

    # 3. LEMMATIZATION (all tokens after stopwords)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(w.lower()) for w in filtered]
    print("3. Lemmatized Tokens (before POS filtering):")
    print(lemmas, "\n")

    # 4. KEEP ONLY VERBS & NOUNS (with proper POS-based lemmatization)
    pos_tags = pos_tag(filtered)
    final_output = []

    for word, tag in pos_tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag in ("n", "v"):
            lemma = lemmatizer.lemmatize(word.lower(), wn_tag)
            final_output.append(lemma)

    print("4. Final Output (Only Verbs & Nouns, Lemmatized):")
    print(final_output)

    return tokens, filtered, lemmas, final_output


# INPUT TEXT
text_q1 = "John enjoys playing football while Mary loves reading books in the library."

# RUN PIPELINE
tokens, filtered, lemmas, final_output = preprocess_text(text_q1)


===== Q1 Output Steps =====
1. Segmented Tokens:
['John', 'enjoys', 'playing', 'football', 'while', 'Mary', 'loves', 'reading', 'books', 'in', 'the', 'library', '.'] 

2. After Stopword Removal:
['John', 'enjoys', 'playing', 'football', 'Mary', 'loves', 'reading', 'books', 'library', '.'] 

3. Lemmatized Tokens (before POS filtering):
['john', 'enjoys', 'playing', 'football', 'mary', 'love', 'reading', 'book', 'library', '.'] 

4. Final Output (Only Verbs & Nouns, Lemmatized):
['john', 'enjoy', 'play', 'football', 'mary', 'love', 'read', 'book']


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
"""
Q2: Named Entity Recognition (NER) + Pronoun ambiguity warning

Tasks:
1. Perform Named Entity Recognition (NER)
2. If the text contains a pronoun ("he", "she", "they", "him", "her", "them"),
   print: "Warning: Possible pronoun ambiguity detected!"

Input text:
"Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch."
"""

import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def analyze_text(text: str):
    """
    Run NER and check for pronoun ambiguity in the given text.
    Prints:
    - all named entities
    - warning if ambiguous pronouns are present
    """
    doc = nlp(text)

    # 1. Named Entity Recognition (NER)
    print("===== Q2 Output =====")
    print("1. Named Entities:\n")
    for ent in doc.ents:
        print(f"  {ent.text:20s} -> {ent.label_}")

    # 2. Pronoun ambiguity detection
    pronouns = {"he", "she", "they", "him", "her", "them"}
    tokens_lower = [token.text.lower() for token in doc]

    print("\n2. Pronoun Ambiguity Check:")
    print("Tokens:", tokens_lower)

    if any(p in tokens_lower for p in pronouns):
        print("\nWarning: Possible pronoun ambiguity detected!")
    else:
        print("\nNo ambiguous pronouns detected.")

# Input text for Q2
text_q2 = "Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch."

# Run analysis
analyze_text(text_q2)


===== Q2 Output =====
1. Named Entities:

  Chris                -> PERSON
  Alex                 -> PERSON
  Apple                -> ORG
  California           -> GPE
  iPhone               -> ORG

2. Pronoun Ambiguity Check:
Tokens: ['chris', 'met', 'alex', 'at', 'apple', 'headquarters', 'in', 'california', '.', 'he', 'told', 'him', 'about', 'the', 'new', 'iphone', 'launch', '.']

