In [21]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

In [2]:
# Download every NLTK resource this notebook relies on (each call is a no-op
# when the resource is already up to date):
#   - "stopwords"          : word lists read by stopwords.words("english")
#   - "punkt", "punkt_tab" : tokenizer data needed by nltk.word_tokenize;
#                            recent NLTK releases look up "punkt_tab",
#                            older ones look up "punkt", so fetch both.

for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# Prerequisite (run once in a terminal, not in this cell):
#     python -m spacy download en_core_web_sm

# Load the spaCy pipeline package "en_core_web_sm" and bind it to `nlp`.

nlp = spacy.load("en_core_web_sm")

In [23]:
# Example sentence; it is the input passed to nltk.word_tokenize in the
# tokenization cell.

text = "Apple is looking at buying a startup in New York to expand its AI business."

In [24]:
# ---------------- 1. Tokenization ----------------

# Split the example sentence into word and punctuation tokens with NLTK,
# then show the heading and the token list on separate lines.
tokens = nltk.word_tokenize(text)
print("1. Tokens:", tokens, sep="\n")


1. Tokens:
['Apple', 'is', 'looking', 'at', 'buying', 'a', 'startup', 'in', 'New', 'York', 'to', 'expand', 'its', 'AI', 'business', '.']


In [25]:
# ---------------- 2. Stopword Removal ----------------

# Hold the English stopwords in a set so each membership check is a hash lookup.
stop_words = set(stopwords.words("english"))

# Keep a token only when its lowercase form is not a stopword and it consists
# solely of alphabetic characters (this drops punctuation such as ".").
filtered_tokens = [
    word
    for word in tokens
    if not (word.lower() in stop_words or not word.isalpha())
]
print(f"\n2. After Stopword Removal: {filtered_tokens}")


2. After Stopword Removal: ['Apple', 'looking', 'buying', 'startup', 'New', 'York', 'expand', 'AI', 'business']


In [26]:
# PROPN → Proper Noun
## Example: Apple, New York, AI
## (Names of people, organizations, places, etc.)

# AUX → Auxiliary Verb (helping verb)
## Example: is, was, have, will
## (Used with main verbs to form tenses, moods, voices.)

# VERB → Verb
## Example: looking, buying, expand
## (Action or state words.)

# ADP → Adposition (prepositions and postpositions)
## Example: at, in, on, under
## (Show relationship in time/place/direction.)

# NOUN → Common Noun
## Example: startup, business
## (General things, not proper names.)


# ---------------- 3. POS Tagging ----------------

# Run the spaCy pipeline on the example sentence. `doc` must be created here:
# no other cell assigns it, and both this loop and the NER cell iterate over it.
doc = nlp(text)

print("\n3. POS Tagging:")
for token in doc:
    # Left-align the token text in a 15-character column, then show its
    # coarse-grained part-of-speech tag.
    print(f"{token.text:15} -> {token.pos_}")


3. POS Tagging:
Apple           -> PROPN
is              -> AUX
looking         -> VERB
at              -> ADP
buying          -> VERB
a               -> DET
startup         -> NOUN
in              -> ADP
New             -> PROPN
York            -> PROPN
to              -> PART
expand          -> VERB
its             -> PRON
AI              -> PROPN
business        -> NOUN
.               -> PUNCT


In [27]:
# ---------------- 4. Named Entity Recognition (NER) ----------------

# List every entity span found in `doc`: the span text padded to a
# 20-character column, followed by its entity label.
print("\n4. Named Entities:")
for ent in doc.ents:
    print("{:20} -> {}".format(ent.text, ent.label_))


4. Named Entities:
Apple                -> ORG
New York             -> GPE
AI                   -> GPE


In [28]:
# ---------------- 5. N-grams Generation ----------------

# Build bigrams (window of 2) and trigrams (window of 3) over the
# stopword-filtered tokens, materialising each as a list of tuples.

print("\n5. N-grams:\n")
bigrams, trigrams = (list(ngrams(filtered_tokens, size)) for size in (2, 3))

print(f"Bigrams:\n {bigrams}")
print(f"\n\nTrigrams:\n {trigrams}")


5. N-grams:

Bigrams:
 [('Apple', 'looking'), ('looking', 'buying'), ('buying', 'startup'), ('startup', 'New'), ('New', 'York'), ('York', 'expand'), ('expand', 'AI'), ('AI', 'business')]


Trigrams:
 [('Apple', 'looking', 'buying'), ('looking', 'buying', 'startup'), ('buying', 'startup', 'New'), ('startup', 'New', 'York'), ('New', 'York', 'expand'), ('York', 'expand', 'AI'), ('expand', 'AI', 'business')]
