In [4]:
import nltk, spacy
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ngrams

In [5]:
# Load spaCy model
# `nlp` holds the loaded "en_core_web_sm" pipeline. The Named Entity
# Recognition cell calls it on `text` and reads entities from the result's `.ents`.
# NOTE(review): assumes the model package is already installed in this environment — confirm.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Sample sentence shared by the later cells: it is tokenized with NLTK's
# word_tokenize and passed to the spaCy pipeline for entity extraction.
text = "Apple is looking at buying U.K. startup for $1 billion in 2025."

In [7]:
# Tokenization
# Split the sample sentence into tokens with NLTK's word_tokenize.
# In the recorded output, "$" and the final "." come out as separate tokens,
# while "U.K." stays a single token. `tokens` is reused by the stopword-removal cell.
# NOTE(review): presumably requires the NLTK tokenizer data to be downloaded beforehand — confirm.
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', 'in', '2025', '.']


In [8]:
# Stopword Removal
# Build the English stopword set once. stopwords.words() returns a list, so
# calling it inside the comprehension rebuilt that list and scanned it
# linearly for every single token; a set gives constant-time membership tests.
english_stopwords = set(stopwords.words("english"))

# Keep tokens that are purely alphabetic and whose lowercase form is not a
# stopword. Note that isalpha() also drops tokens containing punctuation or
# digits (e.g. "U.K.", "$", "1", "2025"), not just stopwords.
filtered = [w for w in tokens if w.isalpha() and w.lower() not in english_stopwords]
print("After Stopword Removal:", filtered)

After Stopword Removal: ['Apple', 'looking', 'buying', 'startup', 'billion']


In [15]:
# POS Tagging
# Tag the complete token sequence first, then report only the tokens that
# survived stopword removal. The tagger relies on neighbouring words for
# context, so tagging the already-filtered list would feed it a sequence
# with the function words stripped out and can degrade the tags.
tagged_tokens = pos_tag(tokens)
kept_words = set(filtered)  # filtering is per-word, so membership reproduces `filtered` in order
print("POS Tags:", [(word, tag) for word, tag in tagged_tokens if word in kept_words])

# Legend for the tags shown in the output:
# NNP → Proper noun (singular)
# VBG → Verb (gerund/present participle)
# NN → Common noun (singular)
# CD → Cardinal number

POS Tags: [('Apple', 'NNP'), ('looking', 'VBG'), ('buying', 'VBG'), ('startup', 'NN'), ('billion', 'CD')]


In [10]:
# Named Entity Recognition
# Run the spaCy pipeline over the sample sentence, then collect every
# detected entity as a (surface text, label) pair before printing.
doc = nlp(text)
entity_pairs = [(span.text, span.label_) for span in doc.ents]
print("NER:", entity_pairs)

NER: [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY'), ('2025', 'DATE')]


In [14]:
# N-grams
# Print the 1-, 2- and 3-grams of the filtered tokens. Each heading is paired
# with its n-gram order so the three printouts share one loop body.
for heading, order in (("Unigram :", 1), ("\nBigrams:", 2), ("\nTrigrams:", 3)):
    print(heading, list(ngrams(filtered, order)))

Unigram : [('Apple',), ('looking',), ('buying',), ('startup',), ('billion',)]

Bigrams: [('Apple', 'looking'), ('looking', 'buying'), ('buying', 'startup'), ('startup', 'billion')]

Trigrams: [('Apple', 'looking', 'buying'), ('looking', 'buying', 'startup'), ('buying', 'startup', 'billion')]
