# Write an NLP Program to demonstrate the following tasks

## a. Tokenization, removal of stop words and punctuation, POS & NER Tags
## b. Bag of Words, TF-IDF Vectorisation & Ngrams

## Import Libraries

In [2]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

import re
#import nltk
#from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy

## Download required NLTK resources

In [3]:
# Fetch the NLTK data packages this notebook asks for. Packages that are
# already installed locally are reported as up-to-date instead of being
# downloaded again.
#
# "punkt_tab" is requested in addition to "punkt": NLTK 3.8.2 and later load
# the tokenizer models behind word_tokenize from "punkt_tab", so without it the
# NLTK tokenization cell raises LookupError on a fresh install. Older NLTK
# releases keep using "punkt".
NLTK_PACKAGES = (
    "stopwords",
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "maxent_ne_chunker",
    "words",
)

for package in NLTK_PACKAGES:
    nltk.download(package)

[nltk_data] Downloading package stopwords to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pramoda A S\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Pramoda
[nltk_data]     A S\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
# The small English spaCy pipeline must be installed before this cell runs.
# Install it once from a shell with:
#     python -m spacy download en_core_web_sm
SPACY_MODEL = "en_core_web_sm"

# Load the pipeline; `nlp` is the callable used to process text in later cells.
nlp = spacy.load(SPACY_MODEL)

## Sample Text

In [5]:
# Two-sentence sample used by every later cell.
# The string is spelled out piece by piece so its whitespace is visible: the
# first sentence ends with a space and a newline, and the second sentence is
# preceded by ten spaces of indentation (spaCy reports that run as its own
# whitespace token).
text = (
    "Apple is looking at buying U.K. startup for $1 billion. \n"
    "          Artificial Intelligence is the future of technology!"
)

## a) Tokenization, Stop Words & Punctuation Removal

#   ------------------------------------------------

## Tokenize text

In [6]:
# ---------------- 1. Tokenization ----------------

# Run the spaCy pipeline once; `doc` is reused by the POS-tagging and NER cells.
doc = nlp(text)

# Collect the surface string of every token spaCy produced (whitespace runs
# such as the newline plus indentation in the sample show up as tokens too).
tokens = [piece.text for piece in doc]

print("\n1. Tokens:", tokens)


1. Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.', '\n          ', 'Artificial', 'Intelligence', 'is', 'the', 'future', 'of', 'technology', '!']


In [7]:
# Tokenize the same text with NLTK. This rebinds `tokens`, so the stop-word
# filtering below works on the NLTK tokens rather than the spaCy ones.
tokens = word_tokenize(text)

# Heading on one line, token list on the next.
print("1. Tokens:", tokens, sep="\n")


1. Tokens:
['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.', 'Artificial', 'Intelligence', 'is', 'the', 'future', 'of', 'technology', '!']


## Remove stop words and punctuation

In [8]:
# ---------------- 2. Stopword and Punctuation Removal ----------------

# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Keep a token only when it is purely alphabetic and its lower-cased form is
# not a stop word.
# NOTE: str.isalpha() rejects more than punctuation - numbers ("1"), symbols
# ("$") and dotted abbreviations ("U.K.") are dropped as well.
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]

# Labelled "2." to agree with this step's heading; "3." is the POS-tagging step.
print("\n2. After Stopword and Punctuation Removal:", filtered_tokens)


3. After Stopword and Punctuation Removal: ['Apple', 'looking', 'buying', 'startup', 'billion', 'Artificial', 'Intelligence', 'future', 'technology']


## POS Tagging & NER using Spacy
# ------------------------------

## POS tagging

In [10]:
# ---------------- 3. POS Tagging ----------------
#
# Legend for some of the coarse part-of-speech tags printed below:
#
#   PROPN  Proper noun - names of people, organizations, places, etc.
#          Example: Apple, New York, AI
#   AUX    Auxiliary (helping) verb - used with main verbs to form tenses,
#          moods and voices.
#          Example: is, was, have, will
#   VERB   Verb - action or state words.
#          Example: looking, buying, expand
#   ADP    Adposition (prepositions and postpositions) - shows a relationship
#          in time, place or direction.
#          Example: at, in, on, under
#   NOUN   Common noun - general things, not proper names.
#          Example: startup, business

print("\n3. POS Tagging:")
for tok in doc:
    # Left-justify the token text in a 15-character column so the tags line up.
    print(tok.text.ljust(15), "->", tok.pos_)



3. POS Tagging:
Apple           -> PROPN
is              -> AUX
looking         -> VERB
at              -> ADP
buying          -> VERB
U.K.            -> PROPN
startup         -> VERB
for             -> ADP
$               -> SYM
1               -> NUM
billion         -> NUM
.               -> PUNCT

               -> SPACE
Artificial      -> PROPN
Intelligence    -> PROPN
is              -> AUX
the             -> DET
future          -> NOUN
of              -> ADP
technology      -> NOUN
!               -> PUNCT


## Named Entity Recognition

In [12]:
# ---------------- 4. Named Entity Recognition (NER) ----------------

print("\n4. Named Entities:")
for entity in doc.ents:
    # Entity text is padded to 20 columns; longer spans simply overflow the
    # column instead of being cut off.
    print(entity.text.ljust(20), "->", entity.label_)


4. Named Entities:
Apple                -> ORG
U.K.                 -> GPE
$1 billion           -> MONEY
Artificial Intelligence -> PERSON


## -------------------------------
# Part B: Bag of Words & TF-IDF Vectorization
# -------------------------------

## Bag of Words

In [20]:
# Bag of Words: learn the vocabulary from the sample text (treated as a single
# document) and count how often each term occurs in it.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform([text])

# Vocabulary terms and the matching 1 x n_terms count matrix, densified for display.
bow_terms = vectorizer.get_feature_names_out()
bow_counts = bow.toarray()

print("\n--- Bag of Words ---", bow_terms, bow_counts, sep="\n")


--- Bag of Words ---
['apple' 'artificial' 'at' 'billion' 'buying' 'for' 'future'
 'intelligence' 'is' 'looking' 'of' 'startup' 'technology' 'the']
[[1 1 1 1 1 1 1 1 2 1 1 1 1 1]]


## TF-IDF

In [16]:
# TF-IDF over the same single-document corpus.
# NOTE: with only one document every term has the same IDF, so the weights
# printed here are just the term counts scaled to unit L2 length; fit on
# several documents to see IDF change the ranking.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform([text])

# Vocabulary terms and the matching 1 x n_terms weight matrix, densified for display.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_weights = tfidf.toarray()

print("\n--- TF-IDF ---", tfidf_terms, tfidf_weights, sep="\n")


--- TF-IDF ---
['apple' 'artificial' 'at' 'billion' 'buying' 'for' 'future'
 'intelligence' 'is' 'looking' 'of' 'startup' 'technology' 'the']
[[0.24253563 0.24253563 0.24253563 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563 0.48507125 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563]]


## N-grams (Bigrams and Trigrams)

In [19]:
# ---------------- 5. N-grams Generation ----------------

# Build n-grams of size 1, 2 and 3 over the stop-word-filtered tokens.
# Each n-gram is a tuple of n consecutive tokens.
unigrams, bigrams, trigrams = (
    list(ngrams(filtered_tokens, size)) for size in (1, 2, 3)
)

print("\n5. N-grams:\n")

# Print each list under its own heading, in increasing n-gram size.
headings = ("Unigrams:\n", "\n\nBigrams:\n", "\n\nTrigrams:\n")
for heading, grams in zip(headings, (unigrams, bigrams, trigrams)):
    print(heading, grams)


5. N-grams:

Unigrams:
 [('Apple',), ('looking',), ('buying',), ('startup',), ('billion',), ('Artificial',), ('Intelligence',), ('future',), ('technology',)]


Bigrams:
 [('Apple', 'looking'), ('looking', 'buying'), ('buying', 'startup'), ('startup', 'billion'), ('billion', 'Artificial'), ('Artificial', 'Intelligence'), ('Intelligence', 'future'), ('future', 'technology')]


Trigrams:
 [('Apple', 'looking', 'buying'), ('looking', 'buying', 'startup'), ('buying', 'startup', 'billion'), ('startup', 'billion', 'Artificial'), ('billion', 'Artificial', 'Intelligence'), ('Artificial', 'Intelligence', 'future'), ('Intelligence', 'future', 'technology')]
