# -------------Experiment - 13-----------

# Write an NLP program to demonstrate the following tasks
## a. Tokenization, removal of stop words and punctuation, POS & NER tags
## b. Bag of Words, TF-IDF Vectorisation & Ngrams

## Import Libraries

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy

## Download required NLTK resources

In [13]:
# Fetch the NLTK data packages this notebook asks for; packages that are
# already present locally are reported as up-to-date rather than re-fetched.
for package in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'maxent_ne_chunker'):
    nltk.download(package)
# Kept as the cell's final expression so the notebook still echoes its result.
nltk.download('words')

[nltk_data] Downloading package punkt to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pramoda A S\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Pramoda
[nltk_data]     A S\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

## Sample Text

In [14]:
# Two-sentence sample that the following cells tokenize, tag and vectorize.
# The literal is triple-quoted, so the line break and the leading spaces of
# its second line are part of the string; that whitespace is what shows up
# as a separate SPACE token in the spaCy POS output.
text = """Apple is looking at buying U.K. startup for $1 billion. 
          Artificial Intelligence is the future of technology!"""

# a) Tokenization, Stop Words & Punctuation Removal
## ------------------------------------------------
## Tokenize text

In [15]:
# Split the sample text into word and punctuation tokens with NLTK, then show
# the heading and the resulting list on separate lines.
tokens = word_tokenize(text)
print("--- Tokens ---", tokens, sep="\n")

--- Tokens ---
['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.', 'Artificial', 'Intelligence', 'is', 'the', 'future', 'of', 'technology', '!']


## Remove stop words and punctuation

In [16]:
# Pattern for a run of one or more non-word characters or underscores.
punctuation_regex = re.compile(r'[\W_]+')
# NLTK's English stop-word list, stored as a set for membership tests.
stop_words = {*stopwords.words('english')}

In [17]:
# Keep a token only when it is not an English stop word (compared in lower
# case) and does not consist entirely of punctuation.
# `fullmatch` is used instead of `match`: `match` anchors only at the start of
# the token, so it would also discard tokens that merely begin with a
# punctuation character (a token such as ".NET") even though they carry text.
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
    and not punctuation_regex.fullmatch(token)
]
print("\n--- Tokens after Stopword & Punctuation Removal ---")
print(filtered_tokens)


--- Tokens after Stopword & Punctuation Removal ---
['Apple', 'looking', 'buying', 'U.K.', 'startup', '1', 'billion', 'Artificial', 'Intelligence', 'future', 'technology']


## POS Tagging & NER using spaCy

In [18]:
# Name of the spaCy pipeline to load (the small English model).
SPACY_MODEL = 'en_core_web_sm'

# Load the pipeline once and run it over the sample text; the resulting `doc`
# is what the POS-tagging and named-entity cells iterate over.
nlp = spacy.load(SPACY_MODEL)
doc = nlp(text)

## POS tagging

In [19]:
# List every token of the parsed document next to its part-of-speech tag
# (`pos_`), with the token text left-aligned in a 12-character column.
print("\n--- POS Tagging ---")
for token in doc:
    print("{0.text:<12} --> {0.pos_}".format(token))


--- POS Tagging ---
Apple        --> PROPN
is           --> AUX
looking      --> VERB
at           --> ADP
buying       --> VERB
U.K.         --> PROPN
startup      --> VERB
for          --> ADP
$            --> SYM
1            --> NUM
billion      --> NUM
.            --> PUNCT

            --> SPACE
Artificial   --> PROPN
Intelligence --> PROPN
is           --> AUX
the          --> DET
future       --> NOUN
of           --> ADP
technology   --> NOUN
!            --> PUNCT


## Named Entity Recognition

In [20]:
# List every entity span found in the parsed document next to its label.
# NOTE: the text column is padded to 15 characters, so an entity longer than
# that (e.g. "Artificial Intelligence") pushes its arrow out of alignment.
print("\n--- Named Entities ---")
for ent in doc.ents:
    print("{0.text:<15} --> {0.label_}".format(ent))


--- Named Entities ---
Apple           --> ORG
U.K.            --> GPE
$1 billion      --> MONEY
Artificial Intelligence --> PERSON


# b. Bag of Words, TF-IDF Vectorisation & Ngrams

## Bag of Words

In [21]:
# Build a bag-of-words count matrix, treating the whole sample text as a
# single document, then show the learned vocabulary followed by the counts.
# NOTE: the printed vocabulary is lower-cased and has no one-character tokens
# ("U.K." and "1" are absent) -- this comes from CountVectorizer's defaults.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform([text])
print(
    "\n--- Bag of Words ---",
    vectorizer.get_feature_names_out(),
    bow.toarray(),
    sep="\n",
)


--- Bag of Words ---
['apple' 'artificial' 'at' 'billion' 'buying' 'for' 'future'
 'intelligence' 'is' 'looking' 'of' 'startup' 'technology' 'the']
[[1 1 1 1 1 1 1 1 2 1 1 1 1 1]]


## TF-IDF

In [22]:
# Compute TF-IDF weights with the whole sample text as the only document.
# NOTE(review): with a one-document corpus every term gets the same IDF, so
# the weights are just normalised term counts ("is" occurs twice and scores
# exactly double the rest). Fit on several documents to see IDF take effect.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform([text])
print(
    "\n--- TF-IDF ---",
    tfidf_vectorizer.get_feature_names_out(),
    tfidf.toarray(),
    sep="\n",
)


--- TF-IDF ---
['apple' 'artificial' 'at' 'billion' 'buying' 'for' 'future'
 'intelligence' 'is' 'looking' 'of' 'startup' 'technology' 'the']
[[0.24253563 0.24253563 0.24253563 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563 0.48507125 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563]]


## N-grams (Bigrams and Trigrams)

In [23]:
# Count word bigrams and trigrams (ngram_range 2..3) over the sample text,
# again treated as one document, and show the n-gram vocabulary and counts.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 3))
ngrams = ngram_vectorizer.fit_transform([text])
print(
    "\n--- N-grams (Bigrams and Trigrams) ---",
    ngram_vectorizer.get_feature_names_out(),
    ngrams.toarray(),
    sep="\n",
)


--- N-grams (Bigrams and Trigrams) ---
['apple is' 'apple is looking' 'artificial intelligence'
 'artificial intelligence is' 'at buying' 'at buying startup'
 'billion artificial' 'billion artificial intelligence' 'buying startup'
 'buying startup for' 'for billion' 'for billion artificial' 'future of'
 'future of technology' 'intelligence is' 'intelligence is the'
 'is looking' 'is looking at' 'is the' 'is the future' 'looking at'
 'looking at buying' 'of technology' 'startup for' 'startup for billion'
 'the future' 'the future of']
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
