# Write an NLP program to demonstrate the following tasks

## a. Tokenization, removal of stop words and punctuation, POS & NER tags
## b. Bag of Words, TF-IDF Vectorisation & Ngrams

## Import Libraries

In [1]:
import re 
import nltk 
import spacy 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## Download required NLTK resources

In [12]:
# Fetch every NLTK resource this notebook downloads. On a fresh kernel this
# cell must run before the tokenization and stop-word cells.
# "punkt_tab" is the tokenizer data word_tokenize loads on NLTK >= 3.9;
# "punkt" is kept so older NLTK releases keep working.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "stopwords",
    "averaged_perceptron_tagger",
    "maxent_ne_chunker",
    "words",
)
# A list (not a generator) is passed to all() so every download is attempted
# even if an earlier one fails; the cell displays True only when all succeed.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pramoda A S\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Pramoda
[nltk_data]     A S\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Pramoda A
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Sample Text

In [2]:
# Two-sentence sample used by every later cell. The line break and the
# indentation in front of the second sentence are part of the string value.
text = (
    "Apple is looking at buying U.K. startup for $1 billion. \n"
    "          Artificial Intelligence is the future of technology!"
)

## Tokenize text

In [3]:
# Split the sample text into word and punctuation tokens with NLTK.
tokens = word_tokenize(text)
print("---Tokens----", tokens, sep="\n")

---Tokens----
['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.', 'Artificial', 'Intelligence', 'is', 'the', 'future', 'of', 'technology', '!']


## Remove stop words and punctuation

In [4]:
# Build the stop-word lookup once. stopwords.words() returns a list, so calling
# it inside the comprehension re-evaluated it and scanned it for every token.
english_stopwords = set(stopwords.words("english"))

# isalpha() keeps purely alphabetic tokens, which removes punctuation and
# numbers (and also dotted abbreviations such as 'U.K.'); the lower-cased
# token is then checked against the stop-word set.
filtered = [
    w for w in tokens
    if w.isalpha() and w.lower() not in english_stopwords
]
print("After stopwords removel :", filtered)

After stopwords removel : ['Apple', 'looking', 'buying', 'startup', 'billion', 'Artificial', 'Intelligence', 'future', 'technology']


## POS tagging

In [5]:
# Load the spaCy pipeline that the POS and NER cells below use.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Run the sample text through the pipeline; `doc` is reused for POS and NER.
doc = nlp(text)

In [7]:
# Print one row per token: the token text left-justified to 12 columns,
# followed by its part-of-speech label.
print("POS Tagging")
for token in doc:
    print(token.text.ljust(12), "-->" + token.pos_)

POS Tagging
Apple        -->PROPN
is           -->AUX
looking      -->VERB
at           -->ADP
buying       -->VERB
U.K.         -->PROPN
startup      -->VERB
for          -->ADP
$            -->SYM
1            -->NUM
billion      -->NUM
.            -->PUNCT

            -->SPACE
Artificial   -->PROPN
Intelligence -->PROPN
is           -->AUX
the          -->DET
future       -->NOUN
of           -->ADP
technology   -->NOUN
!            -->PUNCT


## Named Entity Recognition

In [8]:
# Print each entity span found in `doc` next to its entity label.
print("-----Named Entity Recognition------\n")
for ent in doc.ents:
    print(ent.text.ljust(12), ent.label_, sep="  ")

-----Named Entity Recognition------

Apple         ORG
U.K.          GPE
$1 billion    MONEY
Artificial Intelligence  PERSON


## Bag of Words

In [9]:
# Bag of Words: the whole sample text is treated as a single document, so the
# result is one row of term counts over the learned vocabulary.
vet = CountVectorizer()
bow = vet.fit_transform([text])
print(vet.get_feature_names_out(), bow.toarray(), sep="\n")

['apple' 'artificial' 'at' 'billion' 'buying' 'for' 'future'
 'intelligence' 'is' 'looking' 'of' 'startup' 'technology' 'the']
[[1 1 1 1 1 1 1 1 2 1 1 1 1 1]]


## TF-IDF

In [10]:
# TF-IDF over the same single-document corpus. With only one document every
# term gets the same IDF, so the row is just the L2-normalised term counts
# ('is' occurs twice and therefore scores double).
tf_idf = TfidfVectorizer()
tfidf = tf_idf.fit_transform([text])
print(tf_idf.get_feature_names_out(), tfidf.toarray(), sep="\n")

['apple' 'artificial' 'at' 'billion' 'buying' 'for' 'future'
 'intelligence' 'is' 'looking' 'of' 'startup' 'technology' 'the']
[[0.24253563 0.24253563 0.24253563 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563 0.48507125 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563]]


## N-grams (Unigrams, Bigrams and Trigrams)

In [11]:
# Build 1-, 2- and 3-grams from the stop-word-filtered tokens. The tokens form
# one flat list, so n-grams can span the boundary between the two sentences.
for label, n in (
    ("Unigram :", 1),
    ("\n\nBigrams is :", 2),
    ("\n\nTrigrams is :", 3),
):
    print(label, list(ngrams(filtered, n)))

Unigram : [('Apple',), ('looking',), ('buying',), ('startup',), ('billion',), ('Artificial',), ('Intelligence',), ('future',), ('technology',)]


Bigrams is : [('Apple', 'looking'), ('looking', 'buying'), ('buying', 'startup'), ('startup', 'billion'), ('billion', 'Artificial'), ('Artificial', 'Intelligence'), ('Intelligence', 'future'), ('future', 'technology')]


Trigrams is : [('Apple', 'looking', 'buying'), ('looking', 'buying', 'startup'), ('buying', 'startup', 'billion'), ('startup', 'billion', 'Artificial'), ('billion', 'Artificial', 'Intelligence'), ('Artificial', 'Intelligence', 'future'), ('Intelligence', 'future', 'technology')]
