In [61]:
import re
import nltk,spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [64]:
text="""Apple is looking ,buying U K . startup for 1$ billon.
      Artificial Intelligence is the feture techonology"""

In [65]:
# Tokenization 

tokens=word_tokenize(text)
print("--------Tokenization--------")
print(tokens)

--------Tokenization--------
['Apple', 'is', 'looking', ',', 'buying', 'U', 'K', '.', 'startup', 'for', '1', '$', 'billon', '.', 'Artificial', 'Intelligence', 'is', 'the', 'feture', 'techonology']


In [66]:
# Stop Words and Punctuations 

filtered=[w for w in tokens if w.isalpha() 
          and w.lower() not in stopwords.words()]
print("-------Removel Stop Words and Punctuations--------")
print("\n",filtered)

-------Removel Stop Words and Punctuations--------

 ['Apple', 'buying', 'startup', 'billon', 'Artificial', 'Intelligence', 'feture', 'techonology']


In [67]:
# POS Tagging 

nlp=spacy.load('en_core_web_sm')
doc=nlp(text)

In [68]:
print("--------POS Tagging-------\n")
for token in doc:
    print(f"{token.text:12}--> {token.pos_}")

--------POS Tagging-------

Apple       --> PROPN
is          --> AUX
looking     --> VERB
,           --> PUNCT
buying      --> VERB
U           --> NOUN
K           --> NOUN
.           --> PUNCT
startup     --> VERB
for         --> ADP
1           --> NUM
$           --> SYM
billon      --> NOUN
.           --> PUNCT

           --> SPACE
Artificial  --> PROPN
Intelligence--> PROPN
is          --> AUX
the         --> DET
feture      --> NOUN
techonology --> NOUN


In [69]:
# Name Entity Recognition 

print("----Name Entity Recognition-----\n")
for ent in doc.ents:
    print(f"{ent.text:<12} --> {ent.label_}")

----Name Entity Recognition-----

Apple        --> ORG
U K          --> ORG
1$           --> MONEY
Artificial Intelligence --> ORG


In [71]:
# Bag of words 

vect=CountVectorizer()
bow=vect.fit_transform([text])
print(vect.get_feature_names_out())
print(bow.toarray())

['apple' 'artificial' 'billon' 'buying' 'feture' 'for' 'intelligence' 'is'
 'looking' 'startup' 'techonology' 'the']
[[1 1 1 1 1 1 1 2 1 1 1 1]]


In [72]:
# TF - IDF 

tf_idf=TfidfVectorizer()
tf=tf_idf.fit_transform([text])
print(tf_idf.get_feature_names_out())
print(tf.toarray())

['apple' 'artificial' 'billon' 'buying' 'feture' 'for' 'intelligence' 'is'
 'looking' 'startup' 'techonology' 'the']
[[0.25819889 0.25819889 0.25819889 0.25819889 0.25819889 0.25819889
  0.25819889 0.51639778 0.25819889 0.25819889 0.25819889 0.25819889]]


In [73]:
# N-grams

print("Unigram is :\n",list(ngrams(filtered,1)))
print("\nBigrams is :\n",list(ngrams(filtered,2)))
print("\nTrigrams is :\n",list(ngrams(filtered,3)))

Unigram is :
 [('Apple',), ('buying',), ('startup',), ('billon',), ('Artificial',), ('Intelligence',), ('feture',), ('techonology',)]

Bigrams is :
 [('Apple', 'buying'), ('buying', 'startup'), ('startup', 'billon'), ('billon', 'Artificial'), ('Artificial', 'Intelligence'), ('Intelligence', 'feture'), ('feture', 'techonology')]

Trigrams is :
 [('Apple', 'buying', 'startup'), ('buying', 'startup', 'billon'), ('startup', 'billon', 'Artificial'), ('billon', 'Artificial', 'Intelligence'), ('Artificial', 'Intelligence', 'feture'), ('Intelligence', 'feture', 'techonology')]
