In [1]:
text1="natural Language Processing is a subfield of AI"
tag1="NLP"
text2="Computer Vision is a subfield of AI"
tag2="CV"

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the two texts and their corresponding tags
text1 = "Natural Language Processing is a subfield of AI"
tag1 = "NLP"
text2 = "Computer Vision is a subfield of AI"
tag2 = "CV"

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the texts and transform them into feature vectors
X = vectorizer.fit_transform([text1, text2])

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Get the frequency table as a matrix
frequency_table = X.toarray()

# Create a DataFrame to hold the frequency table
df = pd.DataFrame(frequency_table, columns=feature_names)

# Add the text and tag columns
df['Text'] = [text1, text2]
df['Tag'] = [tag1, tag2]

# Print the DataFrame
print(df)


   ai  computer  is  language  natural  of  processing  subfield  vision  \
0   1         0   1         1        1   1           1         1       0   
1   1         1   1         0        0   1           0         1       1   

                                              Text  Tag  
0  Natural Language Processing is a subfield of AI  NLP  
1              Computer Vision is a subfield of AI   CV  


In [7]:
df

Unnamed: 0,ai,computer,is,language,natural,of,processing,subfield,vision,Text,Tag
0,1,0,1,1,1,1,1,1,0,Natural Language Processing is a subfield of AI,NLP
1,1,1,1,0,0,1,0,1,1,Computer Vision is a subfield of AI,CV


**Enhancing Vectorizer with Lemmatizer**

In [2]:
import nltk
nltk.download(['punkt','wordnet']) #lexical DB for english lang #punkt- tokenizer moel fir various lang
nltk.download('omw-1.4')#open multilingual wordnet- 1.4- Wordnet for multiple languages

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
#pukt- tokenizer
#wordnet- lemmatizer
from nltk.stem import WordNetLemmatizer
lemm= WordNetLemmatizer()

print(lemm.lemmatize("mouse"))
print(lemm.lemmatize("feet"))
print(lemm.lemmatize("caring"))
print(lemm.lemmatize("misery"))
print(lemm.lemmatize("houses"))
print(lemm.lemmatize("joking"))

mouse
foot
caring
misery
house
joking


In [39]:
sentence="My grandma is very caring. The striped bats are hanging on their feet"

#Tokenization
li_words=nltk.word_tokenize(sentence)
print(li_words)

['My', 'grandma', 'is', 'very', 'caring', '.', 'The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet']


In [36]:
#lemmatization
output=[lemm.lemmatize(w,get_wordnet_pos_tag(w)) for w in li_words]
print(output)

P
N
V
R
V
.
D
V
N
V
V
I
P
N
['My', 'gandma', 'be', 'very', 'care', '.', 'The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot']


**Provide POs(Part Of Speech ) tag as second arument to lemmatize()**

In [12]:
print(lemm.lemmatize("caring","v"))

care


In [15]:
print(lemm.lemmatize("stripes","v"))
print(lemm.lemmatize("stripes","n"))

strip
stripe


In [17]:
print(lemm.lemmatize("hanging","v"))
print(lemm.lemmatize("are","v"))

hang
be


**WordNet lemmatizer with POS tag**

In [18]:
nltk.download('averaged_perception_tagger')

[nltk_data] Error loading averaged_perception_tagger: Package
[nltk_data]     'averaged_perception_tagger' not found in index


False

In [20]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [29]:

def get_wordnet_pos_tag(word):
  tag=nltk.pos_tag([word])
  return tag

get_wordnet_pos_tag("caring")
get_wordnet_pos_tag("beautiful")
get_wordnet_pos_tag("he")
get_wordnet_pos_tag("other")

[('other', 'JJ')]

In [34]:


from nltk.corpus.reader import wordnet
def get_wordnet_pos_tag(word):
  tag=nltk.pos_tag([word])[0][1][0]
  tag_dict={
      "J":wordnet.ADJ,
      "R":wordnet.ADV,
      "N":wordnet.NOUN,
      "V":wordnet.VERB
  }
  print(tag)
  return tag_dict.get(tag,wordnet.NOUN)#default tag-NOUN- when the word not classified as any predefined part of speech
get_wordnet_pos_tag("caring")
# get_wordnet_pos_tag("beautiful")
# get_wordnet_pos_tag("he")
# get_wordnet_pos_tag("other")

V


'v'

In [35]:
lemm=WordNetLemmatizer()
word="brave"
print(lemm.lemmatize(word,get_wordnet_pos_tag(word)))

N
brave


**Spacy Lemmatizer**

In [40]:
import spacy
sp_nlp=spacy.load('en_core_web_sm')#updated model name
sentence="My grandma is very caring. The striped bats are hanging on their feet"

#parse he sentence using the lang model-'en_core_web_
doc=sp_nlp(sentence)#various nlp tasks- tokenization happens
print(doc)

#extract lemma(base form) for each token in sentence
output=[token.lemma_ for token in doc]
print(output)

My grandma is very caring. The striped bats are hanging on their feet
['my', 'grandma', 'be', 'very', 'caring', '.', 'the', 'stripe', 'bat', 'be', 'hang', 'on', 'their', 'foot']
