In [14]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\NIKITA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\NIKITA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
doc = "Artificial Intelligence was introduced in 1956. Today, it is widely used in many fields such as healthcare, education, and robotics."


In [18]:
tokens = word_tokenize(doc)
print("Tokens:", tokens)


Tokens: ['Artificial', 'Intelligence', 'was', 'introduced', 'in', '1956', '.', 'Today', ',', 'it', 'is', 'widely', 'used', 'in', 'many', 'fields', 'such', 'as', 'healthcare', ',', 'education', ',', 'and', 'robotics', '.']


In [22]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\NIKITA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


POS Tags: [('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('was', 'VBD'), ('introduced', 'VBN'), ('in', 'IN'), ('1956', 'CD'), ('.', '.'), ('Today', 'NN'), (',', ','), ('it', 'PRP'), ('is', 'VBZ'), ('widely', 'RB'), ('used', 'VBN'), ('in', 'IN'), ('many', 'JJ'), ('fields', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('healthcare', 'NN'), (',', ','), ('education', 'NN'), (',', ','), ('and', 'CC'), ('robotics', 'NNS'), ('.', '.')]


In [24]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print("Filtered Tokens:", filtered_tokens)


Filtered Tokens: ['Artificial', 'Intelligence', 'introduced', '1956', '.', 'Today', ',', 'widely', 'used', 'many', 'fields', 'healthcare', ',', 'education', ',', 'robotics', '.']


In [26]:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered_tokens]
print("Stemmed:", stemmed)


Stemmed: ['artifici', 'intellig', 'introduc', '1956', '.', 'today', ',', 'wide', 'use', 'mani', 'field', 'healthcar', ',', 'educ', ',', 'robot', '.']


In [28]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w.lower()) for w in filtered_tokens]
print("Lemmatized:", lemmatized)


Lemmatized: ['artificial', 'intelligence', 'introduced', '1956', '.', 'today', ',', 'widely', 'used', 'many', 'field', 'healthcare', ',', 'education', ',', 'robotics', '.']


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [39]:
documents = [
    "Artificial Intelligence is used in healthcare.",
    "Robotics and AI are the future of technology.",
    "Machine learning is a branch of Artificial Intelligence."
]


In [41]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Feature names (terms)
print("Vocabulary Terms:", vectorizer.get_feature_names_out())

# TF-IDF Matrix
import pandas as pd
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(df)


Vocabulary Terms: ['ai' 'and' 'are' 'artificial' 'branch' 'future' 'healthcare' 'in'
 'intelligence' 'is' 'learning' 'machine' 'of' 'robotics' 'technology'
 'the' 'used']

TF-IDF Matrix:
         ai       and       are  artificial    branch    future  healthcare  \
0  0.000000  0.000000  0.000000    0.349498  0.000000  0.000000    0.459548   
1  0.363255  0.363255  0.363255    0.000000  0.000000  0.363255    0.000000   
2  0.000000  0.000000  0.000000    0.329928  0.433816  0.000000    0.000000   

         in  intelligence        is  learning   machine        of  robotics  \
0  0.459548      0.349498  0.349498  0.000000  0.000000  0.000000  0.000000   
1  0.000000      0.000000  0.000000  0.000000  0.000000  0.276265  0.363255   
2  0.000000      0.329928  0.329928  0.433816  0.433816  0.329928  0.000000   

   technology       the      used  
0    0.000000  0.000000  0.459548  
1    0.363255  0.363255  0.000000  
2    0.000000  0.000000  0.000000  
