In [None]:
# Importing necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [None]:

# Sample document
# One-sentence corpus used by every later step; adjacent string literals are
# concatenated by Python, so this is a single sentence with no line breaks.
document = (
    "Natural language processing (NLP) is a field of computer science, "
    "artificial intelligence, and computational linguistics concerned with "
    "the interactions between computers and human (natural) languages."
)

In [None]:
# Tokenization
# `nltk` is already imported in the first cell, so it is not re-imported here.
# Recent NLTK releases (3.9+) load the sentence tokenizer from 'punkt_tab',
# while older releases read 'punkt'. Fetching both keeps word_tokenize()
# working regardless of the installed version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split the sample document into word and punctuation tokens
tokens = word_tokenize(document)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# POS tagging
# `nltk` is already imported in the first cell, so it is not re-imported here.
# Recent NLTK releases (3.9+) look up 'averaged_perceptron_tagger_eng', while
# older releases use 'averaged_perceptron_tagger'. Download both so pos_tag()
# finds its model on either version.
for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# List of (token, tag) pairs for the full, unfiltered token sequence
pos = pos_tag(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Stop words removal
# `nltk` is already imported in the first cell, so it is not re-imported here.
nltk.download('stopwords')

# Hold the English stop words in a set so each membership check is a hash lookup
stop_words = set(stopwords.words("english"))

# Compare case-insensitively; tokens that survive keep their original casing.
# Punctuation tokens are not stop words and therefore pass through unchanged.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Stemming
# Run every stop-word-filtered token through the Porter stemmer, in order
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

In [None]:
# Lemmatization
# `nltk` is already imported in the first cell, so it is not re-imported here.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer.lemmatize() treats every word as a noun unless told
# otherwise, which leaves verbs and adjectives (e.g. "concerned") untouched.
# Map the first letter of each Penn Treebank tag to the WordNet POS code;
# anything else falls back to 'n', the lemmatizer's own default.
_TREEBANK_TO_WORDNET = {'J': 'a', 'V': 'v', 'R': 'r'}

# Reuse the tags computed on the full sentence (`pos`) so the tagger saw the
# stop words as context, then keep only tokens that survived filtering.
# Whether a token is kept depends only on its text, so this yields the same
# tokens in the same order as `filtered_tokens`.
_kept_tokens = set(filtered_tokens)
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_TREEBANK_TO_WORDNET.get(tag[:1], 'n'))
    for token, tag in pos
    if token in _kept_tokens
]

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Printing the results
# Each entry pairs a heading with the value produced by the matching step.
# Headings carry their own newlines, so the printed spacing is unchanged.
report_sections = (
    ("Original Document: \n", document),
    ("\nTokenization: \n", tokens),
    ("\nPOS Tagging: \n", pos),
    ("\nStop Words Removal: \n", filtered_tokens),
    ("\nStemming: \n", stemmed_tokens),
    ("\nLemmatization: \n", lemmatized_tokens),
)
for heading, value in report_sections:
    print(heading, value)

Original Document: 
 Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages.

Tokenization: 
 ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', ',', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', '.']

POS Tagging: 
 [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('computer', 'NN'), ('science', 'NN'), (',', ','), ('artificial', 'JJ'), ('intelligence', 'NN'), (',', ','), ('and', 'CC'), ('computational', 'JJ'), ('linguistics', 'NNS'), ('concerned', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('interactions', 'NNS'), ('between', 'IN'), ('compu

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Creating a TfidfVectorizer object
# No constructor arguments are passed, so the vectorizer runs with its
# library defaults for tokenization, lowercasing, and weighting.
vectorizer = TfidfVectorizer()

In [None]:
# Applying the vectorizer to the document
# fit_transform expects an iterable of documents, so the single sample
# sentence is wrapped in a one-element list. With only one document the IDF
# factor is the same for every term, so the scores reduce to normalised term
# counts (terms occurring twice score exactly double those occurring once).
corpus = [document]
vector = vectorizer.fit_transform(corpus)

In [None]:
# Extracting the feature names and tf-idf scores
# Densify the result and take row 0, the only document that was fitted
dense_scores = vector.toarray()
tfidf_scores = dense_scores[0]
# Vocabulary terms, in the same column order as the score row
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Printing the feature names and tf-idf scores
print("Term Frequency-Inverse Document Frequency (TF-IDF):")
# Walk terms and scores in lockstep; both sequences share the same ordering
for term, score in zip(feature_names, tfidf_scores):
    print(term, ":", score)

Term Frequency-Inverse Document Frequency (TF-IDF):
and : 0.3779644730092272
artificial : 0.1889822365046136
between : 0.1889822365046136
computational : 0.1889822365046136
computer : 0.1889822365046136
computers : 0.1889822365046136
concerned : 0.1889822365046136
field : 0.1889822365046136
human : 0.1889822365046136
intelligence : 0.1889822365046136
interactions : 0.1889822365046136
is : 0.1889822365046136
language : 0.1889822365046136
languages : 0.1889822365046136
linguistics : 0.1889822365046136
natural : 0.3779644730092272
nlp : 0.1889822365046136
of : 0.1889822365046136
processing : 0.1889822365046136
science : 0.1889822365046136
the : 0.1889822365046136
with : 0.1889822365046136
