In [111]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer

In [112]:
# Sample document: two lines of text joined by a single newline.
# NOTE: the closing "Runing eat ate" follows the full stop with no space
# ("language.Runing"), so the tokenizer output keeps that as one token.
sample_document = (
    "Runing eat ate Natural language processing (NLP) is a field of "
    "artificial intelligence that focuses on the \n"
    "interaction between computers and humans using natural "
    "language.Runing eat ate"
)

#sentence = "Hello I am Gayatri Deshmukh. I am from Nanded District. I will be an Engineer in few months."

## Tokenization

In [113]:
# Tokenization: break the sample text into word and punctuation tokens
# (English is the tokenizer's default language; spelled out here for clarity).
tokens = word_tokenize(sample_document, language="english")
tokens

['Runing',
 'eat',
 'ate',
 'Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'artificial',
 'intelligence',
 'that',
 'focuses',
 'on',
 'the',
 'interaction',
 'between',
 'computers',
 'and',
 'humans',
 'using',
 'natural',
 'language.Runing',
 'eat',
 'ate']

## POS Tagging

In [114]:
# POS Tagging: pair each token with its part-of-speech tag, producing a
# list of (token, tag) tuples in the same order as `tokens`.
pos_tags = pos_tag(tokens, lang="eng")
pos_tags

[('Runing', 'VBG'),
 ('eat', 'NN'),
 ('ate', 'JJ'),
 ('Natural', 'NNP'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('field', 'NN'),
 ('of', 'IN'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('that', 'WDT'),
 ('focuses', 'VBZ'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('interaction', 'NN'),
 ('between', 'IN'),
 ('computers', 'NNS'),
 ('and', 'CC'),
 ('humans', 'NNS'),
 ('using', 'VBG'),
 ('natural', 'JJ'),
 ('language.Runing', 'VBG'),
 ('eat', 'NN'),
 ('ate', 'NN')]

## Stop Words Removal

In [115]:
# Stop Words Removal
# The NLTK English stop-word list is lowercase, so each token is lowercased
# for the membership test only; otherwise capitalised stop words such as
# "The" or "Is" would slip through. Kept tokens retain their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

filtered_tokens

['Runing',
 'eat',
 'ate',
 'Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'field',
 'artificial',
 'intelligence',
 'focuses',
 'interaction',
 'computers',
 'humans',
 'using',
 'natural',
 'language.Runing',
 'eat',
 'ate']

## Stemming

In [116]:
# Stemming: reduce every stop-word-filtered token to its Porter stem.
# As the displayed result shows, stems are lowercased and need not be
# dictionary words (e.g. 'natur', 'languag').
porter_stemmer = PorterStemmer()
stemmed_tokens = [porter_stemmer.stem(token) for token in filtered_tokens]

stemmed_tokens

['rune',
 'eat',
 'ate',
 'natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'field',
 'artifici',
 'intellig',
 'focus',
 'interact',
 'comput',
 'human',
 'use',
 'natur',
 'language.run',
 'eat',
 'ate']

## Lemmatization

In [117]:
# Lemmatization: map every stop-word-filtered token to its WordNet lemma.
# NOTE(review): no part-of-speech is passed to lemmatize(), and the displayed
# result leaves verb forms such as 'ate' and 'using' unchanged -- feeding in
# the POS tags computed earlier would presumably fix that; confirm against
# the NLTK documentation before changing.
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(wordnet_lemmatizer.lemmatize, filtered_tokens))

lemmatized_tokens

['Runing',
 'eat',
 'ate',
 'Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'field',
 'artificial',
 'intelligence',
 'focus',
 'interaction',
 'computer',
 'human',
 'using',
 'natural',
 'language.Runing',
 'eat',
 'ate']

## Create representation using TF-IDF

In [118]:
# Corpus containing only the one sample document. The vectorizer is fitted on
# the raw text, not on the filtered/stemmed/lemmatized token lists.
# With a single document every term shares the same IDF, so the weights
# reduce to normalised term counts (in the displayed output, terms occurring
# twice score exactly twice as high as terms occurring once).
documents = [sample_document]

# create the vectorizer (TfidfVectorizer comes from
# sklearn.feature_extraction.text and must be imported in the imports cell)
tfidf = TfidfVectorizer()

# get TF-IDF values: learn the vocabulary, then transform the corpus into a
# sparse document-term matrix; feature_names gives the column order
result = tfidf.fit_transform(documents)
feature_names = tfidf.get_feature_names_out()

print("\nTF-IDF Representation:",result.toarray())
print("\nFeature Names:", feature_names)


TF-IDF Representation: [[0.16439899 0.16439899 0.32879797 0.16439899 0.16439899 0.32879797
  0.16439899 0.16439899 0.16439899 0.16439899 0.16439899 0.16439899
  0.32879797 0.32879797 0.16439899 0.16439899 0.16439899 0.16439899
  0.32879797 0.16439899 0.16439899 0.16439899]]

Feature Names: ['and' 'artificial' 'ate' 'between' 'computers' 'eat' 'field' 'focuses'
 'humans' 'intelligence' 'interaction' 'is' 'language' 'natural' 'nlp'
 'of' 'on' 'processing' 'runing' 'that' 'the' 'using']
