In [1]:
# Cell 1: Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Cell 2: Download necessary resources for NLTK
# Each call fetches one data package into the local nltk_data directory.
nltk.download('punkt')
# Newer NLTK releases (3.9+) load the tokenizer models from the pickle-free
# 'punkt_tab' package, so word_tokenize needs it in addition to 'punkt'.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases (3.9+) look up the English tagger under this
# language-suffixed package name when pos_tag is called.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\suraj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\suraj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\suraj\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suraj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Cell 3: Load the sample document
# Short two-sentence passage about NLP used as the input text for the cells below.
sample_document = """
Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on 
the interaction between computers and humans through natural language. The ultimate objective 
of NLP is to enable computers to understand, interpret, and generate human language in a way 
that is both meaningful and useful.
"""

# Echo the raw text under a heading (one call, newline-separated).
print("Sample Document:", sample_document, sep="\n")


Sample Document:

Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on 
the interaction between computers and humans through natural language. The ultimate objective 
of NLP is to enable computers to understand, interpret, and generate human language in a way 
that is both meaningful and useful.



In [4]:
# Cell 4: Tokenization
# Break the raw text into word and punctuation tokens with NLTK's tokenizer.
tokens = word_tokenize(sample_document)
print("Tokenization:", tokens, sep="\n")


Tokenization:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.', 'The', 'ultimate', 'objective', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', 'in', 'a', 'way', 'that', 'is', 'both', 'meaningful', 'and', 'useful', '.']


In [5]:
# Cell 5: POS Tagging
# Pair every token with its part-of-speech tag.
pos_tags = pos_tag(tokens)
print("POS Tagging:", pos_tags, sep="\n")


POS Tagging:
[('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('(', '('), ('AI', 'NNP'), (')', ')'), ('that', 'WDT'), ('focuses', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('interaction', 'NN'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('humans', 'NNS'), ('through', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('.', '.'), ('The', 'DT'), ('ultimate', 'JJ'), ('objective', 'NN'), ('of', 'IN'), ('NLP', 'NNP'), ('is', 'VBZ'), ('to', 'TO'), ('enable', 'JJ'), ('computers', 'NNS'), ('to', 'TO'), ('understand', 'VB'), (',', ','), ('interpret', 'VB'), (',', ','), ('and', 'CC'), ('generate', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('in', 'IN'), ('a', 'DT'), ('way', 'NN'), ('that', 'WDT'), ('is', 'VBZ'), ('both', 'DT'), ('meaningful', 'JJ'), ('and', 'CC'), ('useful', 'JJ'), ('.', '.')]


In [6]:
# Cell 6: Stopwords Removal
# Keep only the tokens whose lowercase form is not an English stopword.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda token: token.lower() not in stop_words, tokens))
print("Tokens after Stopwords Removal:", filtered_tokens, sep="\n")


Tokens after Stopwords Removal:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', '(', 'AI', ')', 'focuses', 'interaction', 'computers', 'humans', 'natural', 'language', '.', 'ultimate', 'objective', 'NLP', 'enable', 'computers', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', 'way', 'meaningful', 'useful', '.']


In [7]:
# Cell 7: Stemming
# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("Stemmed Tokens:", stemmed_tokens, sep="\n")


Stemmed Tokens:
['natur', 'languag', 'process', '(', 'nlp', ')', 'field', 'artifici', 'intellig', '(', 'ai', ')', 'focus', 'interact', 'comput', 'human', 'natur', 'languag', '.', 'ultim', 'object', 'nlp', 'enabl', 'comput', 'understand', ',', 'interpret', ',', 'gener', 'human', 'languag', 'way', 'meaning', 'use', '.']


In [8]:
# Cell 8: Lemmatization
def _wordnet_pos(treebank_tag):
    """Translate a Penn Treebank tag into the POS letter expected by the lemmatizer.

    Args:
        treebank_tag: Tag string as produced by ``pos_tag`` (e.g. 'VBZ', 'JJ').

    Returns:
        'a' for adjectives (J*), 'v' for verbs (V*), 'r' for adverbs (R*),
        and 'n' (noun, the lemmatizer's own default) for every other tag.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


lemmatizer = WordNetLemmatizer()

# lemmatize() treats every word as a noun unless told otherwise, so pass the
# part of speech along. The tags from Cell 5 were computed on the full
# sentence; drop stopwords from them with the same test Cell 6 uses.
tagged_filtered = [(token, tag) for token, tag in pos_tags if token.lower() not in stop_words]
# Guard against the two filters drifting apart if either cell is edited.
assert [token for token, _ in tagged_filtered] == filtered_tokens

lemmatized_tokens = [lemmatizer.lemmatize(token, _wordnet_pos(tag)) for token, tag in tagged_filtered]
print("Lemmatized Tokens:")
print(lemmatized_tokens)


Lemmatized Tokens:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', '(', 'AI', ')', 'focus', 'interaction', 'computer', 'human', 'natural', 'language', '.', 'ultimate', 'objective', 'NLP', 'enable', 'computer', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', 'way', 'meaningful', 'useful', '.']


In [9]:
# Cell 9: Calculate Term Frequency and Inverse Document Frequency (TF-IDF)
# NOTE: the corpus holds a single document, so every term has the same
# document frequency and therefore the same IDF. The weights below only
# reflect normalized term counts; add more documents for IDF to matter.
documents = [sample_document]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("TF-IDF Representation:")
print(tfidf_matrix.toarray())

# Label each column with its term so the weights are interpretable.
# vocabulary_ maps term -> column index of the matrix.
term_weights = tfidf_matrix.toarray()[0]
print("TF-IDF weight per term:")
for term, column in sorted(tfidf_vectorizer.vocabulary_.items(), key=lambda item: item[1]):
    print("{}: {:.4f}".format(term, term_weights[column]))


TF-IDF Representation:
[[0.1132277  0.33968311 0.1132277  0.1132277  0.1132277  0.22645541
  0.1132277  0.1132277  0.1132277  0.1132277  0.1132277  0.1132277
  0.1132277  0.1132277  0.1132277  0.1132277  0.33968311 0.33968311
  0.1132277  0.22645541 0.22645541 0.1132277  0.22645541 0.1132277
  0.1132277  0.22645541 0.22645541 0.1132277  0.22645541 0.1132277
  0.1132277  0.1132277  0.1132277 ]]
