In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [6]:
# Sample Document
document = """Natural Language Processing (NLP) is a sub-field of Artificial Intelligence (AI) that deals with the interaction 
between computers and human language. It involves various tasks such as text analysis, speech recognition, and machine translation."""

In [7]:
# Tokenization
tokens = word_tokenize(document)
print("Tokenized Words:", tokens)

Tokenized Words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'sub-field', 'of', 'Artificial', 'Intelligence', '(', 'AI', ')', 'that', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'human', 'language', '.', 'It', 'involves', 'various', 'tasks', 'such', 'as', 'text', 'analysis', ',', 'speech', 'recognition', ',', 'and', 'machine', 'translation', '.']


In [8]:
# POS Tagging
pos_tags = nltk.pos_tag(tokens)
print("\nPOS Tags:", pos_tags)


POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('sub-field', 'NN'), ('of', 'IN'), ('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('(', '('), ('AI', 'NNP'), (')', ')'), ('that', 'IN'), ('deals', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('interaction', 'NN'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('various', 'JJ'), ('tasks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('text', 'JJ'), ('analysis', 'NN'), (',', ','), ('speech', 'NN'), ('recognition', 'NN'), (',', ','), ('and', 'CC'), ('machine', 'NN'), ('translation', 'NN'), ('.', '.')]


In [9]:
# Stopwords Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in string.punctuation]
print("\nFiltered Tokens (Stopwords Removed):", filtered_tokens)


Filtered Tokens (Stopwords Removed): ['Natural', 'Language', 'Processing', 'NLP', 'sub-field', 'Artificial', 'Intelligence', 'AI', 'deals', 'interaction', 'computers', 'human', 'language', 'involves', 'various', 'tasks', 'text', 'analysis', 'speech', 'recognition', 'machine', 'translation']


In [10]:
# Stemming (using Porter Stemmer)
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nStemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'sub-field', 'artifici', 'intellig', 'ai', 'deal', 'interact', 'comput', 'human', 'languag', 'involv', 'variou', 'task', 'text', 'analysi', 'speech', 'recognit', 'machin', 'translat']


In [11]:
# Lemmatization (using WordNet Lemmatizer)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nLemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'sub-field', 'Artificial', 'Intelligence', 'AI', 'deal', 'interaction', 'computer', 'human', 'language', 'involves', 'various', 'task', 'text', 'analysis', 'speech', 'recognition', 'machine', 'translation']


In [12]:
# Term Frequency (TF)
fdist = FreqDist(filtered_tokens)
print("\nTerm Frequency (TF):")
for word, freq in fdist.items():
    print(f"{word}: {freq}")


Term Frequency (TF):
Natural: 1
Language: 1
Processing: 1
NLP: 1
sub-field: 1
Artificial: 1
Intelligence: 1
AI: 1
deals: 1
interaction: 1
computers: 1
human: 1
language: 1
involves: 1
various: 1
tasks: 1
text: 1
analysis: 1
speech: 1
recognition: 1
machine: 1
translation: 1


In [13]:
# Inverse Document Frequency (IDF)
# For IDF calculation, let's assume a small corpus with multiple documents.
# We will use a dummy corpus of 3 documents.
corpus = [
    "Natural Language Processing is a field of study.",
    "NLP and AI are related fields.",
    "Text analysis is a key task in NLP."
]

In [14]:
# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# Show the TF-IDF values for the words in the document
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Get the feature names (words in the vocabulary)
print("\nFeature Names (Words in the Corpus):")
print(vectorizer.get_feature_names_out())


TF-IDF Matrix:
[[0.         0.         0.4472136  0.         0.         0.4472136
  0.4472136  0.         0.4472136  0.         0.4472136  0.
  0.        ]
 [0.52863461 0.         0.         0.52863461 0.         0.
  0.         0.40204024 0.         0.52863461 0.         0.
  0.        ]
 [0.         0.46735098 0.         0.         0.46735098 0.
  0.         0.35543247 0.         0.         0.         0.46735098
  0.46735098]]

Feature Names (Words in the Corpus):
['ai' 'analysis' 'field' 'fields' 'key' 'language' 'natural' 'nlp'
 'processing' 'related' 'study' 'task' 'text']
