# Document Preprocessing and Representation
This notebook demonstrates document preprocessing techniques: tokenization, part-of-speech (POS) tagging, stop-word removal, stemming, and lemmatization. It then computes Term Frequency (TF) and Inverse Document Frequency (IDF) weights for the document as a TF-IDF representation.

In [None]:
# Install necessary libraries
!pip install nltk sklearn

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK data packages used by the cells below.
# NLTK 3.9 and later load the tokenizer and tagger models from 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; earlier releases look for 'punkt' and
# 'averaged_perceptron_tagger'. Requesting both sets keeps word_tokenize() and
# pos_tag() working regardless of which NLTK version is installed.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
):
    nltk.download(resource)

In [None]:
# Sample document: a single English sentence used as input by every later cell.
document = (
    "Natural language processing (NLP) is a field of artificial intelligence "
    "that gives machines the ability to read and understand human language."
)

In [None]:
# Tokenization: split the sample sentence into word and punctuation tokens.
# 'english' is word_tokenize's default language, spelled out here for clarity.
tokens = word_tokenize(document, language='english')
print('Tokens:', tokens)

In [None]:
# POS Tagging: pair every token with its part-of-speech tag.
# 'eng' is pos_tag's default language, spelled out here for clarity.
pos_tags = pos_tag(tokens, lang='eng')
print('POS Tags:', pos_tags)

In [None]:
# Stop words removal: keep only tokens whose lowercase form is not in NLTK's
# English stop-word list. The original casing of kept tokens is preserved.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)
print('Filtered Tokens:', filtered_tokens)

In [None]:
# Stemming: reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print('Stemmed Tokens:', stemmed_tokens)

In [None]:
# Lemmatization
# WordNetLemmatizer.lemmatize() treats every word as a noun unless told
# otherwise, so verbs and adjectives are not reduced to their base form.
# The POS tags computed earlier are therefore mapped to WordNet POS letters
# and passed along with each token.
lemmatizer = WordNetLemmatizer()


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to the WordNet POS letter used by lemmatize().

    Tags starting with 'J' map to adjective ('a'), 'V' to verb ('v') and
    'R' to adverb ('r'); anything else (including punctuation tags) falls
    back to noun ('n'), which is lemmatize()'s own default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# pos_tags was computed on the full sentence, so tags reflect the original
# context; the same stop-word test as above selects the same tokens, in the
# same order, as filtered_tokens.
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in pos_tags
    if word.lower() not in stop_words
]
print('Lemmatized Tokens:', lemmatized_tokens)

In [None]:
# Term Frequency and Inverse Document Frequency
# The corpus holds only the sample document, so every vocabulary term has the
# same document frequency and the IDF factor is identical across terms; the
# resulting weights differ only through term frequency.
corpus = [document]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# X is sparse; convert to a dense array so the weights print in full.
tfidf_dense = X.toarray()
print('TF-IDF Matrix:', tfidf_dense)

# Column i of the matrix corresponds to feature name i.
feature_names = vectorizer.get_feature_names_out()
print('Feature Names:', feature_names)