# Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency.


In [6]:
# Step 1: Install and Import Necessary Libraries

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet_tab')
nltk.download('omw-1.4_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Error loading wordnet_tab: Package 'wordnet_tab' not

False

In [7]:
# Step 2: Document Preprocessing
# We will take a sample document/text and perform the operation.

# Sample Document
text = "The quick brown foxes are jumping over the lazy dogs. Data Analytics is a great field to study."

# 1. Tokenization (Breaking text into words)
tokens = word_tokenize(text)
print("Tokens:", tokens)

# 2. POS Tagging (Identifying Parts of Speech)
pos_tags = pos_tag(tokens)
print("\nPOS Tags:", pos_tags)

# 3. Stop Words Removal (Removing common words like 'the', 'is', 'are')
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words and w.isalpha()]
print("\nAfter Stop Words Removal:", filtered_tokens)

# 4. Stemming (Cutting words to their root form)
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(w) for w in filtered_tokens]
print("\nAfter Stemming:", stemmed_words)

# 5. Lemmatization (Reducing words to meaningful base forms)
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_tokens]
print("\nAfter Lemmatization:", lemmatized_words)

Tokens: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', '.', 'Data', 'Analytics', 'is', 'a', 'great', 'field', 'to', 'study', '.']

POS Tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('foxes', 'NNS'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('.', '.'), ('Data', 'NNP'), ('Analytics', 'NNPS'), ('is', 'VBZ'), ('a', 'DT'), ('great', 'JJ'), ('field', 'NN'), ('to', 'TO'), ('study', 'VB'), ('.', '.')]

After Stop Words Removal: ['quick', 'brown', 'foxes', 'jumping', 'lazy', 'dogs', 'Data', 'Analytics', 'great', 'field', 'study']

After Stemming: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'data', 'analyt', 'great', 'field', 'studi']

After Lemmatization: ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog', 'Data', 'Analytics', 'great', 'field', 'study']


In [8]:
# Step 3: TF-IDF Representation

# Sample Corpus (collection of documents)
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "Data Analytics is about extracting insights from data",
    "The dog and the fox are friends"
]

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer()

# Calculate TF-IDF
result = tfidf.fit_transform(corpus)

# Convert to a readable DataFrame
tfidf_df = pd.DataFrame(result.toarray(), columns=tfidf.get_feature_names_out())

print("\n--- TF-IDF Representation ---")
display(tfidf_df)


--- TF-IDF Representation ---


Unnamed: 0,about,analytics,and,are,brown,data,dog,extracting,fox,friends,from,insights,is,jumps,lazy,over,quick,the
0,0.0,0.0,0.0,0.0,0.343596,0.0,0.261314,0.0,0.261314,0.0,0.0,0.0,0.0,0.343596,0.343596,0.343596,0.343596,0.522627
1,0.316228,0.316228,0.0,0.0,0.0,0.632456,0.0,0.316228,0.0,0.0,0.316228,0.316228,0.316228,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.393129,0.393129,0.0,0.0,0.298984,0.0,0.298984,0.393129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.597969
