In [2]:
# -*- coding: utf-8 -*-
# DSBDA_PR7: Text Analytics

# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

# 1) Document Preprocessing
# Sample Document
document = "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence."

print("Original Document:\n", document)

# Tokenization
tokens = word_tokenize(document)
print("\nTokens:\n", tokens)

# POS Tagging
pos_tags = pos_tag(tokens)
print("\nPOS Tags:\n", pos_tags)

# Stopwords Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
print("\nAfter Stopwords Removal:\n", filtered_tokens)

# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("\nAfter Stemming:\n", stemmed_words)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nAfter Lemmatization:\n", lemmatized_words)

# 2) TF, IDF, TF-IDF Calculation (Manual)

# Sample Corpus
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1

numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

def computeIDF(documents):
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

# Calculate TF, IDF, TF-IDF
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)
idfs = computeIDF([numOfWordsA, numOfWordsB])
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

# Display in DataFrame
df_manual = pd.DataFrame([tfidfA, tfidfB])
print("\nManual TF-IDF DataFrame:\n", df_manual)

# 3) TF-IDF using TfidfVectorizer (Scikit-learn)

# Define the corpus
corpus = [documentA, documentB]

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Convert matrix to array
tfidf_array = tfidf_matrix.toarray()

# Display feature names and values for Document 1
print("\nFeature Names and their corresponding TF-IDF values for Document 1:")
for idx, word in enumerate(feature_names):
    print(f"{word}: {tfidf_array[0][idx]}")

# Optionally, display the full matrix as a DataFrame
df_vectorizer = pd.DataFrame(tfidf_array, columns=feature_names)
print("\nTF-IDF Matrix using TfidfVectorizer:\n", df_vectorizer)


Original Document:
 Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.

Tokens:
 ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '.']

POS Tags:
 [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('fascinating', 'JJ'), ('field', 'NN'), ('of', 'IN'), ('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('.', '.')]

After Stopwords Removal:
 ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'Artificial', 'Intelligence']

After Stemming:
 ['natur', 'languag', 'process', 'nlp', 'fascin', 'field', 'artifici', 'intellig']

After Lemmatization:
 ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'Artificial', 'Intelligence']

Manual TF-IDF DataFrame:
        Mars   is   largest    fourth      from  the    Planet       Sun  \
0  0.000000  0.0  0.138629  0.000000  0.000000 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\devka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\devka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
