In [222]:
import pandas as pd
import nltk  # Natural Language Toolkit for NLP tasks
import re    # Regular Expressions for text preprocessing


In [223]:
# Fetch the NLTK resources used by the cells below. Each call logs its
# progress and reports when a package is already up to date.
nltk.download('punkt')  # For tokenization
nltk.download('stopwords')  # For stop word list
nltk.download('wordnet')  # For lemmatization
nltk.download('averaged_perceptron_tagger')  # For POS tagging


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [224]:
# Load the two input documents as text. The encoding is passed explicitly so
# the decoded result does not depend on the platform's default codec.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open('doc_01.txt', 'r', encoding='utf-8') as file:
    text = file.read()
with open('doc_02.txt', 'r', encoding='utf-8') as file:
    text1 = file.read()
  
# print('The given sentences are: \n', text)


🔹 Sentence Tokenization

In [225]:
from nltk.tokenize import sent_tokenize
# Replace whatever `text` held before with a short two-sentence sample and
# split it into a list of sentence strings.
text = "Hi onkar. How was your day"
tokenized_text = sent_tokenize(text)
print("\n Sentence Tokenization: \n", tokenized_text)



 Sentence Tokenization: 
 ['Hi onkar.', 'How was your day']


🔹 Word Tokenization

In [226]:
from nltk.tokenize import word_tokenize
# Split the sample into word-level tokens; punctuation such as '.' comes back
# as a token of its own.
text = "Hi onkar. How was your day"
tokenized_word = word_tokenize(text)
print('\nWord Tokeniztion: \n', tokenized_word)



Word Tokeniztion: 
 ['Hi', 'onkar', '.', 'How', 'was', 'your', 'day']


🔹 Stop Word Removal

In [227]:
from nltk.corpus import stopwords
# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words("english"))

# Keep letters only: every non-alphabetic character is replaced by a space.
text = re.sub('[^a-zA-Z]', ' ', "Hi onkar. How was your day")

# Tokenize the lower-cased text, then drop every token found in the stop-word set.
tokens = word_tokenize(text.lower())
filtered_text = [token for token in tokens if token not in stop_words]

print("Tokenized Sentence:", tokens)
print("Filterd Sentence:", filtered_text)


Tokenized Sentence: ['hi', 'onkar', 'how', 'was', 'your', 'day']
Filterd Sentence: ['hi', 'onkar', 'day']


🔹 Stemming

In [228]:
from nltk.stem import PorterStemmer
# Run each inflected form through the Porter stemmer and show the result.
ps = PorterStemmer()
e_words = ["wait", "waiting", "waited", "waits"]
for word, rootWord in zip(e_words, map(ps.stem, e_words)):
    print('Stemming for ', word, ': ', rootWord)


Stemming for  wait :  wait
Stemming for  waiting :  wait
Stemming for  waited :  wait
Stemming for  waits :  wait


🔹 Lemmatization

In [229]:
from nltk.stem import WordNetLemmatizer
# Lemmatize each token of a small sample; no part-of-speech tag is passed,
# so the lemmatizer's default is used for every word.
wordnet_lemmatizer = WordNetLemmatizer()

text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

for token in tokenization:
    lemma = wordnet_lemmatizer.lemmatize(token)
    print("Lemma for {} is {}".format(token, lemma))


Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


1. TF (Term Frequency) :- Term Frequency measures how frequently a word occurs in a document.

          TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

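2. IDF (Inverse Document Frequency) :- Inverse Document Frequency measures how important a word is by reducing the weight of words that are common across documents.

                           IDF(t) = log(N / df(t))

    Where:
    - N: Total number of documents
    - df(t): Number of documents containing the term t
    - log: The natural logarithm (the implementation below uses math.log), so a term that appears in every document gets IDF = log(1) = 0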

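3. TF-IDF (Term Frequency × Inverse Document Frequency) :- TF-IDF combines the two measures. TF captures how often a term occurs in a document, and IDF captures how rare the term is across all documents, so a term scores highly when it is frequent in one document but rare in the rest of the corpus:

                            TF-IDF(t) = TF(t) × IDF(t)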


In [230]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word in one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: Sequence of the document's tokens; only its length is
            used, as the total number of terms.

    Returns:
        dict mapping each word in ``wordDict`` to
        ``count / len(bagOfWords)`` as a float. When ``bagOfWords`` is
        empty, every frequency is 0.0.
    """
    total_terms = len(bagOfWords)
    if total_terms == 0:
        # An empty document has no terms; return zeros instead of dividing by zero.
        return {word: 0.0 for word in wordDict}

    return {word: count / float(total_terms) for word, count in wordDict.items()}


In [231]:
import math

def computeIDF(documents):
    """Compute the inverse document frequency of each term in a corpus.

    Args:
        documents: List of dicts, one per document, mapping term -> count.

    Returns:
        dict mapping every term that has a positive count in at least one
        document to ``log(N / df)``, where ``N`` is ``len(documents)``,
        ``df`` is the number of documents in which the term's count is
        positive, and ``log`` is the natural logarithm.
    """
    total_docs = len(documents)

    # Document frequency: in how many documents does each term occur?
    doc_freq = {}
    for counts in documents:
        for term in counts:
            if counts[term] > 0:
                doc_freq[term] = doc_freq.get(term, 0) + 1

    # IDF formula: log(N / df(t))
    return {
        term: math.log(total_docs / float(freq))
        for term, freq in doc_freq.items()
    }



In [232]:

def computeTFIDF(tf, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: Mapping of term -> term frequency for one document.
        idfs: Mapping of term -> inverse document frequency for the corpus.

    Returns:
        dict mapping each term in ``tf`` to ``tf[term] * idfs[term]``.
        Terms whose term frequency is zero score 0.0.

    Raises:
        KeyError: If a term with a non-zero term frequency is missing
            from ``idfs``.
    """
    tfidf = {}

    for word, val in tf.items():
        # A term absent from this document scores zero whatever its IDF, so
        # skip the lookup: computeIDF leaves out terms that have no positive
        # count in any document, and indexing idfs for them would raise KeyError.
        tfidf[word] = val * idfs[word] if val else 0.0

    return tfidf


🔹 TF-IDF from Scratch

In [234]:
#--------------------------------------------------------------------------------------------------
# Algorithm to Create Representation of Documents by Calculating TF-IDF
#--------------------------------------------------------------------------------------------------

# Step 1: Import the necessary libraries.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 2: Define the two sample documents.
text = 'Jupiter is the largest planet'
text1 = 'Mars is the fourth planet from the Sun'

# Step 3: Tokenize each document on single spaces to get its bag of words.
bagOfWordsA = text.split(' ')
bagOfWordsB = text1.split(' ')

# Step 4: The shared vocabulary is every distinct word from either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

# Step 5: Count occurrences per document. Both dictionaries start with a zero
# for every vocabulary word, so words missing from a document keep a count of 0.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for counts, bag in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
    for word in bag:
        counts[word] += 1

# Step 6: Term Frequency (TF) of each document.
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

# Step 7: Show the term frequencies, one row per word and one column per document.
print('---------------- Term Frequency ----------------------')
df = pd.DataFrame([tfA, tfB])
print(df.transpose())

# Step 8: Inverse Document Frequency (IDF) over the two-document corpus.
idfs = computeIDF([numOfWordsA, numOfWordsB])
print('---------------- Inverse Document Frequency ----------------------')
print(pd.DataFrame(idfs, index=["IDF"]).transpose())

# Step 9: TF-IDF scores of each document.
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))

# Step 10: Show the TF-IDF scores in the same layout as the term frequencies.
print('------------------- TF-IDF --------------------------------------')
df = pd.DataFrame([tfidfA, tfidfB])
print(df.transpose())


---------------- Term Frequency ----------------------
           0      1
Mars     0.0  0.125
the      0.2  0.250
Sun      0.0  0.125
Jupiter  0.2  0.000
fourth   0.0  0.125
planet   0.2  0.125
is       0.2  0.125
largest  0.2  0.000
from     0.0  0.125
---------------- Inverse Document Frequency ----------------------
              IDF
the      0.000000
Jupiter  0.693147
planet   0.000000
is       0.000000
largest  0.693147
Mars     0.693147
Sun      0.693147
fourth   0.693147
from     0.693147
------------------- TF-IDF --------------------------------------
                0         1
Mars     0.000000  0.086643
the      0.000000  0.000000
Sun      0.000000  0.086643
Jupiter  0.138629  0.000000
fourth   0.000000  0.086643
planet   0.000000  0.000000
is       0.000000  0.000000
largest  0.138629  0.000000
from     0.000000  0.086643
