In [3]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the stopwords
nltk.download('stopwords')

# Define the documents
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

# Combine the documents
documents = [documentA, documentB]

# Remove stop words and convert to lowercase
stop_words = set(stopwords.words('english'))
documents = [' '.join([word.lower() for word in document.split() if word.lower() not in stop_words]) for document in documents]

# Calculate TF, IDF, and TFIDF scores
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(documents)

# Print the TF, IDF, and TFIDF scores
print("TF:")
print(vectorizer.transform(documents).toarray())
print("IDF:")
print(vectorizer.idf_)
print("TFIDF:")
print(tfidf.toarray())


TF:
[[0.         0.6316672  0.6316672  0.         0.44943642 0.        ]
 [0.53404633 0.         0.         0.53404633 0.37997836 0.53404633]]
IDF:
[1.40546511 1.40546511 1.40546511 1.40546511 1.         1.40546511]
TFIDF:
[[0.         0.6316672  0.6316672  0.         0.44943642 0.        ]
 [0.53404633 0.         0.         0.53404633 0.37997836 0.53404633]]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
from collections import Counter
import math

# Define documents
document_A = 'Jupiter is the largest Planet'
document_B = 'Mars is the fourth planet from the Sun'

# Define stop words
stop_words = {'is', 'the', 'from'}

# Tokenize documents, remove stop words, and convert to lowercase
def preprocess_document(document):
    tokens = document.lower().split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

tokens_A = preprocess_document(document_A)
tokens_B = preprocess_document(document_B)

# Calculate Term Frequency (TF)
def calculate_tf(tokens):
    tf = Counter(tokens)
    total_terms = len(tokens)
    tf = {term: freq / total_terms for term, freq in tf.items()}
    return tf

tf_A = calculate_tf(tokens_A)
tf_B = calculate_tf(tokens_B)

# Calculate Inverse Document Frequency (IDF)
corpus = [tokens_A, tokens_B]
unique_terms = set().union(*corpus)

def calculate_idf(term):
    doc_freq = sum(1 for document in corpus if term in document)
    return math.log(len(corpus) / doc_freq)

idf = {term: calculate_idf(term) for term in unique_terms}

# Calculate TF-IDF
def calculate_tfidf(tf, idf):
    tfidf = {term: tf[term] * idf[term] for term in tf}
    return tfidf

tfidf_A = calculate_tfidf(tf_A, idf)
tfidf_B = calculate_tfidf(tf_B, idf)

# Print results
print("Document A:")
print("TF:", tf_A)
print("IDF:", idf)
print("TF-IDF:", tfidf_A)
print()

print("Document B:")
print("TF:", tf_B)
print("IDF:", idf)
print("TF-IDF:", tfidf_B)


Document A:
TF: {'jupiter': 0.3333333333333333, 'largest': 0.3333333333333333, 'planet': 0.3333333333333333}
IDF: {'fourth': 0.6931471805599453, 'mars': 0.6931471805599453, 'planet': 0.0, 'largest': 0.6931471805599453, 'jupiter': 0.6931471805599453, 'sun': 0.6931471805599453}
TF-IDF: {'jupiter': 0.23104906018664842, 'largest': 0.23104906018664842, 'planet': 0.0}

Document B:
TF: {'mars': 0.25, 'fourth': 0.25, 'planet': 0.25, 'sun': 0.25}
IDF: {'fourth': 0.6931471805599453, 'mars': 0.6931471805599453, 'planet': 0.0, 'largest': 0.6931471805599453, 'jupiter': 0.6931471805599453, 'sun': 0.6931471805599453}
TF-IDF: {'mars': 0.17328679513998632, 'fourth': 0.17328679513998632, 'planet': 0.0, 'sun': 0.17328679513998632}
