In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample corpus: five short documents used to fit the TF-IDF model.
corpus = [
    "natural language processing is a field of artificial intelligence",
    "text mining is the process of deriving useful information from text",
    "machine learning is a method of data analysis",
    "artificial intelligence can simulate human cognition",
    "text data processing is an important task in natural language",
]

# Fit TF-IDF on the corpus. X is a sparse (documents x vocabulary) matrix.
# NOTE(review): no stop-word filtering is configured, so function words such
# as "is" / "of" stay in the vocabulary and can dominate the ranking in (c);
# consider TfidfVectorizer(stop_words="english") if that is not wanted.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Column index -> term, in the same order as the columns of X.
feature_names = vectorizer.get_feature_names_out()
# Term -> column index mapping learned during fit; gives direct lookups
# instead of scanning the feature-name array.
vocabulary = vectorizer.vocabulary_

# The two words to compare in part (b).
word1 = "language"
word2 = "processing"

# Number of top-scoring words reported in part (c).
TOP_N = 5

# Output fragments, joined with "\n" at the end. A fragment that ends in
# "\n" therefore produces a blank line after it (used between sections);
# table rows must NOT end in "\n" or the Markdown table is broken up.
outputs = []

# (b) Cosine similarity between two words
if word1 in vocabulary and word2 in vocabulary:
    idx1 = vocabulary[word1]
    idx2 = vocabulary[word2]

    # A word is represented by its column of X, i.e. its TF-IDF weight in
    # every document. Two words whose columns are identical (as happens when
    # they occur equally often in exactly the same documents) score 1.0.
    word1_vector = X[:, idx1].toarray()
    word2_vector = X[:, idx2].toarray()

    # cosine_similarity compares rows, so transpose each column to shape
    # (1, n_documents); the result is a 1x1 matrix.
    similarity = cosine_similarity(word1_vector.T, word2_vector.T)[0][0]
    outputs.append("(b) Cosine Similarity Between Two Words\n")
    outputs.append(
        f"Word 1: `{word1}`\nWord 2: `{word2}`\n"
        f"**Cosine Similarity:** **{similarity:.4f}**\n"
    )
else:
    # Trailing newline keeps a blank line before the next section heading.
    outputs.append(
        f"One of the words '{word1}' or '{word2}' not found in vocabulary.\n"
    )

# (c) Top words with the highest TF-IDF scores across all documents.
# Summing over axis 0 adds each term's weight across every document.
tfidf_scores = np.asarray(X.sum(axis=0)).flatten()
# argsort is ascending, so reverse it and keep the first TOP_N indices.
top_indices = np.argsort(tfidf_scores)[::-1][:TOP_N]

top_words = [(feature_names[idx], tfidf_scores[idx]) for idx in top_indices]
outputs.append(
    f"(c) Top {TOP_N} Words with Highest TF-IDF Scores Across the Corpus\n"
)
outputs.append("| Rank | Word         | TF-IDF Score |")
outputs.append("|------|--------------|--------------|")
# Rows are emitted back to back (no blank lines) so they form one table;
# the width specifiers pad each cell to the header's column width.
outputs.extend(
    f"| {rank:<4} | {word:<12} | {score:<12.4f} |"
    for rank, (word, score) in enumerate(top_words, 1)
)

# Print the assembled report.
print("\n".join(outputs))


(b) Cosine Similarity Between Two Words

Word 1: `language`
Word 2: `processing`
**Cosine Similarity:** **1.0000**

(c) Top 5 Words with Highest TF-IDF Scores Across the Corpus

| Rank | Word         | TF-IDF Score |

|------|--------------|--------------|

| 1    | is           | 0.8732       |

| 2    | of           | 0.7946       |

| 3    | text         | 0.7943       |

| 4    | intelligence | 0.7105       |

| 5    | artificial   | 0.7105       |

