1) Create a Tf-IDF matrix for the following documents.

    a)   'the man went out for a walk'
    
    b)   'the children sat around the fire'
    


In [1]:
# Import CountVectorizer from scikit-learn
# CountVectorizer converts text documents into a matrix of token counts (Bag-of-Words model)
from sklearn.feature_extraction.text import CountVectorizer

# Define the two text documents
document1 = 'the man went out for a walk for'
document2 = 'the children sat around the fire'

# Initialize CountVectorizer
# This will break text into tokens (words) and count how often each appears
vectorizer = CountVectorizer()

# Fit the vectorizer to the documents and transform them into a term-document matrix
# Each row = a document
# Each column = a unique word (token) from the whole corpus
# Each value = frequency of that word in the document
tf_matrix = vectorizer.fit_transform([document1, document2])

# Convert the sparse matrix (compressed format) into a dense array for easier reading
tf_array = tf_matrix.toarray()

# Print the array
# Example:
# [[1 0 0 1 ...]   -> word counts for document1
#  [0 1 1 0 ...]]  -> word counts for document2
print(tf_array)

# If you want to see which words correspond to the columns, you can print:
print(vectorizer.get_feature_names_out())

[[0 0 0 2 1 1 0 1 1 1]
 [1 1 1 0 0 0 1 2 0 0]]
['around' 'children' 'fire' 'for' 'man' 'out' 'sat' 'the' 'walk' 'went']


In [2]:
import numpy as np

In [3]:
# Import TfidfVectorizer from scikit-learn
# TF-IDF = Term Frequency – Inverse Document Frequency
# It not only counts words but also reduces the weight of common words
# (like 'the', 'for') while giving importance to unique words
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np   # for rounding values when printing

# Define the two text documents
document1 = 'the man went out for a walk'
document2 = 'the children sat around the fire'

# Initialize TfidfVectorizer
# This will tokenize the documents, calculate TF-IDF for each word
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the documents and transform them into a TF-IDF matrix
# Each row = document
# Each column = unique word across all documents
# Each value = TF-IDF score of that word in the document
tfidf_matrix = vectorizer.fit_transform([document1, document2])

# Convert sparse matrix into a dense array
tfidf_array = tfidf_matrix.toarray()

# Print the TF-IDF matrix rounded to 2 decimal places for readability
print(np.round(tfidf_array, 2))

# Print the feature (word) names to understand the column order
print(vectorizer.get_feature_names_out())

[[0.   0.   0.   0.43 0.43 0.43 0.   0.3  0.43 0.43]
 [0.41 0.41 0.41 0.   0.   0.   0.41 0.58 0.   0.  ]]
['around' 'children' 'fire' 'for' 'man' 'out' 'sat' 'the' 'walk' 'went']


2) Find the Cosine Similarity for above documents.

In [4]:
# Import TfidfVectorizer for TF-IDF calculation
from sklearn.feature_extraction.text import TfidfVectorizer

# Import cosine_similarity to measure similarity between vectors
from sklearn.metrics.pairwise import cosine_similarity

# Define two documents
document1 = 'the man went out for a walk'
document2 = 'the children sat around the fire'

# Step 1: Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Step 2: Fit and transform documents into TF-IDF matrix
# Each row = a document, each column = a unique word across all documents
tfidf_matrix = vectorizer.fit_transform([document1, document2])

# Step 3: Compute cosine similarity between the two documents
# tfidf_matrix[0] = vector for document1
# tfidf_matrix[1] = vector for document2
cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

# Step 4: Print similarity score
print(cosine_sim)

[[0.17578608]]
