In [46]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import Libraries**

In [47]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Step 1: Define the Dataset**

In [48]:
# Toy corpus: three short documents, one string per document.
# The vectorizer below is fitted on exactly these strings.
documents = [
    "scary green crocodile",
    "scary green big",
    "small crocodile"
]

**Step 2: Compute TF-IDF Matrix**

In [49]:
# No arguments are passed, so the vectorizer runs with its library defaults.
vectorizer = TfidfVectorizer()

# Fit on the corpus and get one weighted row per document in a single call.
tfidf_matrix = vectorizer.fit_transform(documents)

# toarray() converts the result to a dense array for printing.
# Each printed row has unit Euclidean length (e.g. 3 * 0.57735027**2 == 1).
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())

TF-IDF Matrix:
 [[0.         0.57735027 0.57735027 0.57735027 0.        ]
 [0.68091856 0.         0.51785612 0.51785612 0.        ]
 [0.         0.60534851 0.         0.         0.79596054]]


**Step 3: Visualize the Terms and Their Corresponding Index**

In [50]:
# Column labels of the TF-IDF matrix: terms[i] is the word behind column i.
terms = vectorizer.get_feature_names_out()

print("\nTerms in the corpus:\n", terms)


Terms in the corpus:
 ['big' 'crocodile' 'green' 'scary' 'small']


**Step 4: Compute the semantic vector for the text "green crocodile"**

In [51]:
# Vectorize a new text with the already-fitted vectorizer (transform, not fit_transform),
# so the result uses the same five term columns as the corpus matrix.
text = ["green crocodile"]
text_vector = vectorizer.transform(text)
print("\nTF-IDF Vector for 'green crocodile':\n", text_vector.toarray())


TF-IDF Vector for 'green crocodile':
 [[0.         0.70710678 0.70710678 0.         0.        ]]


**Step 5: Compute cosine similarity between two documents ("big crocodile" and "scary crocodile")**

In [52]:
# Vectorize both texts with the vectorizer fitted on the corpus.
text_1 = ["big crocodile"]
text_vector_1 = vectorizer.transform(text_1)

text_2 = ["scary crocodile"]
text_vector_2 = vectorizer.transform(text_2)

# cosine_similarity here is the scikit-learn function imported at the top of the notebook.
# The result is indexed [0][0] to pull out the single pairwise score.
# NOTE(review): a later cell defines its own cosine_similarity(v1, v2); once that cell has
# run, re-running this one would call the hand-written version instead — confirm run order.
cosine_sim = cosine_similarity(text_vector_1, text_vector_2)
print("\nCosine Similarity between 'big crocodile' and 'scary crocodile':", cosine_sim[0][0])


Cosine Similarity between 'big crocodile' and 'scary crocodile': 0.4280460350631186


# Using user-defined functions instead of built-in functions

In [53]:
import numpy as np
import pandas as pd
from math import log2, sqrt

# Step 1: Document collection (document id -> raw text)
# NOTE(review): this rebinds `documents`, which the scikit-learn section defined as a
# list of strings — confirm nothing earlier is re-run against the dict.
documents = {
    "d1": "scary green crocodile",
    "d2": "scary green big",
    "d3": "small crocodile",
}

# Split every document on whitespace into its list of tokens
tokenized_docs = {doc_id: text.split() for doc_id, text in documents.items()}

# Step 2: Vocabulary — every distinct token across the corpus, in alphabetical order
vocab = sorted({token for tokens in tokenized_docs.values() for token in tokens})
print("Vocabulary:", vocab)


# Step 3: Term Frequency (TF)
# Rows are vocabulary terms, columns are documents; each cell is the number of
# times the term occurs in that document's token list.
tf = pd.DataFrame(
    {
        doc_id: [tokens.count(term) for term in vocab]
        for doc_id, tokens in tokenized_docs.items()
    },
    index=vocab,
)

print("\nTF Matrix:")
print(tf)

# Step 4: IDF computation
# idf(term) = log2(N / df(term)), where df(term) is the number of documents whose
# token list contains the term and N is the number of documents.
N = len(documents)
idf = {
    term: log2(N / sum(term in tokens for tokens in tokenized_docs.values()))
    for term in vocab
}

idf_series = pd.Series(idf)
print("\nIDF:")
print(idf_series)

# Step 5: TF-IDF Matrix
# Scale each term's row of counts by that term's IDF weight (aligned on the row index).
tfidf = tf.multiply(idf_series, axis="index")
print("\nTF-IDF Matrix:", tfidf, sep="\n")

# Utility functions
def vector_length(vec):
    """Return the Euclidean length of ``vec``.

    That is the square root of the sum of its squared components.
    ``vec`` may be any iterable of numbers; an empty iterable has length 0.0.
    """
    squared_components = (component ** 2 for component in vec)
    return sqrt(sum(squared_components))

def normalize(vec):
    """Scale ``vec`` to unit Euclidean length.

    A vector of length zero cannot be scaled, so it is handed back unchanged.
    """
    norm = vector_length(vec)
    if norm == 0:
        return vec
    return vec / norm

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two 1-D vectors as a float.

    Computed as (v1 . v2) / (|v1| * |v2|), with each norm taken as the square
    root of the vector's dot product with itself.

    If either vector has zero length the cosine is undefined. The function then
    returns 0.0 instead of dividing by zero, which would give NaN and a
    RuntimeWarning.

    NOTE: this name shadows ``sklearn.metrics.pairwise.cosine_similarity``,
    which the notebook imports earlier. Once this definition has run, re-running
    the earlier scikit-learn cell will call this function instead.
    """
    norm_product = sqrt(np.dot(v1, v1)) * sqrt(np.dot(v2, v2))
    if norm_product == 0:
        # At least one vector is all zeros: there is no direction to compare.
        return 0.0
    return float(np.dot(v1, v2) / norm_product)


# Shared helper for Questions 1 and 2: turns a tokenized query into a TF-IDF vector.
def query_tfidf_vector(terms, vocabulary, idf_weights):
    """Return the raw (un-normalized) TF-IDF vector of a tokenized query.

    Position i holds
        (occurrences of vocabulary[i] in ``terms``) * idf_weights[vocabulary[i]],
    the same tf * idf weighting used for the document matrix in Step 5.
    Query terms that are not in ``vocabulary`` contribute nothing.

    Args:
        terms: list of query tokens.
        vocabulary: ordered list of terms defining the vector's positions.
        idf_weights: mapping term -> IDF weight for every term in ``vocabulary``.

    Returns:
        1-D float numpy array with one entry per vocabulary term.
    """
    return np.array(
        [terms.count(term) * idf_weights[term] for term in vocabulary],
        dtype=float,
    )


# Question 1: Semantic Interpreter
# "green crocodile"
query1 = ["green", "crocodile"]
query_vec = query_tfidf_vector(query1, vocab, idf)

print("\nTF-IDF Vector for 'green crocodile':")
print(query_vec)

print("\nNormalized Vector:")
print(normalize(query_vec))


# Question 2: Similarity
# "big crocodile" vs "scary crocodile"
q1 = ["big", "crocodile"]
q2 = ["scary", "crocodile"]

vec1 = query_tfidf_vector(q1, vocab, idf)
vec2 = query_tfidf_vector(q2, vocab, idf)

print("\nVector for 'big crocodile':", vec1)
print("Vector for 'scary crocodile':", vec2)

print("\nNormalized 'big crocodile':", normalize(vec1))
print("Normalized 'scary crocodile':", normalize(vec2))

# Cosine similarity is scale-invariant, so the raw vectors can be compared directly.
similarity = cosine_similarity(vec1, vec2)
print("\nCosine Similarity:", round(similarity, 3))

Vocabulary: ['big', 'crocodile', 'green', 'scary', 'small']

TF Matrix:
           d1  d2  d3
big         0   1   0
crocodile   1   0   1
green       1   1   0
scary       1   1   0
small       0   0   1

IDF:
big          1.584963
crocodile    0.584963
green        0.584963
scary        0.584963
small        1.584963
dtype: float64

TF-IDF Matrix:
                 d1        d2        d3
big        0.000000  1.584963  0.000000
crocodile  0.584963  0.000000  0.584963
green      0.584963  0.584963  0.000000
scary      0.584963  0.584963  0.000000
small      0.000000  0.000000  1.584963

TF-IDF Vector for 'green crocodile':
[0.        0.5849625 0.5849625 0.        0.       ]

Normalized Vector:
[0.         0.70710678 0.70710678 0.         0.        ]

Vector for 'big crocodile': [1.5849625 0.5849625 0.        0.        0.       ]
Vector for 'scary crocodile': [0.        0.5849625 0.        0.5849625 0.       ]

Normalized 'big crocodile': [0.9381454  0.34624155 0.         0.         0.   

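# The answers differ because the built-in `TfidfVectorizer` weights terms differently: it uses a smoothed natural-log IDF, ln((1 + N) / (1 + df)) + 1, and L2-normalizes every vector, while the hand-written version uses log2(N / df) with no normalization. Cosine similarity does not depend on vector length, so the gap in the score (0.428 vs. about 0.245) comes from the IDF formula rather than from the normalization itself.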