In [1]:
import tensorflow as tf
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from sklearn.metrics import jaccard_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# One shared seed for both libraries so a fresh "Restart & Run All"
# starts NumPy's and TensorFlow's global generators from the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Toy corpus: ten short English sentences about AI / ML / data science.
# Each string is treated as one document by the cells below (wrapped in a
# tf.data.Dataset, then vectorised with TF-IDF).
sample_texts = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with multiple layers",
    "Natural language processing helps computers understand human language",
    "Computer vision enables machines to interpret visual information",
    "Artificial intelligence mimics human cognitive functions",
    "Neural networks are inspired by biological brain structures",
    "Data science combines statistics and programming",
    "Python is a popular programming language for AI",
    "TensorFlow is an open-source machine learning framework",
    "Supervised learning uses labeled training data"
]

In [None]:
# Wrap the corpus in a tf.data pipeline, one element per sentence.
dataset = tf.data.Dataset.from_tensor_slices(sample_texts)

# Pull the elements back out as Python strings: anything that arrives as
# `bytes` is decoded as UTF-8, anything else is passed through unchanged.
texts_list = [
    item.decode('utf-8') if isinstance(item, bytes) else item
    for item in dataset.as_numpy_iterator()
]

In [5]:
# Echo the round-tripped corpus with 1-based numbering.
print("Sample texts from TF dataset:")
for position, text in enumerate(texts_list, start=1):
    print(f"{position}: {text}")

Sample texts from TF dataset:
1: Machine learning is a subset of artificial intelligence
2: Deep learning uses neural networks with multiple layers
3: Natural language processing helps computers understand human language
4: Computer vision enables machines to interpret visual information
5: Artificial intelligence mimics human cognitive functions
6: Neural networks are inspired by biological brain structures
7: Data science combines statistics and programming
8: Python is a popular programming language for AI
9: TensorFlow is an open-source machine learning framework
10: Supervised learning uses labeled training data


In [6]:
# Build the TF-IDF representation of the corpus.
# English stop words are dropped and the vocabulary is capped at 100 terms.
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(texts_list)

# Report the matrix dimensions and a peek at the learned vocabulary.
feature_names = vectorizer.get_feature_names_out()
print(f"\nTF-IDF Matrix shape: {tfidf_matrix.shape}")
print(f"Feature names (first 10): {feature_names[:10]}")


TF-IDF Matrix shape: (10, 47)
Feature names (first 10): ['ai' 'artificial' 'biological' 'brain' 'cognitive' 'combines' 'computer'
 'computers' 'data' 'deep']


In [None]:
# COSINE SIMILARITY
def compute_cosine_similarity(matrix):
    """Return the pairwise cosine similarities among the rows of ``matrix``.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity`` called
    with a single argument, so every row is compared against every row of the
    same matrix.

    Args:
        matrix: Feature matrix with one sample per row (the notebook passes
            the TF-IDF matrix).

    Returns:
        Whatever ``cosine_similarity(matrix)`` returns.
    """
    similarities = cosine_similarity(matrix)
    return similarities

# EUCLIDEAN DISTANCE
def compute_euclidean_distance(matrix):
    """Return the pairwise Euclidean distances among the rows of ``matrix``.

    Thin wrapper around ``sklearn.metrics.pairwise.euclidean_distances``
    called with a single argument, so every row is compared against every row
    of the same matrix.

    Args:
        matrix: Feature matrix with one sample per row (the notebook passes
            the TF-IDF matrix).

    Returns:
        Whatever ``euclidean_distances(matrix)`` returns.
    """
    distances = euclidean_distances(matrix)
    return distances

# MANHATTAN DISTANCE
def compute_manhattan_distance(matrix):
    """Return the pairwise Manhattan (L1) distances among the rows of ``matrix``.

    Thin wrapper around ``sklearn.metrics.pairwise.manhattan_distances``
    called with a single argument, so every row is compared against every row
    of the same matrix.

    Args:
        matrix: Feature matrix with one sample per row (the notebook passes
            the TF-IDF matrix).

    Returns:
        Whatever ``manhattan_distances(matrix)`` returns.
    """
    distances = manhattan_distances(matrix)
    return distances


# def compute_jaccard_similarity(matrix):
#     # Convert the TF-IDF matrix to binary (0/1) format
#     binary_matrix = (matrix > 0).astype(int)
#     n_samples = binary_matrix.shape[0]
#     jaccard_matrix = np.zeros((n_samples, n_samples))
    
#     for i in range(n_samples):
#         for j in range(n_samples):
#             intersection = binary_matrix[i].multiply(binary_matrix[j]).sum()
#             union = binary_matrix[i].sum() + binary_matrix[j].sum() - intersection
#             jaccard_matrix[i, j] = intersection / union if union > 0 else 0
    
#     return jaccard_matrix

In [None]:
# Run every enabled measure on the TF-IDF matrix.
cosine_sim = compute_cosine_similarity(tfidf_matrix)
euclidean_dist = compute_euclidean_distance(tfidf_matrix)
manhattan_dist = compute_manhattan_distance(tfidf_matrix)
# jaccard_sim = compute_jaccard_similarity(tfidf_matrix)


# Dump the raw result arrays in the order they were computed
# (append jaccard_sim here if the Jaccard helper is re-enabled).
for measure in (cosine_sim, euclidean_dist, manhattan_dist):
    print(measure)

[[1.         0.09726226 0.         0.         0.33484    0.
  0.         0.         0.26892355 0.10421179]
 [0.09726226 1.         0.         0.         0.         0.26161023
  0.         0.         0.08129964 0.22171488]
 [0.         0.         1.         0.         0.10831465 0.
  0.         0.23357478 0.         0.        ]
 [0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.        ]
 [0.33484    0.         0.10831465 0.         1.         0.
  0.         0.         0.         0.        ]
 [0.         0.26161023 0.         0.         0.         1.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  1.         0.16256576 0.         0.15511632]
 [0.         0.         0.23357478 0.         0.         0.
  0.16256576 1.         0.         0.        ]
 [0.26892355 0.08129964 0.         0.         0.         0.
  0.         0.         1.         0.08710862]
 [0.10421179 0.22171488 0.   

In [None]:
# Label rows and columns Text_1..Text_N so the similarity matrix reads as a
# document-by-document table.
text_labels = [f"Text_{n}" for n in range(1, len(texts_list) + 1)]
df_cosine = pd.DataFrame(cosine_sim, index=text_labels, columns=text_labels)

print("\nCOSINE SIMILARITY MATRIX:")
print("-" * 40)
print(df_cosine.round(3))



COSINE SIMILARITY MATRIX:
----------------------------------------
         Text_1  Text_2  Text_3  Text_4  Text_5  Text_6  Text_7  Text_8  \
Text_1    1.000   0.097   0.000     0.0   0.335   0.000   0.000   0.000   
Text_2    0.097   1.000   0.000     0.0   0.000   0.262   0.000   0.000   
Text_3    0.000   0.000   1.000     0.0   0.108   0.000   0.000   0.234   
Text_4    0.000   0.000   0.000     1.0   0.000   0.000   0.000   0.000   
Text_5    0.335   0.000   0.108     0.0   1.000   0.000   0.000   0.000   
Text_6    0.000   0.262   0.000     0.0   0.000   1.000   0.000   0.000   
Text_7    0.000   0.000   0.000     0.0   0.000   0.000   1.000   0.163   
Text_8    0.000   0.000   0.234     0.0   0.000   0.000   0.163   1.000   
Text_9    0.269   0.081   0.000     0.0   0.000   0.000   0.000   0.000   
Text_10   0.104   0.222   0.000     0.0   0.000   0.000   0.155   0.000   

         Text_9  Text_10  
Text_1    0.269    0.104  
Text_2    0.081    0.222  
Text_3    0.000    0.000 