# Step 1. Parsing

In [10]:
import numpy as np
import pandas as pd

In [12]:
import tensorflow as tf

def load_mnist_keras():
    """Load MNIST through Keras and return flattened, rescaled images.

    Returns:
        ``((train_images, train_labels), (test_images, test_labels))`` where
        each image array is float32 with 28*28 = 784 values per row, divided
        by 255, and the label arrays are returned exactly as Keras provides
        them.
    """
    train_split, test_split = tf.keras.datasets.mnist.load_data()
    raw_train, train_labels = train_split
    raw_test, test_labels = test_split

    def _flatten_and_scale(images):
        # Flatten every image to one row of 784 float32 pixels, then
        # normalize the values to the range [0, 1].
        flat = images.reshape(-1, 28 * 28).astype('float32')
        flat /= 255.0
        return flat

    return (_flatten_and_scale(raw_train), train_labels), (_flatten_and_scale(raw_test), test_labels)

#Bring both MNIST splits into the notebook namespace
(train_images, train_labels), (test_images, test_labels) = load_mnist_keras()

#Sanity check: train shape and test shape, one per line
print(train_images.shape, test_images.shape, sep='\n')


(60000, 784)
(10000, 784)


In [14]:
#20 NG Dataset

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Load 20 Newsgroups data
def load_20ng():
    """Fetch the 20 Newsgroups corpus with ``subset='all'`` (train and test together).

    Returns:
        A ``(documents, targets)`` pair taken from the ``data`` and ``target``
        attributes of the fetched dataset.
    """
    corpus = fetch_20newsgroups(subset='all')
    documents, targets = corpus.data, corpus.target
    return documents, targets

# Fetch the corpus once: text_data receives the dataset's `data` attribute and
# labels its `target` attribute (see load_20ng). The print shows the document count.
text_data, labels = load_20ng()
print(len(text_data))

18846


In [16]:
#Parsing 20 NG Dataset
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Download the NLTK resources used by this cell: the stopword list read via
#stopwords.words('english'), plus the 'punkt' / 'punkt_tab' tokenizer data
#(presumably required by nltk.word_tokenize below - confirm for your NLTK version).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

#Tokenize the text and remove stopwords
def tokenize_and_remove_stopwords(text_data):
    """Lower-case, tokenize and filter every document in ``text_data``.

    Args:
        text_data: Iterable of raw document strings.

    Returns:
        A list with one cleaned string per input document: the surviving
        tokens joined by single spaces.
    """
    english_stopwords = set(stopwords.words('english'))

    def _clean(document):
        # Keep only purely alphabetic tokens that are not English stopwords.
        kept = (
            token
            for token in nltk.word_tokenize(document.lower())
            if token.isalpha() and token not in english_stopwords
        )
        return ' '.join(kept)

    return [_clean(document) for document in text_data]

#Vectorize the text
def vectorize_text(tokenized_docs, method='tf-idf'):
    """Turn cleaned documents into a term-document matrix.

    Args:
        tokenized_docs: Iterable of document strings to vectorize.
        method: ``'tf'`` for raw term counts, ``'tf-idf'`` for TF-IDF weights.

    Returns:
        For ``'tf'``: the count matrix only.
        For ``'tf-idf'``: a ``(tfidf_matrix, vectorizer)`` tuple, where
        ``vectorizer`` is the fitted CountVectorizer.
        Note that the two modes return differently shaped results.

    Raises:
        ValueError: If ``method`` is neither ``'tf'`` nor ``'tf-idf'``.
    """
    # Reject unknown modes before doing any work.
    if method not in ('tf', 'tf-idf'):
        raise ValueError("Method must be 'tf' or 'tf-idf'")

    # Both modes start from the same raw term counts.
    count_vectorizer = CountVectorizer()
    term_counts = count_vectorizer.fit_transform(tokenized_docs)
    if method == 'tf':
        return term_counts

    return TfidfTransformer().fit_transform(term_counts), count_vectorizer

# Parse the 20 Newsgroups dataset
def parse_20ng(text_data):
    """Clean the raw documents and build their TF-IDF matrix.

    Args:
        text_data: Iterable of raw document strings.

    Returns:
        The ``(tfidf_matrix, vectorizer)`` pair produced by ``vectorize_text``
        in ``'tf-idf'`` mode.
    """
    cleaned_docs = tokenize_and_remove_stopwords(text_data)
    return vectorize_text(cleaned_docs, method='tf-idf')

# Run the full parsing pipeline on the 20 Newsgroups documents
tfidf_matrix, vectorizer = parse_20ng(text_data)

# Report the matrix dimensions: (num_documents, num_terms)
print(f"Shape of the TF-IDF matrix: {tfidf_matrix.shape}")

# Vocabulary size, a blank separator line, then the first 10 feature names
print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}", end="\n\n")
print(f"Sample features: {vectorizer.get_feature_names_out()[:10]}")



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vaishnavibhutada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vaishnavibhutada/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vaishnavibhutada/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Shape of the TF-IDF matrix: (18846, 93622)
Vocabulary size: 93622

Sample features: ['aa' 'aaa' 'aaaa' 'aaaaa' 'aaaaaaaaaaaa'
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaauuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuugggggggggggggggg'
 'aaaaagggghhhh' 'aaaaarrrrgh' 'aaaah' 'aaaahhh']


# Step 2. Normalization

The MNIST images are already in a range of [0, 1], which means they have already been shifted and scaled. However, we can perform zero mean, unit variance normalization for better consistency.

Zero mean, unit variance normalization involves:

- Subtracting the mean of each pixel across all images.
- Dividing by the standard deviation to scale the pixel values to a variance of 1.

For MNIST, we'll apply this normalization to each pixel across all images.

In [17]:
def normalize_mnist(images, mean=None, std=None):
    """Standardize every pixel (column) to zero mean and unit variance.

    Args:
        images: Array with one sample per row (samples along axis 0).
        mean: Optional per-pixel mean to subtract. When omitted it is computed
            from ``images`` along axis 0.
        std: Optional per-pixel standard deviation to divide by. When omitted
            it is computed from ``images`` along axis 0.

    Returns:
        ``(images - mean) / std`` with the same shape as ``images``. Pixels
        whose standard deviation is 0 are only mean-centered (divided by 1).

    Passing ``mean``/``std`` lets statistics estimated on one split (e.g. the
    training set) be applied unchanged to another split.
    """
    if mean is None:
        mean = np.mean(images, axis=0)
    if std is None:
        std = np.std(images, axis=0)

    #Prevent division by zero by replacing 0 standard deviation with 1 (no scaling).
    #Work on a copy so a caller-supplied std array is never modified.
    std = np.array(std, copy=True)
    std[std == 0] = 1

    #Normalize images
    return (images - mean) / std

#Normalize the training images with their own per-pixel statistics
train_images_normalized = normalize_mnist(train_images)

#Normalize the test images with the TRAINING statistics, not their own:
#both splits must go through the same transformation, and test data must not
#influence the preprocessing parameters.
train_pixel_mean = np.mean(train_images, axis=0)
train_pixel_std = np.std(train_images, axis=0)
train_pixel_std[train_pixel_std == 0] = 1  #Constant pixels: divide by 1 instead of 0
test_images_normalized = (test_images - train_pixel_mean) / train_pixel_std

#Check the results
print(f"Mean of normalized train images: {np.mean(train_images_normalized)}")
print(f"Std of normalized train images: {np.std(train_images_normalized)}")


Mean of normalized train images: -5.346976195141906e-06
Std of normalized train images: 0.956339955329895


For the 20 Newsgroups dataset, we use Term Frequency (TF) normalization.

The TF normalization involves:

Mapping each term to its frequency in the document (using CountVectorizer from sklearn).
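TF normalization ensures that the term frequencies in each document sum to 1, which is important for comparing the relative importance of terms across documents. Note that the code below goes one step further: it passes the raw counts through `TfidfTransformer`, so the resulting matrix holds TF-IDF weights rather than plain term frequencies.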


In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def normalize_20ng(text_data):
    """Vectorize documents with CountVectorizer and re-weight the counts as TF-IDF.

    Args:
        text_data: Iterable of document strings, handed to CountVectorizer as-is.

    Returns:
        ``(tfidf_matrix, count_vectorizer)``: the TF-IDF matrix and the fitted
        CountVectorizer that produced the underlying counts.
    """
    count_vectorizer = CountVectorizer()
    raw_counts = count_vectorizer.fit_transform(text_data)

    # Default TfidfTransformer settings turn the raw counts into TF-IDF weights.
    return TfidfTransformer().fit_transform(raw_counts), count_vectorizer

#Normalize the 20 Newsgroups text data (the documents in text_data are passed in directly).
#NOTE: this rebinds the notebook-level name tfidf_matrix, which the parsing cell also assigns.
tfidf_matrix, count_vectorizer = normalize_20ng(text_data)

#Check the shape of the resulting normalized TF-IDF matrix
print(f"Shape of the TF-IDF matrix: {tfidf_matrix.shape}")  # (num_documents, num_terms)


Shape of the TF-IDF matrix: (18846, 173762)


# Step 3. Pairwise similarities

For MNIST:

Using a Library with Batch Processing.

In [24]:
from scipy.spatial.distance import cdist
import numpy as np

def compute_distances_in_batches(data, batch_size=1000):
    """Compute the full pairwise Euclidean distance matrix, one row block at a time.

    Args:
        data: Array with one sample per row.
        batch_size: Number of rows handled per ``cdist`` call; must be >= 1.

    Returns:
        float32 array of shape ``(num_samples, num_samples)`` whose entry
        ``[i, j]`` is the Euclidean distance between rows ``i`` and ``j``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return an
        # all-zero matrix; fail loudly instead.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_samples = data.shape[0]
    distance_matrix = np.zeros((num_samples, num_samples), dtype=np.float32)

    for start in range(0, num_samples, batch_size):
        stop = min(start + batch_size, num_samples)
        # Compare this block of rows against the whole data set.
        distance_matrix[start:stop, :] = cdist(data[start:stop], data, metric='euclidean')

    return distance_matrix

#Compute the pairwise distance matrix in batches
batch_size = 1000
#Only the first 5000 normalized training images are used, giving a 5000 x 5000 matrix
train_subset = train_images_normalized[:5000]
distance_matrix = compute_distances_in_batches(train_subset, batch_size=batch_size)

#Print the shape of the resulting matrix
print(f"Shape of the Euclidean distance matrix (batch computation): {distance_matrix.shape}")


Shape of the Euclidean distance matrix (batch computation): (5000, 5000)


Euclidean Distance Implementation from Scratch.

In [27]:
def euclidean_distance(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    Computed as the square root of the sum of squared element-wise differences.
    """
    offset = x - y
    squared_norm = np.sum(offset ** 2)
    return np.sqrt(squared_norm)

#Pairwise Euclidean distances, restricted to the first 100 images to avoid memory overload
num_samples = 100
euclidean_distances_batch = np.zeros((num_samples, num_samples))

#Fill the matrix one row at a time: row i holds the distances from image i to every image j
for i in range(num_samples):
    euclidean_distances_batch[i, :] = [
        euclidean_distance(train_images_normalized[i], train_images_normalized[j])
        for j in range(num_samples)
    ]

print(f"Shape of the Euclidean distance matrix: {euclidean_distances_batch.shape}")


Shape of the Euclidean distance matrix: (100, 100)


For 20 NG Dataset:

Cosine Similarity using a Library (sklearn.metrics.pairwise)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

#Compute pairwise cosine similarities between the documents (the rows of tfidf_matrix).
#The result is square: one row and one column per document.
cosine_similarities = cosine_similarity(tfidf_matrix)

#Print the shape of the similarity matrix
print(f"Shape of the cosine similarity matrix: {cosine_similarities.shape}")

Shape of the cosine similarity matrix: (18846, 18846)


In [32]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load 20 Newsgroups data
# NOTE: text_data, vectorizer and tfidf_matrix assigned below reuse names that
# earlier cells of this notebook also assign, so their previous values are replaced.
newsgroups = fetch_20newsgroups(subset='all')
text_data = newsgroups.data

#Create the TF-IDF matrix (English stop words removed, vocabulary capped via max_features)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Use 5000 most frequent terms
tfidf_matrix = vectorizer.fit_transform(text_data)

#Function for computing cosine similarity in batches
def cosine_similarity_batch(tfidf_matrix, batch_size=300):
    """Compute the full pairwise cosine similarity matrix in row batches.

    Args:
        tfidf_matrix: Matrix with one document per row (sparse or dense).
        batch_size: Number of rows compared against the full matrix per step;
            must be >= 1.

    Returns:
        Dense array of shape ``(num_samples, num_samples)`` whose entry
        ``[i, j]`` is the cosine similarity between rows ``i`` and ``j``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return zeros.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_samples = tfidf_matrix.shape[0]
    cosine_similarities_batch = np.zeros((num_samples, num_samples))

    #Process the documents in smaller batches to reduce memory usage
    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)

        #cosine_similarity accepts sparse input and, with dense_output=True,
        #returns a dense block. Passing the matrix as-is avoids converting the
        #whole matrix to a dense array on every iteration.
        cosine_similarities_batch[start_idx:end_idx, :] = cosine_similarity(
            tfidf_matrix[start_idx:end_idx], tfidf_matrix, dense_output=True
        )

    return cosine_similarities_batch

#Compute cosine similarity matrix in batches (500 rows per batch instead of the default 300)
cosine_similarities_batch = cosine_similarity_batch(tfidf_matrix, batch_size=500)

#Print the shape of the cosine similarity matrix
print(f"Shape of the cosine similarity matrix (batch): {cosine_similarities_batch.shape}")


Shape of the cosine similarity matrix (batch): (18846, 18846)
