In [1]:
import pandas as pd
import numpy as np
import time
df = pd.read_csv('Asgmnt1_data.txt', delimiter= '\s+', header=None)
mean = df.mean(axis=1)
std = df.std(axis=1)
#z-normalize the data -row
df_normalized_row = df.sub(mean, axis=0).div(std, axis=0)
#data after normalization -row
df_normalized_row
#Coverting to numpy array for making the computation faster
df_normalized_via_row = df_normalized_row.to_numpy()
df_normalized_via_row

array([[ 0.4911766 ,  0.3636397 ,  0.37809822, ...,  0.41535439,
         0.64793504,  0.50583951],
       [ 1.39632783,  1.43178836,  1.38412021, ...,  0.84542555,
         1.18690811,  0.67765512],
       [-0.58058685, -0.36755852, -0.0866899 , ..., -0.72934593,
        -0.33252614, -0.44072337],
       ...,
       [ 1.42372546,  0.65207554,  2.04933949, ...,  0.43594731,
         1.06606527,  1.05473797],
       [ 1.39222862,  0.99039621,  1.04004292, ...,  1.51321035,
         1.5725976 ,  1.44654169],
       [-0.82101825, -1.67915237, -0.42537814, ...,         nan,
                nan,         nan]])

In [3]:
# Get the number of rows in the dataset
num_rows = df_normalized_row.shape[0]

# Initialize an empty matrix to store pairwise distances
distance_matrix = np.zeros((num_rows, num_rows))

# Start timing
start_time = time.time()

# Loop over each pair of rows and calculate the Euclidean distance
for i in range(num_rows):
    for j in range(i+1, num_rows):
        # Calculate Euclidean distance between row i and row j
        distance = np.sqrt(np.sum((df_normalized_via_row[i] - df_normalized_via_row[j])**2))
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance  # Symmetric matrix

# End timing
end_time = time.time()

# Calculate the total time taken
total_time = end_time - start_time

# Print the total time taken to compute the distances
print(f"Total time taken: {total_time} seconds")


Total time taken: 743.8335795402527 seconds


In [5]:
import numpy as np

def haar_matrix(n):
    if n == 1:
        return np.array([[1]])  # Base case: 1x1 matrix

    # Recursive call to build a smaller Haar matrix (n // 2 x n // 2)
    h = haar_matrix(n // 2)

    # Construct the larger Haar matrix using Kronecker products
    h_n = np.kron(h, [1, 1])       # Top part of the Haar matrix
    h_i = np.kron(np.eye(len(h)), [1, -1])  # Bottom part of the Haar matrix

    # Stack the two parts vertically
    h = np.vstack((h_n, h_i))

    h = np.where(np.abs(h) < 1e-10, 0,h)

    return h

# Example: Haar matrix of size 128
H_128 = haar_matrix(128)
print(H_128)

wavelet_transformed_data = np.dot(df_normalized_via_row, H_128.T)

# Extract the first 4 coefficients from each row of the transformed data
first_4_coeffs = wavelet_transformed_data[:, :4]

#Euclidean Distance
def calculate_distance_matrix(data):
    num_rows = data.shape[0]
    distance_matrix = np.zeros((num_rows, num_rows))

    # Calculate Euclidean distance manually
    for i in range(num_rows):
        for j in range(i+1, num_rows):
            distance = np.sqrt(np.sum((data[i] - data[j])**2))
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance  # Symmetric matrix

    return distance_matrix

start_time_haar = time.time()
distance_matrix_haar = calculate_distance_matrix(first_4_coeffs)
end_time_haar = time.time()
total_time_haar = end_time_haar - start_time_haar
print(f"Total time taken: {total_time_haar} seconds")

[[ 1.  1.  1. ...  1.  1.  1.]
 [ 1.  1.  1. ... -1. -1. -1.]
 [ 1.  1.  1. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ... -1.  0.  0.]
 [ 0.  0.  0. ...  0.  1. -1.]]
Total time taken: 642.1854093074799 seconds


In [6]:
import numpy as np
df = pd.read_csv('Asgmnt1_data.txt', delimiter= '\s+', header=None)
mean = df.mean()
std = df.std()
#z-normalize the data
df_normalized_column = (df - mean) / std

#data after normalization(column-wise)
df_normalized_column
#Normalized data
df_normalized_via_column = df_normalized_column.to_numpy()

#Calculate the covaraince Matrix
covariance_matrix = np.cov(df_normalized_via_column.T)

# Step 3: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

#Compute eigenvalues and eigenvectores
sorted_indices = np.argsort(eigenvalues)[::-1]  # Sort in decreasing order
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

#Taking the four best component
projection_matrix = sorted_eigenvectors[:, 4]

# Step 6: Transform the data
pca_transformed_data = np.dot(df_normalized_via_column, projection_matrix)

#Start Timing
start_time_pca = time.time()

#Calculating the euclidean distance
distance_matrix_pca = calculate_distance_matrix(pca_transformed_data)

#End Timing
end_time_pca = time.time()

#Calculate the total time taken
total_time_pca = end_time_pca - start_time_pca

# Print the total time taken to compute the distances
print(f"Total time taken: {total_time_pca} seconds")






Eigenvalues:
[15.75853108 15.49237748 14.64556782 15.23780446 14.93670024 14.98608171
 15.03749835 15.09203417  0.06702205  0.04730519  0.06625801  0.06559853
  0.06551893  0.04776062  0.04816555  0.04837356  0.06554934  0.06514276
  0.06501041  0.06453494  0.04869488  0.04880068  0.06429286  0.06401076
  0.06371539  0.06365308  0.04906479  0.04926638  0.04947537  0.0496184
  0.0633893   0.06327601  0.04987207  0.04998513  0.0631074   0.06302514
  0.05013238  0.05031353  0.06277233  0.06250616  0.06250895  0.05056675
  0.05072494  0.05077392  0.06223782  0.06211791  0.05093404  0.06181665
  0.05111389  0.06164216  0.05133244  0.06160703  0.06141289  0.05135011
  0.05143669  0.06120095  0.05159326  0.05179837  0.06112103  0.0611281
  0.06107008  0.05191324  0.0608873   0.06077231  0.06070176  0.06060993
  0.05225682  0.06044081  0.05231422  0.06016603  0.06003234  0.06009866
  0.0525527   0.05263988  0.05272699  0.0596395   0.05296504  0.0593632
  0.05931893  0.05923106  0.05308891  0.

In [7]:
import numpy as np
import random

#Number of random pairs for comparing
num_pairs = 1600000
size = distance_matrix.shape[0]  # Assuming square matrices

# Function to compare pairs and count valid relationships
def check_relationships(euclidean, haar, pca, num_pairs):
    count_haar = 0
    count_pca = 0

    for _ in range(num_pairs):
        # Randomly pick two pairs of indices (i, j) and (k, l)
        i, j = random.randint(0, size-1), random.randint(0, size-1)
        k, l = random.randint(0, size-1), random.randint(0, size-1)

        # Get the Euclidean relationship
        euclidean_relationship = euclidean[i,j] > euclidean[k,l]
        haar_relationship = haar[i,j] > haar[k,l]
        pca_relationship = pca[i,j] > pca[k,l]
        # Check if the Haar relationship matches the Euclidean relationship
        if haar_relationship == euclidean_relationship:
            count_haar += 1

        # Check if the PCA relationship matches the Euclidean relationship
        if pca_relationship == euclidean_relationship:
            count_pca += 1

    # Compute the percentage of times the relationships hold
    haar_percentage = (count_haar / num_pairs) * 100
    pca_percentage = (count_pca / num_pairs) * 100

    return haar_percentage, pca_percentage

# Call the function and get the results
haar_percentage, pca_percentage = check_relationships(distance_matrix, distance_matrix_haar, distance_matrix_pca, num_pairs)

# Display the result
print(f"Haar holds {haar_percentage}%.")
print(f"PCA  holds {pca_percentage}%.")


Haar holds 60.4576875%.
PCA  holds 55.2965%.
