In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances

# Load the tab-separated table from static/CAFs.txt into a DataFrame.
# Later cells treat the last column as labels and all other columns as features.
expr = pd.read_csv('static/CAFs.txt', sep='\t')

In [20]:
# Last column -> labels (the -1: slice keeps them as a 2-D column);
# every other column -> features.
Y_train = expr.values[:, -1:]
# Apply a log(x + 1) transform to the feature columns.
X_train = np.log(expr.values[:, :-1] + 1)
n = len(X_train)  # number of samples (rows)
print("This dataset contains {0} samples".format(n))
print("\nDimensions of the data points: {0}, labels: {1}".format(X_train.shape, Y_train.shape))

This dataset contains 716 samples

Dimensions of the data points: (716, 557), labels: (716, 1)


In [21]:
# Matrix of the squared Euclidean distances between all pairs of samples.
# squared=True makes scikit-learn return the squared distances directly,
# avoiding the sqrt-then-square round trip and its extra rounding error.
dist = euclidean_distances(X_train, X_train, squared=True)

In [30]:
# Squared Euclidean distance from each sample to its nearest neighbor.
# Every row of `dist` contains the sample's own zero self-distance, so the
# second-smallest entry of the row is the nearest-neighbor distance.
# np.partition puts that entry at index 1 without fully sorting each row;
# list(...) keeps `rho` a plain Python list of NumPy scalars, as before.
rho = list(np.partition(dist, 1, axis=1)[:, 1])

In [32]:
# Sanity check: top-left 4x4 corner of the distance matrix, then the
# nearest-neighbor distances of the same four samples.
print(dist[:4, :4])
print()
print(rho[:4])

[[   0.          914.95016311 1477.46836099 3036.91172176]
 [ 914.95016311    0.         1307.39294642 2960.41559961]
 [1477.46836099 1307.39294642    0.         2678.34442573]
 [3036.91172176 2960.41559961 2678.34442573    0.        ]]

[805.2464562222542, 652.4022952321459, 1036.9011547563534, 1244.8783774968015]


In [34]:
def prob_high_dim(sigma, dist_row):
    """
    High-dimensional probabilities for one row of the distance matrix.

    `dist_row` is used as an index into the notebook-level `dist` matrix and
    `rho` list. The selected row of distances is shifted down by
    rho[dist_row], negative results are clamped to zero, and
    exp(-shifted / sigma) is returned (1D array).
    """
    shifted = np.maximum(dist[dist_row] - rho[dist_row], 0)
    return np.exp(-shifted / sigma)

In [35]:
def k(prob):
    """
    Turn a 1D array of high-dimensional probabilities into the
    corresponding n_neighbor value (scalar): 2 raised to the sum of `prob`.
    """
    total_prob = np.sum(prob)
    return np.power(2, total_prob)

In [None]:
def sigma_binary_search(k_of_sigma, fixed_k, sigma_lower_limit=0,
                        sigma_upper_limit=1000, max_iter=20, tol=1e-5):
    """
    Solve equation k_of_sigma(sigma) = fixed_k
    with respect to sigma by the binary search algorithm.

    Parameters
    ----------
    k_of_sigma : callable
        Function of a single argument sigma. The search raises the lower
        bound whenever k_of_sigma(sigma) < fixed_k, so it assumes the
        function is non-decreasing in sigma.
    fixed_k : number
        Target value for k_of_sigma.
    sigma_lower_limit, sigma_upper_limit : number, optional
        Initial search interval (defaults 0 and 1000).
    max_iter : int, optional
        Maximum number of bisection steps (default 20).
    tol : float, optional
        The search stops early once |fixed_k - k_of_sigma(sigma)| <= tol
        (default 1e-5).

    Returns
    -------
    The last midpoint that was evaluated (or the initial midpoint when
    max_iter is 0).
    """
    # Defined up front so the return value exists even if the loop never runs.
    approx_sigma = (sigma_lower_limit + sigma_upper_limit) / 2
    for _ in range(max_iter):
        approx_sigma = (sigma_lower_limit + sigma_upper_limit) / 2
        # Evaluate once per step and reuse the value for both the bracket
        # update and the convergence check.
        k_value = k_of_sigma(approx_sigma)
        if k_value < fixed_k:
            sigma_lower_limit = approx_sigma
        else:
            sigma_upper_limit = approx_sigma
        if np.abs(fixed_k - k_value) <= tol:
            break
    return approx_sigma