In [1]:
import numpy as np
import setigen as stg
from blimpy import Waterfall
import matplotlib.pyplot as plt
import random
import os
from astropy import units as u
from tqdm import tqdm
from sklearn.metrics import silhouette_score
import tensorflow as tf

os.environ["CUDA_VISIBLE_DEVICES"]="0"
num_classes = 100
num_samples_per_class = 1000


2023-09-07 06:42:11.040402: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def painting(data):
    all_data = []
    labels = []
    for c in range(num_classes):
        drift = 2*random.random()*(-1)**random.randint(0,2)
        snr = random.randint(100, 150)
        width = random.randint(20, 50)
        for s in range(num_samples_per_class):
            index = random.randint(0, data.shape[0]-1)
            window = data[index, :,:]
            
            start = random.randint(50, 180)
            
            frame = stg.Frame.from_data(df=2.7939677238464355*u.Hz,
                                        dt=18.253611008*u.s,
                                        fch1=1289*u.MHz,
                                        ascending=True,
                                        data=window)
            frame.add_signal(stg.constant_path(
                                        f_start=frame.get_frequency(index=start),
                                       drift_rate=drift*u.Hz/u.s),
                                      stg.constant_t_profile(level=frame.get_intensity(snr=snr)),
                                      stg.gaussian_f_profile(width=width*u.Hz),
                                      stg.constant_bp_profile(level=1))
            all_data.append(frame.data)
            labels.append(c)
    all_data = np.array(all_data)
    labels = np.vstack(labels)
    return all_data, labels

In [3]:
import cv2
import numpy as np

def transform(data):
    final = []
    for i in range(data.shape[0]):
        img = np.repeat(data[i,:, :], 3, axis=2)
        res = cv2.resize(img, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
        final.append(res)
    return np.array(final)


In [4]:
from tqdm import tqdm
import cv2 as cv
def SIFT(data):
    clusters = 800
    centroids = np.load("centroids.npy")
    sift = cv.SIFT_create()
    bf = cv.BFMatcher()
    # bf = cv.DescriptorMatcher_create(cv.DescriptorMatcher_FLANNBASED)
    histograms = []
    for i in range(data.shape[0]):
        counter = np.zeros((clusters,), dtype=np.uint32)
        temp =  data[i, :, :]*255
        temp = temp.astype('uint8')
        kp, des = sift.detectAndCompute(temp, None)
        if des is None:
            histograms.append(counter)
            continue
        matches = bf.knnMatch(des, centroids, k=1)
        for match in matches:
            counter[match[0].trainIdx] += 1
        counter_sum = np.sum(counter)
        counter = [float(n)/counter_sum for n in counter]
        histograms.append(counter)
        
    return np.float32(histograms)

In [5]:
from tqdm import tqdm
import gc

def normalize(data):
    epsilon = 1
    min_val = data.min()
    data = data - min_val + epsilon
    new_data = np.log(data)
    min_val = data.min()
    max_val = data.max()
    final_data = (data - min_val) / (max_val - min_val)
    return final_data
    
def normalize_data(data):
    for i in range(data.shape[0]):
        data[i,:,:] = normalize(data[i,:,:, :])
    return data


In [6]:
# from tqdm import tqdm
# directory = os.fsencode( "../../../../../datax/scratch/pma/reverse_search/test/")
# count = 0
# data = []
# for folder in os.listdir(directory):
#     print(folder)
#     for subfolder in os.listdir(directory+folder):
#         back = os.fsencode( "/")
#         if '.' not in str(subfolder):
#             for file in os.listdir(directory+folder+back+subfolder):
#                 file_directory = str(os.path.join(directory+folder+back+subfolder, file)).replace('b', '').replace("'","")
#                 if 'filtered.npy' in file_directory:
#                     data.append(np.load(str(file_directory)))
#                     count += 1
# data = np.vstack(data)


# des_vecs = []
# kps_vecs = []
# sift = cv.SIFT_create()

# for i in tqdm(range(data.shape[0])):
#     temp =  normalize(np.expand_dims(data[i, :,:], axis = -1)) *255
#     temp = temp.astype('uint8')
#     print(temp.shape)
#     kp, des = sift.detectAndCompute(temp, None)
#     des_vecs.append(des)
#     kps_vecs.append(kp)

# descriptors = np.vstack(list(filter(lambda x: x is not None, des_vecs)))
# clusters = 800
# criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 5, .01)
# centroids = cv.kmeans(descriptors, clusters, None, criteria, 1, cv.KMEANS_PP_CENTERS)[2]
# np.save("centroids.npy", centroids)

In [7]:
def measure_cluster_size(x, labels):
    get_labels = list(set(labels))
    mean_cluster_spread = []
    for l in get_labels:
        index = np.asarray(labels==l)
        x_features = x[index, :]
        mean_centroid = np.mean(x_features, axis = 0)
        diff = x_features - mean_centroid
        norms = np.linalg.norm(diff, axis = 1)
        mean_cluster_spread.append(np.max(norms)/np.mean(norms))
        # print(x_features.shape, mean_centroid.shape, norms.shape)
    return np.mean(mean_cluster_spread), np.std(mean_cluster_spread)

In [8]:
import os
from tqdm import tqdm
total_scores = []
for i in tqdm(range(10)):
    directory = os.fsencode( "../../../../../datax/scratch/pma/reverse_search/test/")
    count = 0
    data = []
    for folder in os.listdir(directory):
        print(folder)
        for subfolder in os.listdir(directory+folder):
            back = os.fsencode( "/")
            if '.' not in str(subfolder):
                for file in os.listdir(directory+folder+back+subfolder):
                    file_directory = str(os.path.join(directory+folder+back+subfolder, file)).replace('b', '').replace("'","")
                    if 'filtered.npy' in file_directory:
                        data.append(np.load(str(file_directory)))
                        count += 1
    data = np.vstack(data)
    print(data.shape)
    injected, labels = painting(data)
    
    print(injected.shape)
    
    input_data = np.expand_dims(injected, axis = -1)
    del data
    gc.collect()
    input_data = normalize_data(input_data)
    print(input_data[0,:,:].max(), input_data[0,:,:].min())
    print(input_data.shape)
    
    des_vecs = []
    kps_vecs = []
    sift = cv.SIFT_create()

    for i in range(input_data.shape[0]):
        temp =  input_data[i, :, :, :] *255
        temp = temp.astype('uint8')
        kp, des = sift.detectAndCompute(temp, None)
        des_vecs.append(des)
        kps_vecs.append(kp)

    descriptors = np.vstack(list(filter(lambda x: x is not None, des_vecs)))
    clusters = 800
    criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 5, .01)
    centroids = cv.kmeans(descriptors, clusters, None, criteria, 1, cv.KMEANS_PP_CENTERS)[2]
    np.save("centroids.npy", centroids)
    
    features = SIFT(input_data[:,:,:,0])
    print(features.shape)
    score, spread = measure_cluster_size(x = features, labels = labels[:, 0])
    print("SCORE IS: ", score, spread)
    total_scores.append(score)

  0%|                                                    | 0/10 [00:00<?, ?it/s]

b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 10%|████                                     | 1/10 [07:43<1:09:27, 463.01s/it]

(100000, 800)
SCORE IS:  2.9166765 1.295244
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 20%|████████▏                                | 2/10 [15:34<1:02:24, 468.02s/it]

(100000, 800)
SCORE IS:  3.0570383 1.5321516
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 30%|████████████▉                              | 3/10 [23:09<53:55, 462.22s/it]

(100000, 800)
SCORE IS:  3.259152 1.3618685
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 40%|█████████████████▏                         | 4/10 [30:44<45:55, 459.23s/it]

(100000, 800)
SCORE IS:  3.107264 1.3904837
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 50%|█████████████████████▌                     | 5/10 [38:15<38:01, 456.26s/it]

(100000, 800)
SCORE IS:  2.9518628 1.2252791
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 60%|█████████████████████████▊                 | 6/10 [45:34<30:01, 450.45s/it]

(100000, 800)
SCORE IS:  3.3328233 1.793119
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 70%|██████████████████████████████             | 7/10 [52:59<22:25, 448.46s/it]

(100000, 800)
SCORE IS:  3.02143 1.2757397
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 80%|████████████████████████████████▊        | 8/10 [1:00:27<14:57, 448.61s/it]

(100000, 800)
SCORE IS:  3.2249622 1.8557835
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


 90%|████████████████████████████████████▉    | 9/10 [1:07:50<07:26, 446.69s/it]

(100000, 800)
SCORE IS:  2.9981163 1.1894457
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 16, 256)
1.0 0.0
(100000, 16, 256, 1)


100%|████████████████████████████████████████| 10/10 [1:15:11<00:00, 451.19s/it]

(100000, 800)
SCORE IS:  3.0566323 1.149919





In [9]:
print(np.mean(total_scores))
print(np.std(total_scores))

3.0925956
0.13057114
