In [1]:
import numpy as np
import setigen as stg
from blimpy import Waterfall
import matplotlib.pyplot as plt
import random
from astropy import units as u
from tqdm import tqdm
import os
from sklearn.metrics import silhouette_score
import pandas as pd
import tqdm

# Size of the synthetic dataset built by painting(): one (drift, snr, width)
# draw per class, and this many injected frames for each class.
num_classes = int(100)
num_samples_per_class = int(1_000)

def painting(data, n_classes=None, n_samples_per_class=None):
    """Build a labelled set of frames by injecting synthetic signals into ``data``.

    For every class one drift rate, one SNR and one signal width are drawn at
    random. Each sample of that class takes a randomly chosen window
    ``data[index]``, wraps it in a ``stg.Frame`` and adds a constant-drift
    signal with the class parameters, starting at a random frequency channel.

    Parameters
    ----------
    data : numpy.ndarray
        3-D stack of background windows, indexed along axis 0. Each
        ``data[index]`` is passed to ``stg.Frame.from_data`` as one frame.
    n_classes : int, optional
        Number of classes to generate. Defaults to the module-level
        ``num_classes``.
    n_samples_per_class : int, optional
        Number of injected frames per class. Defaults to the module-level
        ``num_samples_per_class``.

    Returns
    -------
    all_data : numpy.ndarray
        The injected frames stacked along axis 0, ordered class by class.
    labels : numpy.ndarray
        Column vector of shape ``(n_classes * n_samples_per_class, 1)`` holding
        the class index of each frame.
    """
    if n_classes is None:
        n_classes = num_classes
    if n_samples_per_class is None:
        n_samples_per_class = num_samples_per_class

    all_data = []
    labels = []
    for c in range(n_classes):
        # random.randint is inclusive on both ends, so the former
        # (-1)**random.randint(0, 2) produced a positive sign two times out of
        # three. Draw the sign uniformly instead.
        sign = random.choice((-1, 1))
        drift = 2 * random.random() * sign
        snr = random.randint(100, 150)
        width = random.randint(20, 50)
        for s in range(n_samples_per_class):
            index = random.randint(0, data.shape[0] - 1)
            # Work on a copy: the slice is a view into ``data``, and the frame
            # may keep a reference to the array it is handed and add the signal
            # in place. Without the copy the caller's backgrounds could be
            # modified, and windows drawn more than once would accumulate
            # several signals.
            window = data[index, :, :].copy()

            # Channel index at which the injected signal starts.
            start = random.randint(50, 180)

            frame = stg.Frame.from_data(df=2.7939677238464355*u.Hz,
                                        dt=18.253611008*u.s,
                                        fch1=1289*u.MHz,
                                        ascending=True,
                                        data=window)
            frame.add_signal(stg.constant_path(
                                        f_start=frame.get_frequency(index=start),
                                        drift_rate=drift*u.Hz/u.s),
                             stg.constant_t_profile(level=frame.get_intensity(snr=snr)),
                             stg.gaussian_f_profile(width=width*u.Hz),
                             stg.constant_bp_profile(level=1))
            all_data.append(frame.data)
            labels.append(c)
    all_data = np.array(all_data)
    # Column vector, so callers can keep indexing labels[:, 0].
    labels = np.asarray(labels).reshape(-1, 1)
    return all_data, labels

In [2]:
def measure_cluster_size(x, labels):
    """Summarise how spread out each labelled cluster is around its centroid.

    For every distinct label the rows of ``x`` carrying that label are
    collected, their centroid (mean row) is computed, and the Euclidean
    distance of each row to the centroid is measured. The per-cluster figure
    is ``max(distance) / mean(distance)``.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix, one row per sample.
    labels : array-like, n_samples entries
        Cluster label of each row. Lists, 1-D arrays and ``(n_samples, 1)``
        column vectors are all accepted.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-cluster max/mean distance ratios.

    Notes
    -----
    A cluster whose rows all coincide with the centroid has a mean distance of
    zero, so its ratio is NaN and the returned statistics become NaN as well.
    """
    x = np.asarray(x)
    # Normalise to a flat ndarray so that ``labels == label`` is an elementwise
    # mask; with a plain list the comparison collapses to a single bool.
    labels = np.asarray(labels).ravel()

    ratios = []
    for label in np.unique(labels):
        members = x[labels == label, :]
        centroid = np.mean(members, axis=0)
        distances = np.linalg.norm(members - centroid, axis=1)
        ratios.append(np.max(distances) / np.mean(distances))
    return np.mean(ratios), np.std(ratios)

In [3]:
# The import cell ends with ``import tqdm``, which rebinds the name to the
# module; re-import the callable progress bar here.
from tqdm import tqdm

DATA_ROOT = "../../../../../datax/scratch/pma/reverse_search/test/"
NUM_TRIALS = 10
# Only the first EVAL_SAMPLES frames are scored. painting() emits frames class
# by class, so with 1000 samples per class this covers the first 50 classes.
EVAL_SAMPLES = 50000
SCORE_LOG = 'output1.txt'

num_classes = 100
num_samples_per_class = 1000


def _load_filtered_windows(root):
    """Stack every array whose path contains 'filtered.npy' under root/<folder>/<subfolder>/.

    Sub-folder entries with a dot in their name are skipped. Paths are handled
    as ``str`` throughout; the earlier bytes -> ``str(...)`` -> ``replace('b', '')``
    round trip removed every letter "b" from the path.
    """
    arrays = []
    for folder in sorted(os.listdir(root)):
        print(folder)
        folder_path = os.path.join(root, folder)
        if not os.path.isdir(folder_path):
            continue
        for subfolder in sorted(os.listdir(folder_path)):
            subfolder_path = os.path.join(folder_path, subfolder)
            if '.' in subfolder or not os.path.isdir(subfolder_path):
                continue
            for file in sorted(os.listdir(subfolder_path)):
                file_path = os.path.join(subfolder_path, file)
                if 'filtered.npy' in file_path:
                    arrays.append(np.load(file_path))
    if not arrays:
        raise FileNotFoundError(
            "no 'filtered.npy' files found under {}".format(root))
    return np.vstack(arrays)


for trial in tqdm(range(NUM_TRIALS)):
    # Reloaded on every trial so each one starts from untouched backgrounds,
    # whether or not signal injection writes into the array it is given.
    data = _load_filtered_windows(DATA_ROOT)
    print(data.shape)
    injected, labels = painting(data)

    # One flat feature vector per injected frame.
    features = injected.reshape(injected.shape[0], -1)

    print(features.shape)
    print(labels[:, 0].shape)
    # sklearn's silhouette_score(X=..., labels=...) on the same slices is the
    # alternative metric that was tried here.
    score, spread = measure_cluster_size(features[:EVAL_SAMPLES, :],
                                         labels=labels[:EVAL_SAMPLES, 0])
    print(score, spread)

    # Append one score per line so the log can be parsed afterwards.
    with open(SCORE_LOG, 'a') as f:
        f.write('{}\n'.format(score))

  0%|                                                    | 0/10 [00:00<?, ?it/s]

b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 10%|████▎                                      | 1/10 [02:15<20:15, 135.01s/it]

122.4324457975884 58.118815396245
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 20%|████████▌                                  | 2/10 [04:32<18:12, 136.52s/it]

132.5168487238469 70.83907666749892
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 30%|████████████▉                              | 3/10 [06:48<15:53, 136.26s/it]

120.1850844700792 33.473290299680215
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 40%|█████████████████▏                         | 4/10 [09:04<13:37, 136.21s/it]

116.33955095509981 34.089443862140364
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 50%|█████████████████████▌                     | 5/10 [11:22<11:23, 136.68s/it]

137.11093593383373 76.7686301989164
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 60%|█████████████████████████▊                 | 6/10 [13:38<09:06, 136.68s/it]

143.2654383639055 81.043132118133
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 70%|██████████████████████████████             | 7/10 [15:56<06:51, 137.01s/it]

123.05365671027742 57.02763225570784
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 80%|██████████████████████████████████▍        | 8/10 [18:12<04:33, 136.70s/it]

131.43524178137494 69.88256367808573
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


 90%|██████████████████████████████████████▋    | 9/10 [20:30<02:16, 136.93s/it]

131.67456969352017 61.5258195021225
b'HIP104887-1850'
b'HIP87579-1008'
b'clustering_tests'
(347064, 16, 256)
(100000, 4096)
(100000,)


100%|██████████████████████████████████████████| 10/10 [22:46<00:00, 136.64s/it]

132.25276662096422 68.93586983725231





In [4]:
# Ten per-run scores entered by hand; presumably silhouette scores from an
# earlier run -- TODO confirm provenance.
scores_list = [
    -0.25901990191348734,
    -0.47767020929706805,
    -0.3672917473966914,
    -0.34257881331263146,
    -0.34405826318845145,
    -0.45805348261169165,
    -0.3781393168000071,
    -0.3109327536798293,
    -0.37308408003121096,
    -0.31608956544911326,
]
# Mean on the first line, standard deviation on the second.
print(np.mean(scores_list), np.std(scores_list), sep="\n")

-0.3626918133680182
0.062475627577611874


In [7]:
# Max/mean cluster-distance ratios printed by the ten evaluation trials.
# Defined once so the mean and the standard deviation are guaranteed to be
# computed over the same values.
cluster_ratio_scores = [
    122.4324457975884,
    132.5168487238469,
    120.1850844700792,
    116.33955095509981,
    137.11093593383373,
    143.2654383639055,
    123.05365671027742,
    131.43524178137494,
    131.67456969352017,
    132.25276662096422,
]
print(np.mean(cluster_ratio_scores))
print(np.std(cluster_ratio_scores))

129.02665390504905
7.885409455987227
