In [1]:
from datasets import load_dataset

# Build the dataset dict from the project's loading script, caching the
# prepared files under `cache_dir`.
dataset = load_dataset('../../astroclip/datasets/legacy_survey.py', 
                       cache_dir='/mnt/ceph/users/sgolkar/datasets')

# Drop the image and redshift features (this cell only keeps the spectrum)
dataset = dataset.remove_columns(["image", "redshift"])

# Split the test set 50/50 into test and val.
# The split is positional: the first half of the rows stays in 'test', the
# second half becomes 'val' (which gets the extra row when the length is odd).
# NOTE(review): no shuffle is applied before the split — confirm the rows of
# the original test split are not ordered in a way that biases the two halves.
test_half_length = len(dataset['test']) // 2
test_data = dataset['test'].select(indices=range(test_half_length))
val_data = dataset['test'].select(indices=range(test_half_length, len(dataset['test'])))

# modify the dataset dict to reflect the split
dataset['test'] = test_data
dataset['val'] = val_data

# Rename the spectrum column to 'x', the column name the later cells read and write.
dataset = dataset.rename_column('spectrum', 'x')

In [5]:
# Persist the (not yet standardized) dataset dict to disk.
# NOTE(review): a later cell saves the standardized `ds` to this exact same
# path, so whatever is written here is meant to be replaced — confirm that
# overwriting is intended.
path = "/mnt/ceph/users/sgolkar/datasets/astroclip/spectrum_ds"
dataset.save_to_disk(path)

Saving the dataset (0/20 shards):   0%|          | 0/158380 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/19798 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/19798 [00:00<?, ? examples/s]

In [15]:
import numpy as np
def fnc(samples):
    """Standardize every spectrum in a batch with fixed constants.

    Each entry of ``samples["x"]`` is converted to a NumPy array, shifted by
    7.3 and divided by 8.2.

    Parameters
    ----------
    samples : mapping
        Batch with an ``"x"`` entry holding one array-like per sample.

    Returns
    -------
    dict
        ``{"x": [...]}`` with one rescaled NumPy array per input sample,
        in the same order.
    """
    # NOTE(review): 7.3 / 8.2 look like a global offset and scale for the
    # spectra; their origin is not shown in this notebook — confirm.
    rescaled = [(np.array(spectrum) - 7.3) / 8.2 for spectrum in samples["x"]]
    return {"x": rescaled}

# Apply `fnc` to every split of the dataset dict.
ds = dataset.map(
    fnc,
    batched=True,                 # fnc receives a dict of column -> list of values
    num_proc=30,                  # number of worker processes
    batch_size=100,               # rows handed to fnc per call
    load_from_cache_file=False,   # always recompute instead of reusing a cached result
)

Map (num_proc=30):   0%|          | 0/158380 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/19798 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/19798 [00:00<?, ? examples/s]

In [16]:
# Persist the standardized dataset dict `ds`.
# NOTE(review): this is the same path an earlier cell used for the
# un-standardized `dataset`; the earlier contents are replaced.
path = "/mnt/ceph/users/sgolkar/datasets/astroclip/spectrum_ds"
ds.save_to_disk(path)

Saving the dataset (0/20 shards):   0%|          | 0/158380 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/19798 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/19798 [00:00<?, ? examples/s]

### Making the joint redshift and spectrum dataset, standardized and chunked

In [3]:
# Loading the dataset

from datasets import load_dataset

# Build the dataset dict from the project's loading script, caching the
# prepared files under `cache_dir`.
dataset = load_dataset('../../astroclip/datasets/legacy_survey.py', 
                       cache_dir='/mnt/ceph/users/sgolkar/datasets')

# Drop only the image feature; redshift is kept because the cells below use it
dataset = dataset.remove_columns(["image"])

# Split the test set 50/50 into test and val.
# The split is positional: the first half of the rows stays in 'test', the
# second half becomes 'val' (which gets the extra row when the length is odd).
# NOTE(review): no shuffle is applied, and the per-split statistics printed
# further down in this notebook differ noticeably between 'test' and 'val' —
# confirm whether the original test split is ordered and whether a seeded
# shuffle is wanted before splitting.
test_half_length = len(dataset['test']) // 2
test_data = dataset['test'].select(indices=range(test_half_length))
val_data = dataset['test'].select(indices=range(test_half_length, len(dataset['test'])))

# modify the dataset dict to reflect the split
dataset['test'] = test_data
dataset['val'] = val_data

In [2]:
# Calculating the per-spectrum mean and std distributions

import numpy as np

def fnc(samples):
    """Compute the standard deviation and mean of every spectrum in a batch.

    Parameters
    ----------
    samples : mapping
        Batch with a ``"spectrum"`` entry holding one array-like per sample.

    Returns
    -------
    dict
        ``{"std": [...], "mean": [...]}`` with one scalar per input sample,
        each taken over all elements of that sample's spectrum.
    """
    spectra = [np.array(raw) for raw in samples["spectrum"]]
    per_sample_std = [spectrum.std() for spectrum in spectra]
    per_sample_mean = [spectrum.mean() for spectrum in spectra]
    return {"std": per_sample_std, "mean": per_sample_mean}
# Replace the spectrum/redshift columns of every split with the per-sample
# 'std' and 'mean' columns produced by `fnc`.
ds = dataset.map(
    fnc,
    batched=True,                 # fnc receives a dict of column -> list of values
    num_proc=30,                  # number of worker processes
    batch_size=100,               # rows handed to fnc per call
    load_from_cache_file=False,   # always recompute instead of reusing a cached result
    remove_columns=["spectrum", "redshift"]
)

# Per-sample standard deviations, one array per split.
train_stds = np.array(ds['train']['std'])
val_stds = np.array(ds['val']['std'])
test_stds = np.array(ds['test']['std'])

# Mean and spread of the per-sample stds; values are printed in the order
# train, test, val.
print(train_stds.mean(), test_stds.mean(), val_stds.mean())
print(train_stds.std(), test_stds.std(), val_stds.std())

# Per-sample means, one array per split.
train_means = np.array(ds['train']['mean'])
val_means = np.array(ds['val']['mean'])
test_means = np.array(ds['test']['mean'])

# Mean and spread of the per-sample means; same train, test, val order.
print(train_means.mean(), test_means.mean(), val_means.mean())
print(train_means.std(), test_means.std(), val_means.std())

Map (num_proc=30):   0%|          | 0/158380 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/19798 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/19798 [00:00<?, ? examples/s]

1.851979419144681 2.13490672906514 1.4055575607510395
28.964371440936596 8.890413099443752 3.8311471087170785
2.6224963266261474 3.9564168084707725 1.3600593376579933
3.5976514405913194 5.105478611235835 0.6251189150057348


In [3]:
# and the redshift stats

# Redshift values of each split as a NumPy array, keyed by split name.
rs = {split: np.array(dataset[split]['redshift']) for split in dataset.keys()}

# Mean and standard deviation of the redshift, per split.
rs_means = {split: rs[split].mean() for split in rs.keys()}
rs_stds = {split: rs[split].std() for split in rs.keys()}

print(rs_means)
print(rs_stds)

{'train': 0.2651120342603547, 'test': 0.20726879838914092, 'val': 0.3198322842947078}
{'train': 0.1621082109136878, 'test': 0.12090782935061481, 'val': 0.16460627380227752}


In [41]:
# defining the new dataset

import numpy as np

def slice(x, section_length=10, overlap=5):
    """Cut ``x`` into overlapping windows along its first axis.

    Windows start every ``section_length - overlap`` samples. Only complete
    windows are kept; a trailing partial window is discarded.

    NOTE: this function shadows the built-in ``slice`` for the rest of the
    notebook session. The name is kept because other cells call it.

    Parameters
    ----------
    x : array-like
        A 1-D sequence of length N or a 2-D array of shape (N, C).
        1-D input is treated as a single column, i.e. shape (N, 1).
    section_length : int
        Number of samples per window (at least 1).
    overlap : int
        Number of samples shared by consecutive windows; must be smaller
        than ``section_length`` so that the window start advances.

    Returns
    -------
    numpy.ndarray
        For 1-D or (N, 1) input: shape ``(n_windows, section_length)``, one
        window per row. In general the windows are concatenated along axis 1
        and the result is transposed. If ``x`` is too short to hold a single
        full window, an empty array of shape ``(0, section_length)``.

    Raises
    ------
    ValueError
        If ``section_length < 1`` or ``overlap >= section_length``.
    """
    if section_length < 1 or overlap >= section_length:
        raise ValueError(
            "section_length must be >= 1 and overlap must be smaller than "
            f"section_length (got section_length={section_length}, overlap={overlap})"
        )

    x = np.asarray(x)
    if x.ndim == 1:
        # Concatenating along axis 1 needs 2-D windows; view 1-D data as a column.
        x = x[:, None]

    step = section_length - overlap
    last_start = len(x) - section_length
    if last_start < 0:
        # Not even one complete window fits.
        return np.empty((0, section_length), dtype=x.dtype)

    # Enumerating starts up to `last_start` yields exactly the complete
    # windows, so no partial window is ever created.
    sections = [x[start:start + section_length]
                for start in range(0, last_start + 1, step)]

    return np.concatenate(sections, axis=1).T


def fnc(samples):
    """Turn a batch of (spectrum, redshift) pairs into chunked arrays.

    For every sample the spectrum is standardized with its own mean and
    standard deviation, cut into windows of length 10 with overlap 5 by
    ``slice``, and padded with one extra leading row and three extra leading
    columns of zeros. The first three cells of the extra row then receive
    rescaled versions of the spectrum mean, the spectrum std and the redshift.

    Samples whose spectrum has zero standard deviation are skipped.
    NOTE(review): skipping makes the returned batch shorter than the input
    batch — confirm no other column remains in the dataset when this is used
    with ``datasets.map``.

    Parameters
    ----------
    samples : mapping
        Batch with ``"spectrum"`` and ``"redshift"`` entries of equal length.
        Presumably each spectrum is a column vector of shape (N, 1), which is
        what ``slice`` concatenates along axis 1 — TODO confirm.

    Returns
    -------
    dict
        ``{"x": [...]}`` with one 2-D array per kept sample.
    """

    def encode(spectrum, redshift):
        flux = np.array(spectrum)
        sigma, mu = flux.std(), flux.mean()
        if sigma == 0:
            # constant spectrum (e.g. all zeros): cannot be standardized
            return None

        windows = slice((flux - mu) / sigma, 10, 5)
        grid = np.pad(windows, pad_width=((1, 0), (3, 0)),
                      mode='constant', constant_values=0)

        # Header row: fixed rescalings of the per-sample statistics.
        # NOTE(review): the offsets/scales look like rounded versions of the
        # statistics printed earlier in the notebook — confirm.
        grid[0, 0] = (mu - 2) / 2
        grid[0, 1] = (sigma - 2) / 8
        grid[0, 2] = (redshift - 0.2) / 0.15
        return grid

    encoded = (encode(spectrum, redshift)
               for spectrum, redshift in zip(samples["spectrum"], samples["redshift"]))
    return {"x": [grid for grid in encoded if grid is not None]}


# Replace the spectrum/redshift columns of every split with the chunked 'x'
# column produced by `fnc`.
ds = dataset.map(
    fnc,
    batched=True,                 # fnc receives a dict of column -> list of values
    num_proc=30,                  # number of worker processes
    batch_size=100,               # rows handed to fnc per call
    load_from_cache_file=False,   # always recompute instead of reusing a cached result
    remove_columns=["spectrum", "redshift"]
)


# Persist the chunked spectrum + redshift dataset.
path = "/mnt/ceph/users/sgolkar/datasets/astroclip/spec_rs_chunked_ds"
ds.save_to_disk(path)

Map (num_proc=30):   0%|          | 0/158380 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/19798 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/19798 [00:00<?, ? examples/s]

In [4]:
# defining the new dataset

import numpy as np

def slice(x, section_length=10, overlap=5):
    """Cut ``x`` into overlapping windows along its first axis.

    Windows start every ``section_length - overlap`` samples. Only complete
    windows are kept; a trailing partial window is discarded.

    NOTE: this function shadows the built-in ``slice`` for the rest of the
    notebook session. The name is kept because other cells call it.

    Parameters
    ----------
    x : array-like
        A 1-D sequence of length N or a 2-D array of shape (N, C).
        1-D input is treated as a single column, i.e. shape (N, 1).
    section_length : int
        Number of samples per window (at least 1).
    overlap : int
        Number of samples shared by consecutive windows; must be smaller
        than ``section_length`` so that the window start advances.

    Returns
    -------
    numpy.ndarray
        For 1-D or (N, 1) input: shape ``(n_windows, section_length)``, one
        window per row. In general the windows are concatenated along axis 1
        and the result is transposed. If ``x`` is too short to hold a single
        full window, an empty array of shape ``(0, section_length)``.

    Raises
    ------
    ValueError
        If ``section_length < 1`` or ``overlap >= section_length``.
    """
    if section_length < 1 or overlap >= section_length:
        raise ValueError(
            "section_length must be >= 1 and overlap must be smaller than "
            f"section_length (got section_length={section_length}, overlap={overlap})"
        )

    x = np.asarray(x)
    if x.ndim == 1:
        # Concatenating along axis 1 needs 2-D windows; view 1-D data as a column.
        x = x[:, None]

    step = section_length - overlap
    last_start = len(x) - section_length
    if last_start < 0:
        # Not even one complete window fits.
        return np.empty((0, section_length), dtype=x.dtype)

    # Enumerating starts up to `last_start` yields exactly the complete
    # windows, so no partial window is ever created.
    sections = [x[start:start + section_length]
                for start in range(0, last_start + 1, step)]

    return np.concatenate(sections, axis=1).T


def fnc(samples):
    """Turn a batch of spectra into chunked arrays (no redshift in the output).

    For every sample the spectrum is standardized with its own mean and
    standard deviation, cut into windows of length 20 with overlap 10 by
    ``slice``, and padded with one extra leading row and two extra leading
    columns of zeros. The first two cells of the extra row then receive
    rescaled versions of the spectrum mean and the spectrum std.

    The redshift column is not part of the output, so it is no longer read
    here (the previous version iterated over it without using the value).

    Samples whose spectrum has zero standard deviation are skipped.
    NOTE(review): skipping makes the returned batch shorter than the input
    batch — confirm no other column remains in the dataset when this is used
    with ``datasets.map``.

    Parameters
    ----------
    samples : mapping
        Batch with a ``"spectrum"`` entry holding one array-like per sample.
        Presumably each spectrum is a column vector of shape (N, 1), which is
        what ``slice`` concatenates along axis 1 — TODO confirm.

    Returns
    -------
    dict
        ``{"x": [...]}`` with one 2-D array per kept sample.
    """

    out = []

    for spectrum in samples["spectrum"]:
        x = np.array(spectrum)
        std, mean = x.std(), x.mean()
        # skipping samples that are constant (e.g. all zero): they cannot be standardized
        if std == 0:
            continue
        x = (x - mean) / std
        x = slice(x, 20, 10)
        x = np.pad(x, pad_width=((1, 0), (2, 0)), mode='constant', constant_values=0)

        # Header row: fixed rescalings of the per-sample statistics.
        # NOTE(review): the offsets/scales look like rounded versions of the
        # statistics printed earlier in the notebook — confirm.
        x[0, 0] = (mean - 2) / 2
        x[0, 1] = (std - 2) / 8

        out.append(x)

    return {"x": out}


# Replace the spectrum/redshift columns of every split with the chunked 'x'
# column produced by `fnc`.
# NOTE(review): the recorded output of this cell ends in a TimeoutError after
# the first progress bar, so the saved run did not complete — re-run and
# confirm the dataset at `path` was actually written.
ds = dataset.map(
    fnc,
    batched=True,                 # fnc receives a dict of column -> list of values
    num_proc=30,                  # number of worker processes
    batch_size=100,               # rows handed to fnc per call
    load_from_cache_file=False,   # always recompute instead of reusing a cached result
    remove_columns=["spectrum", "redshift"]
)


# Persist the chunked spectrum-only dataset.
path = "/mnt/ceph/users/sgolkar/datasets/astroclip/spec_chunked_ds"
ds.save_to_disk(path)

Map (num_proc=30):   0%|          | 0/158380 [00:00<?, ? examples/s]

TimeoutError: 