In [None]:
import os
import h5py as h5
import numpy as np
import pandas as pd

def _read_dataset(dataset):
    """Copy an h5py dataset into a freshly allocated NumPy array."""
    array = np.empty(dataset.shape, dtype=dataset.dtype)
    dataset.read_direct(array)
    return array


def _read_group(group):
    """Recursively convert an h5py group into a nested dict of NumPy arrays."""
    contents = {}
    for key in group.keys():
        node = group[key]
        if isinstance(node, h5.Dataset):
            contents[key] = _read_dataset(node)
        else:
            # Any depth of nesting is handled by recursing into sub-groups.
            contents[key] = _read_group(node)
    return contents


def load_data(folder, start_index=0, num_files_to_process=None):
    """Load HDF5 files from ``folder`` into nested dictionaries of arrays.

    Parameters
    ----------
    folder : str
        Directory whose entries are all opened as HDF5 files.
    start_index : int, default 0
        Position in the directory listing of the first file to load.
    num_files_to_process : int or None, default None
        Number of files to load from ``start_index``; ``None`` loads every
        remaining file.

    Returns
    -------
    dict
        ``{file_name: {top_level_key: contents}}``. For a top-level group,
        ``contents`` is a nested dict mirroring the group (datasets become
        NumPy arrays). For a top-level dataset, ``contents`` is
        ``{top_level_key: array}``; this wrapping is kept for compatibility
        with existing callers.

    Notes
    -----
    Files are taken in the order returned by ``os.listdir``, which is not
    guaranteed to be sorted. Callers that slice the listing with
    ``start_index`` rely on that order being stable between calls.
    """
    # List of names of all files in the folder
    files = os.listdir(folder)
    if num_files_to_process is not None:
        files = files[start_index:start_index + num_files_to_process]
    elif start_index:
        # start_index used to be ignored when no file count was given.
        files = files[start_index:]
    num_files = len(files)
    print(f"Number of files in the folder: {num_files}")

    # Dictionary to store the data, keyed by file name
    data = {}

    for i, file in enumerate(files, start=1):
        print(f"Processing file {i}/{num_files}...")
        with h5.File(os.path.join(folder, file), 'r') as f:
            file_data = {}
            for group_key in f.keys():
                node = f[group_key]
                if isinstance(node, h5.Dataset):
                    # Top-level datasets are wrapped in a one-entry dict.
                    file_data[group_key] = {group_key: _read_dataset(node)}
                else:
                    file_data[group_key] = _read_group(node)
            data[file] = file_data

    return data

# Root folder of the dataset; the train folder and the label CSV both live under it.
DATA_ROOT = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)'

# Load the training data (first 100 entries of the directory listing)
train_folder = DATA_ROOT + '\\train'
train_data = load_data(train_folder, start_index=0, num_files_to_process=100)

# Read the label table and show a preview only; printing the full table
# floods the cell output without adding information.
train_labels = DATA_ROOT + '\\train_labels.csv'
reading = pd.read_csv(train_labels)
print(f"Label table shape: {reading.shape}")
print(reading.head())

# Load the held-out data
# Note: this still reads from the train folder, taking the 50 entries that
# follow the 100 used for training (i.e. starting at the 101st file).
test_folder = train_folder
test_data = load_data(test_folder, start_index=100, num_files_to_process=50)

In [None]:
# Show the first file -> group -> subgroup -> dataset path of train_data,
# printing each key on the way down and the shape and values of the first array.
for file, file_entry in train_data.items():
    print(file)
    for group, group_entry in file_entry.items():
        print(group)
        for subgroup, subgroup_entry in group_entry.items():
            print(subgroup)
            if isinstance(subgroup_entry, dict):
                for dataset, array in subgroup_entry.items():
                    print(dataset)
                    print(array.shape)
                    print(array)
                    break
            else:
                # load_data stores datasets that sit directly under a group as
                # arrays, which have no .keys(); print them as the leaf.
                print(subgroup_entry.shape)
                print(subgroup_entry)
            break
        break
    break

In [None]:
import numpy as np
import h5py as h5
from scipy.signal import stft


def preprocess_data(data, hdf5_output_file):
    """Compute STFT magnitudes of every 'SFTs' array and append them to an HDF5 file.

    Parameters
    ----------
    data : dict
        Nested dict ``{file: {group: {subgroup: entry}}}`` as produced by
        ``load_data``. Only entries that are dicts containing an ``'SFTs'``
        key are processed.
    hdf5_output_file : str
        Path of the HDF5 file, opened in append mode. It is not created when
        there is nothing to write.

    Output layout
    -------------
    ``/<file>_<i>/<group>/SFTs`` where ``i`` is the 1-based position of the
    file in ``data``. Existing groups are reused and an existing ``SFTs``
    dataset is left untouched.

    NOTE(review): the dataset name does not include the subgroup, so when a
    group has several subgroups with 'SFTs' only the first one encountered is
    stored. The layout is kept because the cell that reads this file expects
    ``<group>/SFTs``; confirm whether dropping the others is intended.

    NOTE: a second function named ``preprocess_data`` (an FFT variant) is
    defined in a later cell and replaces this one once that cell runs.
    """
    num_files = len(data.keys())
    print("Length of data:", str(num_files))

    for i, file in enumerate(data.keys(), start=1):
        print(f"Processing file {i}/{num_files}...")

        # First compute everything that has to be written for this file, so
        # the output file is opened once per input file, not once per subgroup.
        pending = {}
        for group, subgroups in data[file].items():
            for subgroup, entry in subgroups.items():
                if not (isinstance(entry, dict) and 'SFTs' in entry):
                    continue
                if group in pending:
                    # Only one 'SFTs' dataset fits under a group (see the
                    # docstring), so a second transform would be discarded.
                    continue
                # Apply STFT and retain only its magnitude
                _, _, sfts_stft = stft(entry['SFTs'])
                pending[group] = np.abs(sfts_stft)

        if not pending:
            continue

        with h5.File(hdf5_output_file, 'a') as f:
            # require_group opens the group, creating it if it is missing
            file_group = f.require_group(f"{file}_{i}")
            for group, spectrogram in pending.items():
                group_subgroup = file_group.require_group(group)
                if 'SFTs' not in group_subgroup:
                    group_subgroup.create_dataset('SFTs', data=spectrogram)

# Preprocess the training data
preprocess_data(train_data, 'D:/Datasets/preprocessed_data/preprocessed_data.h5')

# Show the first file -> group -> subgroup -> dataset path of train_data.
# preprocess_data only reads train_data, so this prints the raw input
# structure, not the contents of the preprocessed file.
for file, file_entry in train_data.items():
    print(file)
    for group, group_entry in file_entry.items():
        print(group)
        for subgroup, subgroup_entry in group_entry.items():
            print(subgroup)
            if isinstance(subgroup_entry, dict):
                for dataset, array in subgroup_entry.items():
                    print(dataset)
                    print(array.shape)
                    print(array)
                    break
            else:
                # load_data stores datasets that sit directly under a group as
                # arrays, which have no .keys(); print them as the leaf.
                print(subgroup_entry.shape)
                print(subgroup_entry)
            break
        break
    break


In [None]:
import sys
# Install through the interpreter that runs this kernel (sys.executable), so
# the package ends up in the environment this notebook imports from.
# NOTE(review): the install is unpinned and repeats on every run of the cell;
# consider moving it to the environment setup.
!{sys.executable} -m pip install imblearn

In [3]:
import sys

def print_var_sizes(namespace=None, top=2):
    """Print the names and sizes of the largest variables in a namespace.

    Parameters
    ----------
    namespace : dict or None, default None
        Mapping of variable names to objects. ``None`` uses the globals of
        the module that defines this function (the notebook namespace when it
        is defined in a notebook cell).
    top : int, default 2
        Number of entries to print, largest first.

    Notes
    -----
    The previous version inspected ``locals()`` inside the function, which is
    empty at that point, so it never printed anything.
    Sizes come from ``sys.getsizeof`` and are therefore shallow: objects
    referenced by a container are not counted.
    """
    if namespace is None:
        namespace = globals()
    # Snapshot the items so the ranking is unaffected by later changes.
    sizes = [(name, sys.getsizeof(value)) for name, value in list(namespace.items())]
    sizes.sort(key=lambda item: item[1], reverse=True)
    for name, size in sizes[:top]:
        print("{:>30}: {:>8}".format(name, size))

print_var_sizes()

In [None]:
import h5py as h5

def print_structure(hdf5_file):
    """Print the group/dataset hierarchy of an HDF5 file, indented by depth.

    Each group is shown by its full path; each dataset by its key, one level
    deeper than the group that holds it.
    """

    def print_group(group, indent=""):
        # Group header: indent and path are passed as two print() arguments.
        print(indent, group.name)
        child_indent = indent + "  "
        for key in group:
            item = group[key]
            if isinstance(item, h5.Dataset):
                # Leaf: show the dataset key only
                print(child_indent, key)
            elif isinstance(item, h5.Group):
                # Branch: descend one level
                print_group(item, child_indent)

    with h5.File(hdf5_file, 'r') as f:
        print_group(f)

print_structure('D:/Datasets/preprocessed_data/preprocessed_data.h5')

In [None]:
# Print the shape and number of dimensions of every array in X.
# NOTE: X is not defined above this cell; it is built by the later cell that
# reads preprocessed_data.h5, so that cell has to run first.
for sft in X:
    print(sft.shape, sft.ndim)

In [3]:
# Drop the raw in-memory training dict; the following cells read the
# preprocessed HDF5 file instead. The cells above that iterate over
# train_data cannot be re-run after this without reloading it.
del train_data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np
import h5py as h5
import joblib
from scipy.ndimage import zoom

# Load the labels from the CSV file, indexed by 'id' to facilitate lookup
labels_df = pd.read_csv(
    'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train_labels.csv'
).set_index('id')

# Modify the labels: replace -1 with 1
# NOTE(review): this treats every -1 label as a positive; confirm that this is
# the intended meaning of -1 in train_labels.csv.
labels_df['target'] = labels_df['target'].replace(-1, 1)

# Load the preprocessed data from the HDF5 file.
# Layout read here: /<file group>/<observation id>/SFTs
X = []
y = []
with h5.File('D:/Datasets/preprocessed_data/preprocessed_data.h5', 'r') as f:
    for file in f.keys():
        file_group = f[file]
        for observation in file_group.keys():
            sfts = file_group[observation]['SFTs'][:]
            X.append(np.abs(sfts))  # Keep the full array shape, no flattening
            y.append(labels_df.loc[observation, 'target'])

if not X:
    raise ValueError("No 'SFTs' datasets found in the preprocessed HDF5 file")

# Find the maximum shape along each dimension
max_shape = np.max([sft.shape for sft in X], axis=0)

# Resize each array to the maximum shape; one zoom factor per axis, so this
# works for any number of dimensions as long as all arrays share it.
X = [
    zoom(sft, tuple(target / current for target, current in zip(max_shape, sft.shape)))
    for sft in X
]

X = np.array(X)
y = np.array(y)

# Split the data BEFORE fitting the scaler, so validation samples do not
# influence the normalisation statistics (no leakage into training).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42)

# Normalise each feature (position along the last axis) independently:
# statistics come from the training split only and are then applied to both.
n_features = X.shape[-1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)
X_val = scaler.transform(X_val.reshape(-1, n_features)).reshape(X_val.shape)

# Save the scaler
joblib.dump(scaler, "scaler.save")

# Save the training and validation splits to disk, one dataset per file
with h5.File('D:/Datasets/preprocessed_data/X_train.h5', 'w') as f:
    f.create_dataset('X_train', data=X_train)

with h5.File('D:/Datasets/preprocessed_data/X_val.h5', 'w') as f:
    f.create_dataset('X_val', data=X_val)

with h5.File('D:/Datasets/preprocessed_data/y_train.h5', 'w') as f:
    f.create_dataset('y_train', data=y_train)

with h5.File('D:/Datasets/preprocessed_data/y_val.h5', 'w') as f:
    f.create_dataset('y_val', data=y_val)

In [5]:
# Drop the in-memory splits; they were written to X_train.h5 / X_val.h5 /
# y_train.h5 / y_val.h5 above and the training cell streams them from disk.
del X_train, X_val, y_train, y_val

In [6]:
# Drop the full feature/label arrays and the scaler object; the scaler was
# saved to "scaler.save" and is loaded again by the inference cell.
del X, scaler, y

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
import h5py as h5
from sklearn.metrics import classification_report

# Build the 3-D convolutional classifier layer by layer.
# The spatial dimensions are left as None, so they are taken from the data.
model = Sequential()
model.add(Conv3D(32, (3, 3, 3), activation='relu', input_shape=(None, None, None, 1)))
model.add(MaxPooling3D((2, 2, 2)))
model.add(Conv3D(64, (3, 3, 3), activation='relu'))
model.add(MaxPooling3D((2, 2, 2)))
# Global pooling collapses the variable-size volume to one value per channel
model.add(tf.keras.layers.GlobalAveragePooling3D())
model.add(Dense(64, activation='relu'))
# Single sigmoid unit: binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile for binary classification with a small learning rate
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

def _single_dataset_name(h5_file, path):
    """Return the name of the only top-level entry of an open HDF5 file."""
    names = list(h5_file.keys())
    if len(names) != 1:
        raise ValueError(
            f"Expected exactly one dataset in {path}, found {names}; "
            "pass the dataset name explicitly"
        )
    return names[0]


def _count_samples(path):
    """Return the length of the first axis of the single dataset stored in ``path``."""
    with h5.File(path, 'r') as f:
        return f[_single_dataset_name(f, path)].shape[0]


def generate_batches(X_file, y_file, batch_size, X_key=None, y_key=None):
    """Yield ``(X_batch, y_batch)`` pairs from two HDF5 files, forever.

    Parameters
    ----------
    X_file, y_file : str
        Paths of the HDF5 files holding the features and the labels.
    batch_size : int
        Number of samples per batch. A trailing partial batch is dropped.
    X_key, y_key : str or None, default None
        Dataset names inside the files. ``None`` uses the only dataset in the
        file, so the same generator serves the train files ('X_train',
        'y_train') and the validation files ('X_val', 'y_val'). The names
        used to be hard-coded to the training ones, which raised KeyError on
        the validation files.

    Yields
    ------
    tuple
        ``X_batch`` with a trailing channel axis added, and ``y_batch``.

    Raises
    ------
    ValueError
        If the data holds fewer samples than one batch. Without this check
        the generator would loop forever without yielding.
    """
    while True:
        with h5.File(X_file, 'r') as X_f, h5.File(y_file, 'r') as y_f:
            X_dataset = X_f[X_key if X_key is not None else _single_dataset_name(X_f, X_file)]
            y_dataset = y_f[y_key if y_key is not None else _single_dataset_name(y_f, y_file)]
            num_batches = X_dataset.shape[0] // batch_size
            if num_batches == 0:
                raise ValueError(
                    f"{X_file} holds {X_dataset.shape[0]} samples, "
                    f"fewer than batch_size={batch_size}"
                )

            for i in range(num_batches):
                start = i * batch_size
                end = start + batch_size
                X_batch = X_dataset[start:end][..., np.newaxis]
                y_batch = y_dataset[start:end]
                yield X_batch, y_batch

batch_size = 32  # Set batch size

X_train_file = 'D:/Datasets/preprocessed_data/X_train.h5'
y_train_file = 'D:/Datasets/preprocessed_data/y_train.h5'
X_val_file = 'D:/Datasets/preprocessed_data/X_val.h5'
y_val_file = 'D:/Datasets/preprocessed_data/y_val.h5'

train_gen = generate_batches(X_train_file, y_train_file, batch_size)
val_gen = generate_batches(X_val_file, y_val_file, batch_size)

# Take the sample counts from the files instead of hard-coded placeholders,
# so one epoch is exactly one pass over the full batches on disk.
num_train_samples = _count_samples(X_train_file)
num_val_samples = _count_samples(X_val_file)
train_steps = num_train_samples // batch_size
val_steps = num_val_samples // batch_size
if train_steps == 0 or val_steps == 0:
    raise ValueError(
        f"Need at least batch_size={batch_size} samples per split, got "
        f"{num_train_samples} training and {num_val_samples} validation samples"
    )

# Train the model
history = model.fit(train_gen,
                    steps_per_epoch=train_steps,
                    validation_data=val_gen,
                    validation_steps=val_steps,
                    epochs=10)

# Evaluate on a fresh validation generator. val_gen has already been advanced
# during training, so reusing it would not start at the first sample.
val_loss, val_accuracy = model.evaluate(
    generate_batches(X_val_file, y_val_file, batch_size), steps=val_steps)
print("Validation Loss: ", val_loss)
print("Validation Accuracy: ", val_accuracy)

# Predict on another fresh generator so prediction k belongs to validation
# sample k. Only full batches are covered, so the last
# num_val_samples % batch_size samples are not predicted.
probabilities = model.predict(
    generate_batches(X_val_file, y_val_file, batch_size), steps=val_steps)

# Convert the probabilities into class predictions
predictions = (probabilities > 0.5).astype("int32")

# Load the true labels (small enough for memory) and compare them with the
# predictions for the same leading samples.
with h5.File(y_val_file, 'r') as f:
    y_val = f['y_val'][:]

print(classification_report(y_val[:len(predictions)], predictions))

Epoch 1/10


In [6]:
# Drop the trained Conv3D model.
# NOTE: the oversampling cell below calls model.fit / model.predict before
# any new model is created, so it cannot run after this cell; the LSTM cell
# further down defines its own `model`.
del model

In [None]:
from imblearn.over_sampling import RandomOverSampler

# NOTE(review): when the notebook is run top to bottom this cell fails.
# X_train, X_val and y_train were removed by the earlier
# `del X_train, X_val, y_train, y_val` cell, and `model` was removed by the
# `del model` cell just above. Only y_val is re-created (by the training
# cell, from y_val.h5). The cell appears to belong to an earlier version of
# the pipeline that kept the splits in memory - confirm before relying on it.

# Instantiate RandomOverSampler (fixed seed for reproducible resampling)
ros = RandomOverSampler(random_state=42)

# Apply RandomOverSampler to the data
# NOTE(review): presumably fit_resample needs a 2-D feature matrix, while the
# splits written to disk above keep their multi-dimensional shape - confirm.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Train the model using the resampled data
model.fit(X_train_ros, y_train_ros, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print("Validation Loss: ", val_loss)
print("Validation Accuracy: ", val_accuracy)

# Get the model's predictions on the validation data
probabilities = model.predict(X_val)

# Convert the probabilities into class predictions (threshold 0.5)
predictions = (probabilities > 0.5).astype("int32")

print(predictions)

print(classification_report(y_val, predictions))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# LSTM input is 3-D: [samples, timesteps, features]. Every sample becomes a
# sequence with a single timestep holding all of its features.
X_train_reshaped = X_train_ros.reshape(X_train_ros.shape[0], 1, X_train_ros.shape[1])
X_val_reshaped = X_val.reshape(X_val.shape[0], 1, X_val.shape[1])

# One LSTM layer followed by a sigmoid unit for binary classification
model = Sequential([
    LSTM(64, activation='relu', input_shape=X_train_reshaped.shape[1:]),
    Dense(1, activation='sigmoid'),
])

# Same optimiser settings as the convolutional model
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the oversampled training set, validating on the untouched split
history = model.fit(
    X_train_reshaped,
    y_train_ros,
    validation_data=(X_val_reshaped, y_val),
    epochs=10,
    batch_size=32,
)

# Validation metrics
val_loss, val_accuracy = model.evaluate(X_val_reshaped, y_val)
print("Validation Loss: ", val_loss)
print("Validation Accuracy: ", val_accuracy)

# Predicted probabilities, thresholded at 0.5 to get class labels
probabilities = model.predict(X_val_reshaped)
predictions = (probabilities > 0.5).astype("int32")

print(predictions)

print(classification_report(y_val, predictions))

In [None]:
# Class balance after oversampling: one [label, count] row per class
unique, counts = np.unique(y_train_ros, return_counts=True)
print(np.column_stack((unique, counts)))

# Class balance of the original training labels, in the same layout
unique, counts = np.unique(y_train, return_counts=True)
print(np.column_stack((unique, counts)))

In [None]:
# Show the current `predictions` array. Several cells above assign this name,
# so the content depends on which of them ran last.
print(predictions)

In [None]:
import os
import numpy as np
import h5py as h5
from sklearn.preprocessing import StandardScaler
import pandas as pd
import time


def preprocess_data(data, hdf5_output_file):
    """Store the real part of the FFT of every 'SFTs' array in an HDF5 file.

    Parameters
    ----------
    data : dict
        Nested dict ``{file: {group: {subgroup: entry}}}`` as produced by
        ``load_data``. Only entries that are dicts containing an ``'SFTs'``
        key are processed.
    hdf5_output_file : str
        Path of the HDF5 file, opened in append mode. It is not created when
        there is nothing to write.

    Output layout
    -------------
    ``/<file>_<i>/<group>/SFTs`` where ``i`` is the 1-based position of the
    file in ``data``. Existing groups are reused and an existing ``SFTs``
    dataset is left untouched.

    NOTE(review): the dataset name does not include the subgroup, so when a
    group has several subgroups with 'SFTs' only the first one encountered is
    stored. The layout is kept because the code that reads this file expects
    ``<group>/SFTs``; confirm whether dropping the others is intended.

    NOTE: this definition replaces the STFT-based ``preprocess_data`` from an
    earlier cell, and ``np.real`` discards the imaginary part of the
    transform.
    """
    for i, file in enumerate(data.keys(), start=1):
        # First compute everything that has to be written for this file, so
        # the output file is opened once per input file, not once per subgroup.
        pending = {}
        for group, subgroups in data[file].items():
            for subgroup, entry in subgroups.items():
                if not (isinstance(entry, dict) and 'SFTs' in entry):
                    continue
                if group in pending:
                    # Only one 'SFTs' dataset fits under a group (see the
                    # docstring), so a second transform would be discarded.
                    continue
                # Apply FFT along the last axis and keep the real part
                pending[group] = np.real(np.fft.fft(entry['SFTs']))

        if not pending:
            continue

        with h5.File(hdf5_output_file, 'a') as f:
            # require_group opens the group, creating it if it is missing
            file_group = f.require_group(f"{file}_{i}")
            for group, sfts_fft in pending.items():
                group_subgroup = file_group.require_group(group)
                if 'SFTs' not in group_subgroup:
                    group_subgroup.create_dataset('SFTs', data=sfts_fft)

# The folder that contains your test data
test_folder = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\test'

# The list of test files
test_files = os.listdir(test_folder)

# Scratch file that holds the preprocessed data of ONE test file at a time
hdf5_output_file = 'D:/Datasets/preprocessed_data/preprocessed_test_data.h5'

# Load the scaler that was fitted and saved by the training-data cell
scaler = joblib.load("scaler.save")

# Padding length saved from training; loaded once instead of once per file.
# NOTE(review): nothing in this notebook writes "max_len.save" - confirm where
# it comes from. Also confirm that the flattened, padded vectors built below
# match what `scaler` and `model` were fitted on.
saved_max_len = joblib.load("max_len.save")

# Collect one record per prediction and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
prediction_records = []

# Loop over each test file
for i, test_file in enumerate(test_files, start=1):
    file_start_time = time.time()
    print(f"Processing test file {i}/{len(test_files)}...")

    # Start from an empty scratch file, so leftovers from an earlier or
    # interrupted run are never mixed into this file's predictions.
    if os.path.exists(hdf5_output_file):
        os.remove(hdf5_output_file)

    # Load the test file
    test_data = load_data(test_folder, start_index=i-1, num_files_to_process=1)
    if not test_data:
        print(f"No data loaded for {test_file}; skipping")
        continue

    # Take the id from the file that was actually loaded, not from the
    # separate directory listing above, so ids always match the data.
    loaded_file = next(iter(test_data))
    file_id = os.path.splitext(loaded_file)[0]

    # Preprocess the test data
    preprocess_data(test_data, hdf5_output_file)
    if not os.path.exists(hdf5_output_file):
        # preprocess_data found no 'SFTs' arrays and wrote nothing
        print(f"No SFTs found in {loaded_file}; skipping")
        continue

    # Load the preprocessed test data as flat feature vectors
    X_test = []
    with h5.File(hdf5_output_file, 'r') as f:
        for file in f.keys():
            file_group = f[file]
            for observation in file_group.keys():
                sfts = file_group[observation]['SFTs'][:]
                X_test.append(np.real(sfts).flatten())

    if not X_test:
        print(f"No SFTs found in {loaded_file}; skipping")
        continue

    # Zero-pad every vector to a common length
    max_len = max(saved_max_len, max(len(sft) for sft in X_test))
    X_test = np.array([np.pad(sft, (0, max_len - len(sft))) for sft in X_test])

    # Normalize the data
    X_test = scaler.transform(X_test)

    # Make a prediction
    predictions = model.predict(X_test)

    binary_predictions = np.round(predictions).astype(int)

    print("Prediction for the file: " + str(binary_predictions))

    # Record the predictions as plain ints; iterating the 2-D array directly
    # would store one-element arrays, which end up as "[0]" in the CSV.
    for prediction in binary_predictions.ravel():
        prediction_records.append({'id': file_id, 'target': int(prediction)})

    # compute the elapsed time for this file
    file_end_time = time.time()
    file_elapsed_time = file_end_time - file_start_time

    # estimate the remaining time from the duration of the file just processed
    remaining_files = len(test_files) - i
    estimated_time_left = remaining_files * file_elapsed_time
    print(f"Estimated time left: {estimated_time_left} seconds")

# Remove the scratch file. Deleting groups inside an HDF5 file does not
# shrink it, so the file itself is removed to free the disk space.
if os.path.exists(hdf5_output_file):
    os.remove(hdf5_output_file)

# Write the predictions to a CSV file
predictions_df = pd.DataFrame(prediction_records, columns=['id', 'target'])
predictions_df.to_csv("predictions.csv", index=False)