In [1]:
import os
import h5py as h5
import numpy as np
import pandas as pd

def load_data(folder, start_index=0, num_files_to_process=None):
    # List of names of all files in the folder
    files = os.listdir(folder)
    if num_files_to_process is not None:
        files = files[start_index:start_index+num_files_to_process]
    num_files = len(files)
    print(f"Number of files in the folder: {num_files}")

    # Dictionary to store the data
    data = {}

    # Loop over all files
    for i, file in enumerate(files, start=1):
        print(f"Processing file {i}/{num_files}...")
        # Open the file
        with h5.File(os.path.join(folder, file), 'r') as f:
            # Dictionary to store the data for this file
            file_data = {}

            # Loop over all groups in the file
            for group_key in f.keys():
                group = f[group_key]

                # Dictionary to store the data for this group
                group_data = {}

                # Check if the group is a dataset or another group
                if isinstance(group, h5.Dataset):
                    # Read the dataset into a numpy array
                    array = np.empty(group.shape, dtype=group.dtype)
                    group.read_direct(array)

                    # Store the array in the group data dictionary
                    group_data[group_key] = array
                else:
                    # Loop over all subgroups/datasets in the group
                    for subgroup_key in group.keys():
                        subgroup = group[subgroup_key]

                        # Check if the subgroup is a dataset or another group
                        if isinstance(subgroup, h5.Dataset):
                            # Read the dataset into a numpy array
                            array = np.empty(subgroup.shape, dtype=subgroup.dtype)
                            subgroup.read_direct(array)

                            # Store the array in the group data dictionary
                            group_data[subgroup_key] = array
                        else:
                            # Dictionary to store the data for this subgroup
                            subgroup_data = {}

                            # Loop over all datasets in the subgroup
                            for dataset_key in subgroup.keys():
                                dataset = subgroup[dataset_key]

                                # Read the dataset into a numpy array
                                array = np.empty(dataset.shape, dtype=dataset.dtype)
                                dataset.read_direct(array)

                                # Store the array in the subgroup data dictionary
                                subgroup_data[dataset_key] = array

                            # Store the subgroup data in the group data dictionary
                            group_data[subgroup_key] = subgroup_data

                # Store the group data in the file data dictionary
                file_data[group_key] = group_data

            # Store the file data in the data dictionary
            data[file] = file_data

    return data

# Load the training data
train_folder = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train'
train_data = load_data(train_folder, start_index=0, num_files_to_process=600)

train_labels = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train_labels.csv'
reading = pd.read_csv(train_labels)
print(reading.to_string())

# Load the test data
test_folder = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train'  # Note: We're still using the train folder
test_data = load_data(test_folder, start_index=100, num_files_to_process=50)  # Start from the 101st file

Number of files in the folder: 600
Processing file 1/600...
Processing file 2/600...
Processing file 3/600...
Processing file 4/600...
Processing file 5/600...
Processing file 6/600...
Processing file 7/600...
Processing file 8/600...
Processing file 9/600...
Processing file 10/600...
Processing file 11/600...
Processing file 12/600...
Processing file 13/600...
Processing file 14/600...
Processing file 15/600...
Processing file 16/600...
Processing file 17/600...
Processing file 18/600...
Processing file 19/600...
Processing file 20/600...
Processing file 21/600...
Processing file 22/600...
Processing file 23/600...
Processing file 24/600...
Processing file 25/600...
Processing file 26/600...
Processing file 27/600...
Processing file 28/600...
Processing file 29/600...
Processing file 30/600...
Processing file 31/600...
Processing file 32/600...
Processing file 33/600...
Processing file 34/600...
Processing file 35/600...
Processing file 36/600...
Processing file 37/600...
Processing f

Processing file 308/600...
Processing file 309/600...
Processing file 310/600...
Processing file 311/600...
Processing file 312/600...
Processing file 313/600...
Processing file 314/600...
Processing file 315/600...
Processing file 316/600...
Processing file 317/600...
Processing file 318/600...
Processing file 319/600...
Processing file 320/600...
Processing file 321/600...
Processing file 322/600...
Processing file 323/600...
Processing file 324/600...
Processing file 325/600...
Processing file 326/600...
Processing file 327/600...
Processing file 328/600...
Processing file 329/600...
Processing file 330/600...
Processing file 331/600...
Processing file 332/600...
Processing file 333/600...
Processing file 334/600...
Processing file 335/600...
Processing file 336/600...
Processing file 337/600...
Processing file 338/600...
Processing file 339/600...
Processing file 340/600...
Processing file 341/600...
Processing file 342/600...
Processing file 343/600...
Processing file 344/600...
P

Processing file 2/50...
Processing file 3/50...
Processing file 4/50...
Processing file 5/50...
Processing file 6/50...
Processing file 7/50...
Processing file 8/50...
Processing file 9/50...
Processing file 10/50...
Processing file 11/50...
Processing file 12/50...
Processing file 13/50...
Processing file 14/50...
Processing file 15/50...
Processing file 16/50...
Processing file 17/50...
Processing file 18/50...
Processing file 19/50...
Processing file 20/50...
Processing file 21/50...
Processing file 22/50...
Processing file 23/50...
Processing file 24/50...
Processing file 25/50...
Processing file 26/50...
Processing file 27/50...
Processing file 28/50...
Processing file 29/50...
Processing file 30/50...
Processing file 31/50...
Processing file 32/50...
Processing file 33/50...
Processing file 34/50...
Processing file 35/50...
Processing file 36/50...
Processing file 37/50...
Processing file 38/50...
Processing file 39/50...
Processing file 40/50...
Processing file 41/50...
Processi

In [2]:
for file in train_data.keys():
    print(file)
    for group in train_data[file].keys():
        print(group)
        for subgroup in train_data[file][group].keys():
            print(subgroup)
            for dataset in train_data[file][group][subgroup].keys():
                print(dataset)
                print(train_data[file][group][subgroup][dataset].shape)
                print(train_data[file][group][subgroup][dataset])
                break
            break
        break
    break

001121a05.hdf5
001121a05
H1
SFTs
(360, 4612)
[[-2.01780108e-24+1.70660666e-22j -1.46458433e-23+9.64213830e-23j
   1.39103851e-23-4.39236616e-23j ... -1.71771812e-23+1.28372925e-22j
   1.38921069e-22-8.29776722e-23j  7.27295802e-23+4.45595750e-23j]
 [-1.87113869e-22+1.00735020e-22j -1.51093962e-22+9.36745661e-24j
   5.01929822e-23-5.20982517e-23j ... -5.97221222e-23-1.09045934e-22j
   8.07383990e-23+1.61488764e-22j  1.47601404e-22+1.07951260e-22j]
 [-1.90160601e-22+1.52689834e-22j  1.05964946e-22+6.50407170e-23j
   1.29634017e-22+5.38747570e-23j ... -1.53258975e-22-5.96231233e-23j
  -6.09477406e-23+2.81030404e-23j -3.27759715e-23-6.07620238e-23j]
 ...
 [-8.01955113e-23+5.76858513e-23j  2.17070893e-22+1.22597858e-22j
  -1.94681547e-23-5.32914921e-23j ... -1.04928606e-23+3.23708883e-23j
   1.36459952e-23+4.76253567e-23j -5.76766263e-24-1.29746023e-22j]
 [ 1.49911681e-22-7.36853073e-23j  1.17350406e-22+1.99666867e-22j
  -1.16330023e-22-1.47523521e-23j ... -4.64647940e-23+3.71537551e-23j
  

In [2]:
import numpy as np
import h5py as h5
from scipy.signal import stft

def preprocess_data(data, hdf5_output_file):
    num_files = len(data.keys())
    print("Length of data:", str(num_files))
    # Loop over all files
    for i, file in enumerate(data.keys(), start=1):
        print(f"Processing file {i}/{num_files}...")
        # Loop over all groups in the file
        for group in data[file].keys():
            # Loop over all subgroups in the group
            for subgroup in data[file][group].keys():
                if isinstance(data[file][group][subgroup], dict) and 'SFTs' in data[file][group][subgroup]:
                    # Get the SFTs
                    sfts = data[file][group][subgroup]['SFTs']

                    # Apply STFT
                    _, _, sfts_stft = stft(sfts)
                    sfts_stft = np.abs(sfts_stft)  # Retain the magnitude of the STFT

                    # Open the HDF5 file
                    with h5.File(hdf5_output_file, 'a') as f:
                        # Check if a group for this file already exists in the HDF5 file
                        # If not, create it
                        file_group_name = f"{file}_{i}"
                        if file_group_name in f:
                            file_group = f[file_group_name]
                        else:
                            file_group = f.create_group(file_group_name)

                        # Check if a subgroup for this group already exists in the file group
                        # If not, create it
                        if group in file_group:
                            group_subgroup = file_group[group]
                        else:
                            group_subgroup = file_group.create_group(group)

                        # Check if a dataset for the STFT of SFTs already exists in the subgroup
                        # If not, create it
                        if 'SFTs' in group_subgroup:
                            sfts_dataset = group_subgroup['SFTs']
                        else:
                            sfts_dataset = group_subgroup.create_dataset('SFTs', data=sfts_stft)

# Preprocess the training data
preprocess_data(train_data, 'D:/Datasets/preprocessed_data/preprocessed_data.h5')

for file in train_data.keys():
    print(file)
    for group in train_data[file].keys():
        print(group)
        for subgroup in train_data[file][group].keys():
            print(subgroup)
            for dataset in train_data[file][group][subgroup].keys():
                print(dataset)
                print(train_data[file][group][subgroup][dataset].shape)
                print(train_data[file][group][subgroup][dataset])
                break
            break
        break
    break


Length of data: 600
Processing file 1/600...




Processing file 2/600...
Processing file 3/600...
Processing file 4/600...
Processing file 5/600...
Processing file 6/600...
Processing file 7/600...
Processing file 8/600...
Processing file 9/600...
Processing file 10/600...
Processing file 11/600...
Processing file 12/600...
Processing file 13/600...
Processing file 14/600...
Processing file 15/600...
Processing file 16/600...
Processing file 17/600...
Processing file 18/600...
Processing file 19/600...
Processing file 20/600...
Processing file 21/600...
Processing file 22/600...
Processing file 23/600...
Processing file 24/600...
Processing file 25/600...
Processing file 26/600...
Processing file 27/600...
Processing file 28/600...
Processing file 29/600...
Processing file 30/600...
Processing file 31/600...
Processing file 32/600...
Processing file 33/600...
Processing file 34/600...
Processing file 35/600...
Processing file 36/600...
Processing file 37/600...
Processing file 38/600...
Processing file 39/600...
Processing file 40/6

Processing file 311/600...
Processing file 312/600...
Processing file 313/600...
Processing file 314/600...
Processing file 315/600...
Processing file 316/600...
Processing file 317/600...
Processing file 318/600...
Processing file 319/600...
Processing file 320/600...
Processing file 321/600...
Processing file 322/600...
Processing file 323/600...
Processing file 324/600...
Processing file 325/600...
Processing file 326/600...
Processing file 327/600...
Processing file 328/600...
Processing file 329/600...
Processing file 330/600...
Processing file 331/600...
Processing file 332/600...
Processing file 333/600...
Processing file 334/600...
Processing file 335/600...
Processing file 336/600...
Processing file 337/600...
Processing file 338/600...
Processing file 339/600...
Processing file 340/600...
Processing file 341/600...
Processing file 342/600...
Processing file 343/600...
Processing file 344/600...
Processing file 345/600...
Processing file 346/600...
Processing file 347/600...
P

In [7]:
import sys
!{sys.executable} -m pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
     -------------------------------------- 235.6/235.6 kB 2.9 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import sys

def print_var_sizes():
    for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                             key= lambda x: -x[1])[:2]:
        print("{:>30}: {:>8}".format(name, size))

print_var_sizes()

In [4]:
import h5py as h5

def print_structure(hdf5_file):
    def print_group(group, indent=""):
        print(indent, group.name)
        for key in group.keys():
            item = group[key]
            if isinstance(item, h5.Dataset):  # Dataset
                print(indent + "  ", key)
            elif isinstance(item, h5.Group):  # Group
                print_group(item, indent + "  ")
    with h5.File(hdf5_file, 'r') as f:
        print_group(f)

print_structure('D:/Datasets/preprocessed_data/preprocessed_data.h5')

 /
   /001121a05.hdf5_1
     /001121a05.hdf5_1/001121a05
       SFTs
   /004f23b2d.hdf5_2
     /004f23b2d.hdf5_2/004f23b2d
       SFTs
   /00a6db666.hdf5_3
     /00a6db666.hdf5_3/00a6db666
       SFTs
   /00f36a6ac.hdf5_4
     /00f36a6ac.hdf5_4/00f36a6ac
       SFTs
   /010a387db.hdf5_5
     /010a387db.hdf5_5/010a387db
       SFTs
   /0197bacf8.hdf5_6
     /0197bacf8.hdf5_6/0197bacf8
       SFTs
   /01b8b67f3.hdf5_7
     /01b8b67f3.hdf5_7/01b8b67f3
       SFTs
   /01bcf6533.hdf5_8
     /01bcf6533.hdf5_8/01bcf6533
       SFTs
   /01dba9731.hdf5_9
     /01dba9731.hdf5_9/01dba9731
       SFTs
   /021248995.hdf5_10
     /021248995.hdf5_10/021248995
       SFTs
   /02887d232.hdf5_11
     /02887d232.hdf5_11/02887d232
       SFTs
   /029ed046c.hdf5_12
     /029ed046c.hdf5_12/029ed046c
       SFTs
   /02c478b09.hdf5_13
     /02c478b09.hdf5_13/02c478b09
       SFTs
   /02c8f43f3.hdf5_14
     /02c8f43f3.hdf5_14/02c8f43f3
       SFTs
   /03189bb3d.hdf5_15
     /03189bb3d.hdf5_15/03189bb3d
       

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np
import h5py as h5
import joblib
from scipy.ndimage import zoom

# Load the labels from the CSV file
labels_df = pd.read_csv('D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train_labels.csv')
labels_df.set_index('id', inplace=True)  # set 'id' as the index to facilitate lookup

# Modify the labels: replace -1 with 1
labels_df['target'] = labels_df['target'].replace(-1, 1)

# Load the preprocessed data from the HDF5 file
with h5.File('D:/Datasets/preprocessed_data/preprocessed_data.h5', 'r') as f:
    # Keep each SFT data as a 2D array
    X = []
    y = []

    for file in f.keys():
        file_group = f[file]
        for observation in file_group.keys():
            obs_group = file_group[observation]
            sfts = obs_group['SFTs'][:]
            X.append(np.abs(sfts))  # Keep the STFT as 2D array, no flattening
            y.append(labels_df.loc[observation, 'target'])

    # Find the maximum shape along each dimension
    max_shape = np.max([sft.shape for sft in X], axis=0)

    # Resize each SFT to the maximum shape
    X = [zoom(sft, (max_shape[0] / sft.shape[0], max_shape[1] / sft.shape[1])) for sft in X]

    X = np.array(X)
    y = np.array(y)

# Here we should apply normalization across each feature independently, not across all values
scaler = StandardScaler()
X = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

# Save the scaler
joblib.dump(scaler, "scaler.save")

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42)

In [3]:
del train_data

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# Define the model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.00001), 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Add an extra dimension to the data for the channel
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print("Validation Loss: ", val_loss)
print("Validation Accuracy: ", val_accuracy)

# Get the model's predictions on the validation data
probabilities = model.predict(X_val)

# Convert the probabilities into class predictions
predictions = (probabilities > 0.5).astype("int32")

print(predictions)

print(classification_report(y_val, predictions))

Epoch 1/10


KeyboardInterrupt: 

In [7]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Apply RandomOverSampler to the data
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Train the model using the resampled data
model.fit(X_train_ros, y_train_ros, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print("Validation Loss: ", val_loss)
print("Validation Accuracy: ", val_accuracy)

# Get the model's predictions on the validation data
probabilities = model.predict(X_val)

# Convert the probabilities into class predictions
predictions = (probabilities > 0.5).astype("int32")

print(predictions)

print(classification_report(y_val, predictions))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Loss:  1.3089741468429565
Validation Accuracy:  0.6000000238418579
[[1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Reshape the data to 3D for LSTM  [samples, timesteps, features]
X_train_reshaped = X_train_ros.reshape((X_train_ros.shape[0], 1, X_train_ros.shape[1]))
X_val_reshaped = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))

# Define the LSTM model
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.00001), 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train_ros, validation_data=(X_val_reshaped, y_val), epochs=10, batch_size=32)

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val_reshaped, y_val)
print("Validation Loss: ", val_loss)
print("Validation Accuracy: ", val_accuracy)

# Get the model's predictions on the validation data
probabilities = model.predict(X_val_reshaped)

# Convert the probabilities into class predictions
predictions = (probabilities > 0.5).astype("int32")

print(predictions)

print(classification_report(y_val, predictions))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


KeyboardInterrupt: 

In [8]:
unique, counts = np.unique(y_train_ros, return_counts=True)
print(np.asarray((unique, counts)).T)

unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)).T)

[[ -1 109]
 [  0 109]
 [  1 109]]
[[ -1   1]
 [  0  50]
 [  1 109]]


In [12]:
print(predictions)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]


In [8]:
import os
import numpy as np
import h5py as h5
from sklearn.preprocessing import StandardScaler
import pandas as pd
import time


def preprocess_data(data, hdf5_output_file):
    num_files = len(data.keys())
    # Loop over all files
    for i, file in enumerate(data.keys(), start=1):
        # Loop over all groups in the file
        for group in data[file].keys():
            # Loop over all subgroups in the group
            for subgroup in data[file][group].keys():
                if isinstance(data[file][group][subgroup], dict) and 'SFTs' in data[file][group][subgroup]:
                    # Get the SFTs
                    sfts = data[file][group][subgroup]['SFTs']

                    # Apply FFT
                    sfts_fft = np.real(np.fft.fft(sfts))

                    # Open the HDF5 file
                    with h5.File(hdf5_output_file, 'a') as f:
                        # Check if a group for this file already exists in the HDF5 file
                        # If not, create it
                        file_group_name = f"{file}_{i}"
                        if file_group_name in f:
                            file_group = f[file_group_name]
                        else:
                            file_group = f.create_group(file_group_name)

                        # Check if a subgroup for this group already exists in the file group
                        # If not, create it
                        if group in file_group:
                            group_subgroup = file_group[group]
                        else:
                            group_subgroup = file_group.create_group(group)

                        # Check if a dataset for the FFT of SFTs already exists in the subgroup
                        # If not, create it
                        if 'SFTs' in group_subgroup:
                            sfts_dataset = group_subgroup['SFTs']
                        else:
                            sfts_dataset = group_subgroup.create_dataset('SFTs', data=sfts_fft)

# The folder that contains your test data
test_folder = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\test'

# The list of test files
test_files = os.listdir(test_folder)

# Define the output path for preprocessed test data
hdf5_output_file = 'D:/Datasets/preprocessed_data/preprocessed_test_data.h5'

# Initialize a StandardScaler instance for data normalization
scaler = joblib.load("scaler.save")

# Create an empty DataFrame to store the predictions
predictions_df = pd.DataFrame(columns=['id', 'target'])

# Loop over each test file
for i, test_file in enumerate(test_files, start=1):
    file_start_time = time.time()
    print(f"Processing test file {i}/{len(test_files)}...")
    
    # Load the test file
    test_data = load_data(test_folder, start_index=i-1, num_files_to_process=1)
    
    # Preprocess the test data
    preprocess_data(test_data, hdf5_output_file)

    # Load the preprocessed test data
    with h5.File(hdf5_output_file, 'r') as f:
        X_test = []
        for file in f.keys():
            file_group = f[file]
            for observation in file_group.keys():
                obs_group = file_group[observation]
                sfts = obs_group['SFTs'][:]
                flattened_sfts = np.real(sfts).flatten()
                X_test.append(flattened_sfts)
        
        max_len = joblib.load("max_len.save")
        max_len = max(max_len, max(len(sft) for sft in X_test))
        X_test = [np.pad(sft, (0, max_len - len(sft))) for sft in X_test]
        X_test = np.array(X_test)
        
        # Normalize the data
        X_test = scaler.transform(X_test)
        
        # Make a prediction
        predictions = model.predict(X_test)
        
        binary_predictions = np.round(predictions).astype(int)
        
        print("Prediction for the file: " + str(binary_predictions))
        
        # Append the predictions to the DataFrame
        for j, prediction in enumerate(binary_predictions):
            # Remove the extension from the test file name
            file_id = os.path.splitext(test_file)[0]
            predictions_df = predictions_df.append({'id': file_id, 'target': prediction}, ignore_index=True)
        
    # Delete the current test file from the preprocessed_test_data.h5 to save disk space
    with h5.File(hdf5_output_file, 'a') as f:
        del f[list(f.keys())[0]]  # delete the first key, which is the current test file
        
    # compute the elapsed time for this file
    file_end_time = time.time()
    file_elapsed_time = file_end_time - file_start_time

    # estimate the remaining time
    remaining_files = len(test_files) - i
    estimated_time_left = remaining_files * file_elapsed_time
    print(f"Estimated time left: {estimated_time_left} seconds")

# Write the DataFrame to a CSV file
predictions_df.to_csv("predictions.csv", index=False)

Processing test file 1/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 7595.1559882164 seconds
Processing test file 2/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 3442.3337168693542 seconds
Processing test file 3/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 4370.591660499573 seconds
Processing test file 4/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 4087.7927870750427 seconds
Processing test file 5/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 3172.88850069046 seconds
Processing test file 6/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 3559.5763700008392 seconds
Processing

Prediction for the file: [[1]]
Estimated time left: 2824.655826330185 seconds
Processing test file 77/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 2710.2449893951416 seconds
Processing test file 78/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 2737.601462364197 seconds
Processing test file 79/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 2639.42227935791 seconds
Processing test file 80/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 2731.5981256961823 seconds
Processing test file 81/7975...
Number of files in the folder: 1
Processing file 1/1...
Prediction for the file: [[1]]
Estimated time left: 3386.980679512024 seconds
Processing test file 82/7975...
Number of files in the folder: 1
Processing file 1/1...
Pre

KeyboardInterrupt: 