In [1]:
import h5py as h5
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

train_folder = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train'
test_folder = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\test'

# List of names of all files in the folder
train_files = os.listdir(train_folder)
test_files = os.listdir(test_folder)

In [2]:
# Get train_labels and associate them with appropriate files

train_labels = 'D:\\Datasets\\g2net-detecting-continuous-gravitational-waves (1)\\train_labels.csv'

# Create a dictionary with train_files values as keys and train_labels values as values
train_labels_dict = {}
with open(train_labels, 'r') as f:
    for line in f:
        (key, val) = line.split(',')
        train_labels_dict[key] = val

# Remove "\n" from the end of the values
for key in train_labels_dict:
    train_labels_dict[key] = train_labels_dict[key].rstrip()

In [3]:
# Get contents of first train file
with h5.File(train_folder + '\\' + train_files[0], 'r') as f:
    for file_key in f.keys():
        group = f[file_key]
        print(group)
        try:
            for group_key in group.keys():
                group2 = group[group_key]
                print(f"---->{group2}")
                for group_key2 in group2.keys():
                    print(f"--------->{group2[group_key2]}")
        except AttributeError:
            pass

<HDF5 group "/001121a05" (3 members)>
----><HDF5 group "/001121a05/H1" (2 members)>
---------><HDF5 dataset "SFTs": shape (360, 4612), type "<c8">
---------><HDF5 dataset "timestamps_GPS": shape (4612,), type "<i8">
----><HDF5 group "/001121a05/L1" (2 members)>
---------><HDF5 dataset "SFTs": shape (360, 4653), type "<c8">
---------><HDF5 dataset "timestamps_GPS": shape (4653,), type "<i8">
----><HDF5 dataset "frequency_Hz": shape (360,), type "<f8">


In [4]:
def traverse_hdf5(hdf5_file, file_name):
    data = []
    # Open the HDF5 file
    with h5.File(hdf5_file, 'r') as f:
        # Recursively traverse through the groups in the HDF5 file
        def traverse(name, path):
            # Get the object at the current path
            obj = f[name]
            # If the object is a group, traverse through it
            if isinstance(obj, h5.Group):
                for key in obj.keys():
                    traverse(f"{name}/{key}", f"{path}/{key}")
            # If the object is a dataset, store the data and the dataset name in the list
            elif isinstance(obj, h5.Dataset):
                data.append((obj[()], path, file_name))
        traverse("/", "/")
    # Create a Pandas dataframe from the data
    df = pd.DataFrame(data, columns=["data", "name", "file"])

    # If the end of the column of a name is "SFTs", replace the value within the data column with the average of the values
    for i in range(len(df)):
        if df['name'][i][-4:] == 'SFTs':
            df['data'][i] = np.mean(df['data'][i])

    # If the end of the column of a name is "frequency_Hz", replace the value within the data column with the average of
    # the values
    for i in range(len(df)):
        if df['name'][i][-12:] == 'frequency_Hz':
            df['data'][i] = np.mean(df['data'][i])

    # Remove the columns with names that end with "timestamps_GPS"
    df = df[~df['name'].str.endswith('timestamps_GPS')]

    # Transpose the dataframe, with eventually columns of "Filename", "H1-SFTs", "L1-SFTs", "Frequency_Hz"
    df = df.pivot(index='file', columns='name', values='data').reset_index()

    # Rename the "Filename" column to "id"
    df.rename(columns={'file': 'Id'}, inplace=True)

    # Filename is the name of the file without the extension
    Filename = file_name[:-5]

    # Rename the "H1-SFTs" column to "H1"
    df.rename(columns={'//' + Filename + '/H1/SFTs': 'Avg H1-SFTs'}, inplace=True)
    df.rename(columns={'//' + Filename + '/L1/SFTs': 'Avg L1-SFTs'}, inplace=True)
    df.rename(columns={'//' + Filename + '/frequency_Hz': 'Avg Frequency_Hz'}, inplace=True)

    return df

In [5]:
complete_df = pd.DataFrame(columns=["Id", "Avg H1-SFTs", "Avg L1-SFTs", "Avg Frequency_Hz"])

# Execute the traverse_hdf5 function on all files in the train folder and replace complete_df with complete_df + df
for file in train_files:
    df = traverse_hdf5(train_folder + '\\' + file, file)
    complete_df = complete_df.append(df)

In [6]:
complete_df

Unnamed: 0,Id,Avg H1-SFTs,Avg L1-SFTs,Avg Frequency_Hz
0,001121a05.hdf5,(-2.7874754e-26-5.2556594e-26j),(-7.563195e-26+1.6791384e-26j),53.6075
0,004f23b2d.hdf5,(5.7957826e-26-7.850618e-26j),(-6.577777e-26-9.961035e-27j),329.119722
0,00a6db666.hdf5,(1.7794107e-25-5.2310586e-29j),(3.6714905e-26+1.6741528e-26j),213.288056
0,00f36a6ac.hdf5,(2.7037856e-26+9.426087e-27j),(-9.8693624e-26-3.6599078e-26j),453.359167
0,010a387db.hdf5,(-7.8788974e-26-8.984369e-27j),(6.442983e-26-2.6589115e-26j),231.983611
...,...,...,...,...
0,fe38dbe64.hdf5,(1.9862872e-27-4.079406e-27j),(2.9517718e-26-4.4535038e-26j),373.019722
0,feafd0d16.hdf5,(-3.9736005e-26-4.9526113e-27j),(5.1489706e-26-6.1739946e-26j),481.319722
0,feeca844e.hdf5,(-3.3825502e-26-9.813063e-26j),(4.5678313e-27-6.1452293e-28j),369.324722
0,ff5ad023f.hdf5,(5.578002e-26+4.128473e-27j),(-6.8264504e-26+4.6519235e-26j),293.868056


In [7]:
# Function to convert scientific notation to float
def convert_to_float(x):
    return float(x)

# Convert the values in the "Avg H1-SFTs" column to float
complete_df['Avg H1-SFTs'] = complete_df['Avg H1-SFTs'].apply(convert_to_float)

complete_df

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Id,Avg H1-SFTs,Avg L1-SFTs,Avg Frequency_Hz
0,001121a05.hdf5,-2.787475e-26,(-7.563195e-26+1.6791384e-26j),53.6075
0,004f23b2d.hdf5,5.795783e-26,(-6.577777e-26-9.961035e-27j),329.119722
0,00a6db666.hdf5,1.779411e-25,(3.6714905e-26+1.6741528e-26j),213.288056
0,00f36a6ac.hdf5,2.703786e-26,(-9.8693624e-26-3.6599078e-26j),453.359167
0,010a387db.hdf5,-7.878897e-26,(6.442983e-26-2.6589115e-26j),231.983611
...,...,...,...,...
0,fe38dbe64.hdf5,1.986287e-27,(2.9517718e-26-4.4535038e-26j),373.019722
0,feafd0d16.hdf5,-3.973601e-26,(5.1489706e-26-6.1739946e-26j),481.319722
0,feeca844e.hdf5,-3.382550e-26,(4.5678313e-27-6.1452293e-28j),369.324722
0,ff5ad023f.hdf5,5.578002e-26,(-6.8264504e-26+4.6519235e-26j),293.868056


In [8]:
# Convert the values in the "Avg L1-SFTs" column to float
complete_df['Avg L1-SFTs'] = complete_df['Avg L1-SFTs'].apply(convert_to_float)

complete_df

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Id,Avg H1-SFTs,Avg L1-SFTs,Avg Frequency_Hz
0,001121a05.hdf5,-2.787475e-26,-7.563195e-26,53.6075
0,004f23b2d.hdf5,5.795783e-26,-6.577777e-26,329.119722
0,00a6db666.hdf5,1.779411e-25,3.671491e-26,213.288056
0,00f36a6ac.hdf5,2.703786e-26,-9.869362e-26,453.359167
0,010a387db.hdf5,-7.878897e-26,6.442983e-26,231.983611
...,...,...,...,...
0,fe38dbe64.hdf5,1.986287e-27,2.951772e-26,373.019722
0,feafd0d16.hdf5,-3.973601e-26,5.148971e-26,481.319722
0,feeca844e.hdf5,-3.382550e-26,4.567831e-27,369.324722
0,ff5ad023f.hdf5,5.578002e-26,-6.826450e-26,293.868056


In [9]:
# Remove ".hdf5" from the end of the Id column for each row
complete_df['Id'] = complete_df['Id'].str[:-5]

labels_df = pd.read_csv(train_labels)

labels_df = labels_df.rename(columns={"id": "Id"})

complete_df = pd.merge(complete_df, labels_df, on="Id", how="inner")

In [10]:
complete_df

Unnamed: 0,Id,Avg H1-SFTs,Avg L1-SFTs,Avg Frequency_Hz,target
0,001121a05,-2.787475e-26,-7.563195e-26,53.6075,1
1,004f23b2d,5.795783e-26,-6.577777e-26,329.119722,1
2,00a6db666,1.779411e-25,3.671491e-26,213.288056,1
3,00f36a6ac,2.703786e-26,-9.869362e-26,453.359167,1
4,010a387db,-7.878897e-26,6.442983e-26,231.983611,1
...,...,...,...,...,...
598,fe38dbe64,1.986287e-27,2.951772e-26,373.019722,1
599,feafd0d16,-3.973601e-26,5.148971e-26,481.319722,1
600,feeca844e,-3.382550e-26,4.567831e-27,369.324722,1
601,ff5ad023f,5.578002e-26,-6.826450e-26,293.868056,1


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Separate the independent variables (predictor variables) from the dependent variable (response variable)
X = complete_df.drop(columns=["Id", "target"])
y = complete_df["target"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.24, random_state=42)

# Create an instance of the LogisticRegression model
model = LogisticRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_test, y_pred)


print("Accuracy:", accuracy)


Accuracy: 0.7172413793103448


In [12]:
complete_df_test = pd.DataFrame(columns=["Id", "Avg H1-SFTs", "Avg L1-SFTs", "Avg Frequency_Hz"])

# Execute the traverse_hdf5 function on all files in the train folder and replace complete_df with complete_df + df
for file in test_files:
    df = traverse_hdf5(test_folder + '\\' + file, file)
    complete_df_test = complete_df_test.append(df)
    
# Function to convert scientific notation to float
def convert_to_float(x):
    return float(x)


# Remove ".hdf5" from the end of the Id column for each row
complete_df_test['Id'] = complete_df_test['Id'].str[:-5]

# Convert the values in the "Avg H1-SFTs" column to float
complete_df_test['Avg H1-SFTs'] = complete_df_test['Avg H1-SFTs'].apply(convert_to_float)

# Convert the values in the "Avg L1-SFTs" column to float
complete_df_test['Avg L1-SFTs'] = complete_df_test['Avg L1-SFTs'].apply(convert_to_float)


print(complete_df_test.head())

          Id   Avg H1-SFTs   Avg L1-SFTs Avg Frequency_Hz
0  00054c878 -2.864107e-26  1.970546e-26       306.501944
0  0007285a3 -2.269763e-26  1.093008e-25       126.448611
0  00076c5a6 -3.871007e-26 -3.364705e-26       197.389167
0  001349290  6.066534e-26  6.196433e-26        95.008056
0  001a52e92  8.669040e-26  8.671533e-27       464.176944


  # Remove the CWD from sys.path while we load stuff.


In [14]:
X_test = complete_df_test.drop(columns=["Id"])

# Make predictions on the test data
y_pred_test = model.predict(X_test)

y_pred_test

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [15]:
# Create csv file with first column being test_files without the .hdf5 extension and second column being the predictions
submission = pd.DataFrame({'Id': test_files, 'target': y_pred_test})
submission['Id'] = submission['Id'].str[:-5]

submission

Unnamed: 0,Id,target
0,00054c878,1
1,0007285a3,1
2,00076c5a6,1
3,001349290,1
4,001a52e92,1
...,...,...
7970,ffbce04ef,1
7971,ffc2d976b,1
7972,ffc905909,1
7973,ffe276f3e,1


In [22]:
submission['target'].value_counts()

1    7975
Name: target, dtype: int64