In [None]:
fast_sub = True # set this to False to generate the whole dataset; while True, only the first 100 ids of each split are processed
train = True # set this to True to generate (and zip) the train set
test = True # set this to True to generate (and zip) the test set

# Constant-Q Transform Preprocessing for G2Net Gravitational Wave Detection

If you find this notebook and dataset helpful, please consider upvoting it. I took inspiration from the notebook of Geir Drange: https://www.kaggle.com/mistag/data-preprocessing-with-gwpy, so please give him an upvote too.

If you would like to use the pre-baked dataset I made, here are the links, but if you are here to learn how I did it, you are in the right place =)

## Link to the dataset
* **TRAIN DATASET PT1**: https://www.kaggle.com/coldfir3/g2net-cqt-dataset-pt1-jpgrgb
* **TRAIN DATASET PT2**: https://www.kaggle.com/coldfir3/g2net-cqt-dataset-pt2-jpgrgb
* **TRAIN DATASET PT3**: https://www.kaggle.com/coldfir3/g2net-cqt-dataset-pt3-jpgrgb
* **TEST DATASET**: https://www.kaggle.com/coldfir3/g2net-cqt-dataset-test-jpgrgb

## Explanation
This notebook inputs raw waveform data and outputs RGB images. For this, I employed the following pipeline:

1. Read the .npy file and store it as a NumPy array.
1. Convert each channel to a gwpy TimeSeries
1. Whiten the signal and apply a Tukey window.
1. (optional) Apply a bandpass filter. It is optional as `q_transform` will already do this. I also noticed that by not using this step the borders of the image are sharper.
1. Apply the q-transform
1. Convert to RGB:
    1. Stack each channel on `dim = -1`
    1. Clip the values to q_max and scale it from `[0, q_max]` to `[0, 255]`
    1. Convert the array to unsigned 8 bits
    1. Create the image and rotate it 90º, with frequency on the vertical axis and time on the horizontal axis.
1. Save it as a .jpeg file

The main difference from Geir's notebook is the way I built the RGB images. In the original code, Geir uses `MinMaxScaler` to normalize each channel to [0, 1] before assembling the images. I think there are a couple of problems with this approach. First, when min-max scaling is coupled with the subsequent 8-bit discretization, outliers could wash out all the information contained in the image (i.e. the image would be mostly black). Second, if the signal is stronger in detector A than in detector B, you will lose this information by normalizing each channel independently.

It is worth noting, though, that the default value I chose for `q_max` was utterly arbitrary, and different values could lead to different results. The other defaults were ported from Geir's notebook, and I encourage you to experiment with different values and kindly share your findings in the comment section below.

## Installing and loading the dependencies

In [None]:
%%capture
!python -m pip install gwpy
!pip install astropy==4.2.1

In [None]:
from gwpy.timeseries import TimeSeries
from gwpy.plot import Plot
import numpy as np
from scipy import signal
from PIL import Image

import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from pathlib import Path

## Main processing function

In [None]:
def sig2rgb(fname, whiten = True, window=0.2, bandpass=False, f_range = (30,400), q_range = (16,32), q_max = 10, sample_rate = 2048):
    """Convert one stored multi-channel recording into a Q-transform image.

    Every row of the array saved in `fname` is handled as an independent
    channel: it is wrapped in a gwpy ``TimeSeries``, optionally whitened and
    band-passed, and then Q-transformed. The per-channel results are stacked
    along the last axis, clipped to ``[0, q_max]``, rescaled to ``[0, 255]``
    and cast to unsigned 8-bit before being turned into a PIL image.

    NOTE(review): the result is only a valid RGB image when the file holds
    exactly three channels; this is assumed, not checked.

    Parameters
    ----------
    fname : path-like
        ``.npy`` file to load with ``np.load``.
    whiten : bool
        Whiten each channel before the Q-transform. Previously this flag was
        accepted but ignored (whitening always ran); it is now honoured.
    window : float
        Shape parameter of the Tukey window handed to ``TimeSeries.whiten``.
        Only used when `whiten` is true.
    bandpass : bool
        Apply ``TimeSeries.bandpass(*f_range)`` before the Q-transform.
    f_range : tuple
        Frequency range, used both for the optional band-pass and as the
        ``frange`` of the Q-transform.
    q_range : tuple
        ``qrange`` forwarded to ``TimeSeries.q_transform``.
    q_max : number
        Upper clipping value; it maps to 255 in the output image.
    sample_rate : number
        Sample rate used to build each ``TimeSeries`` (default 2048, the
        value that used to be hard-coded).

    Returns
    -------
    PIL.Image.Image
        The stacked image rotated by 90 degrees with ``expand=1``.
    """
    # One TimeSeries per row (channel) of the stored array
    channels = [TimeSeries(row, sample_rate=sample_rate) for row in np.load(fname)]
    # Whiten the signal using a Tukey window (skipped when whiten is False)
    if whiten:
        channels = [ts.whiten(window=("tukey", window)) for ts in channels]
    # (optional) bandpass filter
    if bandpass:
        channels = [ts.bandpass(*f_range) for ts in channels]
    # Q-transform; whiten=False here because any whitening was done above
    spectra = [
        ts.q_transform(qrange=q_range, frange=f_range, logf=True, whiten=False)
        for ts in channels
    ]
    # Stack the channels on the last axis and convert to an 8-bit image
    img = np.stack(spectra, axis = -1)
    img = np.clip(img, 0, q_max)/q_max * 255
    img = img.astype(np.uint8)
    # Rotate so the array's second axis becomes the vertical image axis
    img = Image.fromarray(img).rotate(90, expand=1)
    return img

In [None]:
# Quick check on a single training sample. As the last expression of the cell,
# the returned image is what the notebook shows as the cell output.
sig2rgb('../input/g2net-gravitational-wave-detection/train/0/0/0/000a5b6e5c.npy')

## Generating the dataset and zipping the files

In [None]:
def save_img(x, folder_out, **kwargs):
    """Render the sample with id `x` to a JPEG inside `folder_out`.

    `folder_out` plays two roles: it selects the sub-folder of the competition
    input directory the ``.npy`` file is read from, and it is the local
    directory the ``.jpg`` is written to (it must already exist). The input
    file sits three levels deep, one level per leading character of the id.
    Any extra keyword arguments are forwarded unchanged to ``sig2rgb``.
    """
    source_dir = '../input/g2net-gravitational-wave-detection/' + folder_out
    # <source_dir>/<x[0]>/<x[1]>/<x[2]>/<x>.npy
    fname = Path('/'.join([source_dir, x[0], x[1], x[2], x + '.npy']))
    jpg_name = fname.with_suffix('.jpg').name
    image = sig2rgb(fname, **kwargs)
    image.save(folder_out + '/' + jpg_name)

In [None]:
from zipfile import ZipFile
import shutil
import os
def zip_folder(folder, rm_original = True):
    """Pack each directory found under `folder` into its own flat zip file.

    ``os.walk`` visits `folder` and every sub-directory; for each one a zip
    named ``<directory name>.zip`` is created in the current working
    directory, holding that directory's files without any path prefix.

    Parameters
    ----------
    folder : str
        Directory to archive. A trailing path separator is accepted.
    rm_original : bool
        When true, each file is deleted right after it has been added to the
        archive, and `folder` itself is removed once everything is zipped.
    """
    # iterate over all the directories below (and including) `folder`
    for folderName, _, filenames in os.walk(folder):
        # abspath drops any trailing separator and basename is separator-aware,
        # so 'test/' yields 'test.zip' instead of an archive named '.zip'
        archive_name = os.path.basename(os.path.abspath(folderName)) + '.zip'
        with ZipFile(archive_name, 'w') as zipObj:
            for filename in filenames:
                filePath = os.path.join(folderName, filename)
                # store the file under its bare name (flat archive)
                zipObj.write(filePath, filename)
                # delete the file straight away to free disk space
                if rm_original:
                    os.remove(filePath)
    if rm_original:
        shutil.rmtree(folder)

### Test data

In [None]:
# The ids of the test samples are taken from the sample submission file
test_df = pd.read_csv('../input/g2net-gravitational-wave-detection/sample_submission.csv')
# fast_sub keeps only the first 100 ids for a quick run
test_ids = test_df['id'][:100] if fast_sub else test_df['id']
if test:
    os.makedirs('test', exist_ok = True)
    # Convert the samples with 4 parallel workers; `o` only captures the results
    o = Parallel(n_jobs=4)(
        delayed(save_img)(sample_id, 'test') for sample_id in tqdm(test_ids)
    )
    zip_folder('test')

### Train data

In [None]:
# The ids of the training samples are taken from the training labels file.
# Dedicated train_* names are used so this cell no longer overwrites the
# test_df / test_ids variables with training data.
train_df = pd.read_csv('../input/g2net-gravitational-wave-detection/training_labels.csv')
# fast_sub keeps only the first 100 ids for a quick run
train_ids = train_df['id'][:100] if fast_sub else train_df['id']
if train:
    os.makedirs('train', exist_ok = True)
    # Convert the samples with 4 parallel workers; `o` only captures the results
    o = Parallel(n_jobs=4)(delayed(save_img)(x, 'train') for x in tqdm(train_ids))
    zip_folder('train')