In [168]:
import h5py
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [169]:
def getdatasetname(file_name_with_dir):
    """Derive the dataset name from a file path.

    The directory part (everything up to the last "/") is dropped, then
    everything from the last "_" onwards is removed from the file name,
    e.g. "./data/rest_105923_1.h5" -> "rest_105923".

    Returns an empty string when the file name contains no underscore.
    """
    # rpartition yields ("", "", s) when the separator is absent, so a path
    # without "/" is treated as a bare file name.
    _, _, base_name = file_name_with_dir.rpartition("/")
    # Keep only what precedes the final underscore ("" if there is none).
    stem, _, _ = base_name.rpartition("_")
    return stem

In [170]:
filename_path = "./data/Intra/train/rest_105923_1.h5"
# The dataset key inside the HDF5 file is derived from the file name.
dataset_name = getdatasetname(filename_path)

with h5py.File(filename_path, "r") as f:
    # Index the file directly rather than using f.get(): a missing key then
    # raises a KeyError instead of returning None and failing on `None[()]`
    # with an unrelated TypeError. `[()]` reads the whole dataset into memory,
    # so `matrix` stays usable after the file is closed.
    matrix = f[dataset_name][()]

print(type(matrix))
print(matrix.shape)

<class 'numpy.ndarray'>
(248, 35624)


Preprocessing: MinMax Scaling


In [171]:
# MinMaxScaler rescales each column of its input independently to [0, 1].
# NOTE(review): `matrix` is (248, 35624), so every one of the 35624 columns is
# scaled across the 248 rows. If scaling each row over its 35624 values was
# intended instead, fit on `matrix.T` and transpose back -- confirm.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(matrix)
print(scaled_data.shape)

(248, 35624)


Downsampling


In [172]:
# How many 2034-sample windows fit into the recording; a fractional result
# means the last window is only partially filled.
n_samples = scaled_data.shape[1]
print(n_samples / 2034)

17.514257620452312


Calculate the factors of the sample rate 2034. The factors are used to retrieve an evenly spaced proportion of the data. Alternatively, a sliding window could be used that retrieves chunks of data, as opposed to individual values.


In [173]:
# Proper divisors of the sample rate 2034 (2034 itself is not included).
factors = np.array([d for d in range(1, 2034) if 2034 % d == 0])

print(factors)

[   1    2    3    6    9   18  113  226  339  678 1017]


Each one of the factors can be used as a subsample size for the current sample rate of 2034.
You can specify a frequency (n) for the data you want to keep.
Each factor has a limit for (n), after which the subsample would be the same size as the sample.


In [174]:
# Print floats in plain decimal notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Element-wise 2034 / factor: the complementary divisor for every factor.
min_sample_freqency = 2034 / factors

print(min_sample_freqency)

[2034. 1017.  678.  339.  226.  113.   18.    9.    6.    3.    2.]


A new sample frequency can be specified using the factors of the original frequency, 2034.
A frequency of 1017 means every second value is kept, i.e., 2034/1017 = 2.
The lower the chosen factor, the less data is kept from the sample, e.g., 2034/113 = 18 (every 18th value is kept).


In [175]:
def donwsample(dataset, frequency, sample_rate=2034):
    """Downsample a 2-D recording by keeping evenly spaced columns.

    The columns of ``dataset`` are cut into consecutive windows of
    ``sample_rate`` columns. Inside every window, each
    ``int(sample_rate / frequency)``-th column is kept, always starting with
    the window's first column. A trailing window shorter than ``sample_rate``
    is subsampled the same way.

    Parameters
    ----------
    dataset : array-like, shape (n_rows, n_columns)
        Data to downsample along axis 1.
    frequency : int or float
        Desired number of columns kept per full window. When it does not
        divide ``sample_rate`` evenly, the step is truncated towards zero,
        so more than ``frequency`` columns are kept per window.
    sample_rate : int, default 2034
        Number of columns per window (the original sampling rate).

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_kept)
        The selected columns in their original order. An input with zero
        columns yields an array of shape ``(n_rows, 0)``.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D, if ``sample_rate`` or ``frequency`` is not
        positive, or if ``frequency`` exceeds ``sample_rate`` (the step
        between kept columns would be zero).
    """
    dataset = np.asarray(dataset)
    if dataset.ndim != 2:
        raise ValueError(
            f"dataset must be 2-D, got {dataset.ndim} dimension(s)"
        )
    if sample_rate < 1:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")
    if frequency <= 0:
        raise ValueError(f"frequency must be positive, got {frequency}")

    step = int(sample_rate / frequency)
    if step < 1:
        raise ValueError(
            f"frequency ({frequency}) must not exceed sample_rate ({sample_rate})"
        )

    n_columns = dataset.shape[1]
    window_starts = np.arange(0, n_columns, sample_rate)
    offsets = np.arange(0, sample_rate, step)

    # Every (window start + in-window offset) pair, flattened window by
    # window. Offsets are smaller than sample_rate, so the result is already
    # in ascending order.
    keep = (window_starts[:, None] + offsets[None, :]).ravel()
    # Drop positions that fall past the end of a partial last window.
    keep = keep[keep < n_columns]

    return dataset[:, keep]

In [176]:
# Compare the column count after keeping 113 values per 2034-sample window
# with the column count of the full scaled recording.
downsampled = donwsample(scaled_data, 113)
print(downsampled.shape)
print(scaled_data.shape)

(248, 1980)
(248, 35624)
