In [None]:
!pip install google-cloud-logging
import logging
import google.cloud.logging
import google.cloud.logging_v2 as logging_v2
from os import environ

# --- Cloud Logging setup -----------------------------------------------------
client = logging_v2.client.Client()

# Formatter applied to every record sent through the Cloud Logging handler.
# NOTE: `datefmt` only affects %(asctime)s, which `fmt` does not currently
# include; it is kept (with a valid %d directive) so that adding %(asctime)s
# later produces ISO-like timestamps.
google_log_format = logging.Formatter(
    fmt='%(name)s | %(module)s | %(funcName)s | %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S')

handler = client.get_default_handler()
handler.setFormatter(google_log_format)

cloud_logger = logging.getLogger("vertex-ai-notebook-logger")
cloud_logger.setLevel("CRITICAL")
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack another handler and emit every record twice.
if not cloud_logger.handlers:
    cloud_logger.addHandler(handler)

# `log` is the name the rest of the notebook uses; it is the same logger object.
log = cloud_logger
log.critical("This is a log from a Vertex AI Notebook!")

# --- Numeric library threading -----------------------------------------------
# Limit the OpenMP / MKL / OpenBLAS thread pools to one thread each. These are
# set here, before numpy is imported in the next cell.
import os
os.environ['OMP_NUM_THREADS'] = "1"
os.environ['MKL_NUM_THREADS'] = "1"
os.environ['OPENBLAS_NUM_THREADS'] = "1"

In [None]:
import pandas as pd
import numpy as np
import pywt
from tsfresh import extract_features
from tsfresh.utilities.distribution import MultiprocessingDistributor
from tqdm import tqdm
from tsfresh import extract_relevant_features
import tsfresh

# Which batch of eeg ids this notebook instance works on; the driver cell
# slices the id list with it, and it is also part of the output file name so
# that each batch writes to its own CSV.
offset = 1

# Prefix of every data path used in this notebook.
# NOTE(review): presumably a mounted GCS bucket, relative to the working
# directory -- confirm.
path = 'gcs/hms_applied_cv/'

# CSV that collects the extracted features for the batch selected by `offset`.
path_for_file = path + f'hms-harmful-brain-activity-classification/new_features{offset}.csv'

In [None]:
def _mark_run_centers(frame):
    """Return a boolean Series flagging the middle row of every run in `frame`.

    A run is a maximal stretch of consecutive rows (in the frame's current row
    order) that share both `eeg_id` and `concatenated_scores`. For a run of
    length n whose first row is at position s, the flagged row is at position
    s + (n - 1) // 2, i.e. the lower middle when n is even.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns `eeg_id` and `concatenated_scores`.

    Returns
    -------
    pandas.Series of bool, indexed like `frame`.
    """
    scores = frame['concatenated_scores']
    eeg_ids = frame['eeg_id']
    # A row opens a new run when either key differs from the row above it
    # (the first row compares against NaN and therefore always opens a run).
    starts_run = (scores != scores.shift()) | (eeg_ids != eeg_ids.shift())
    # Passed as a plain array so grouping is positional, not index-aligned.
    run_id = starts_run.cumsum().to_numpy()
    runs = frame.groupby(run_id)
    position_in_run = runs.cumcount()
    run_length = runs['eeg_id'].transform('size')
    return position_in_run == (run_length - 1) // 2


df = pd.read_csv(path + 'hms-harmful-brain-activity-classification/train.csv')
# Concatenate the last 6 columns into one string key per row
df['concatenated_scores'] = df.iloc[:, -6:].apply(lambda x: ''.join(x.astype(str)), axis=1)

# Sort by eeg_id and then eeg_sub_id. The index is reset so that row labels
# match the sorted row positions; without it, label-based lookups would still
# follow the order of the input file.
df = df.sort_values(['eeg_id', 'eeg_sub_id']).reset_index(drop=True)

# Flag the middle row of each run of identical (eeg_id, scores) rows
df['is_center'] = _mark_run_centers(df)

df.to_csv(path + 'hms-harmful-brain-activity-classification/cleaned_train.csv')
# Read the file back so `cleaned_train` has exactly the columns later cells
# see when they load it from disk, then keep only the center rows.
cleaned_train = pd.read_csv(path + 'hms-harmful-brain-activity-classification/cleaned_train.csv')
cleaned_train = cleaned_train[cleaned_train['is_center'] == True]
log.critical("Cleaned_Train Created!")

In [None]:
# Wavelet name handed to denoise(); a falsy value turns denoising off in
# spectrogram_from_eeg.
USE_WAVELET = 'db8'

# NOTE(review): this order (LL, LP, RP, RR) differs from the labels that
# spectrogram_from_eeg gives the rows of FEATS (LL, LP, RR, RP) -- confirm
# which one is intended before using NAMES to label FEATS.
NAMES = [
    'LL',
    'LP',
    'RP',
    'RR',
]

# Four chains of five EEG column names each; consecutive names in a chain are
# differenced pairwise in spectrogram_from_eeg.
FEATS = [
    ['Fp1', 'F7', 'T3', 'T5', 'O1'],
    ['Fp1', 'F3', 'C3', 'P3', 'O1'],
    ['Fp2', 'F8', 'T4', 'T6', 'O2'],
    ['Fp2', 'F4', 'C4', 'P4', 'O2'],
]

# DENOISE FUNCTION
def maddest(d, axis=None):
    """Return the mean absolute deviation of `d` about its mean.

    Parameters
    ----------
    d : array_like
        Input values.
    axis : None, int or tuple of int, optional
        Axis or axes to reduce over. None (the default) reduces over all
        elements and yields a scalar.

    Returns
    -------
    numpy scalar or numpy.ndarray
        mean(|d - mean(d)|) taken along `axis`.
    """
    d = np.asarray(d)
    # keepdims=True keeps the reduced axis with length 1, so the mean
    # broadcasts against `d` whichever axis is reduced.
    centre = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - centre), axis=axis)

def denoise(x, wavelet='haar', level=1):
    """Hard-threshold the wavelet coefficients of `x` and reconstruct the signal.

    `x` is decomposed with `pywt.wavedec(..., mode="per")`. A noise scale is
    computed as maddest(coefficients[-level]) / 0.6745 and turned into a
    threshold by multiplying with sqrt(2 * log(len(x))). Every coefficient
    array except the first is hard-thresholded with that value, and the result
    of `pywt.waverec` on the thresholded coefficients is returned.

    Parameters
    ----------
    x : 1-D sequence of samples.
    wavelet : wavelet name passed to pywt (default 'haar').
    level : which coefficient array, counted from the end, is used for the
        noise estimate (default 1, the last array).
    """
    bands = pywt.wavedec(x, wavelet, mode="per")

    noise_scale = (1/0.6745) * maddest(bands[-level])
    cutoff = noise_scale * np.sqrt(2*np.log(len(x)))

    # The first array is passed through unchanged; the rest are thresholded.
    thresholded = [bands[0]]
    for detail in bands[1:]:
        thresholded.append(pywt.threshold(detail, value=cutoff, mode='hard'))

    return pywt.waverec(thresholded, wavelet, mode='per')

def _load_existing_features(csv_path):
    """Read the feature CSV at `csv_path`, or return None if there is none yet.

    Columns whose name contains 'unnamed' (index columns produced by earlier
    `to_csv` calls) are dropped from the returned frame.
    """
    try:
        existing = pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return None
    index_cols = existing.columns[existing.columns.str.contains('unnamed', case=False)]
    return existing.drop(columns=index_cols)


def _chain_signals(eeg, window=10_000):
    """Build the four summed chain signals from the middle `window` rows of `eeg`.

    For each row of FEATS, consecutive columns are differenced pairwise, NaNs
    in each difference are replaced by that difference's mean (or the whole
    difference by zeros if it is entirely NaN), the difference is denoised when
    USE_WAVELET is set, and the four results are added together.

    Returns a dict mapping 'LL', 'LP', 'RR', 'RP' (in FEATS row order) to
    float64 arrays.
    """
    # Clamp at 0 so a recording shorter than `window` is used from its start
    # instead of being sliced with a negative offset.
    middle = max((len(eeg) - window) // 2, 0)
    eeg = eeg.iloc[middle:middle + window]

    chains = {}
    for name, cols in zip(('LL', 'LP', 'RR', 'RP'), FEATS):
        total = None
        for left, right in zip(cols[:-1], cols[1:]):
            # COMPUTE PAIR DIFFERENCES
            x = eeg[left].values - eeg[right].values

            # FILL NANS
            if np.isnan(x).all():
                x = np.zeros_like(x)
            else:
                x = np.nan_to_num(x, nan=np.nanmean(x))

            # DENOISE
            if USE_WAVELET:
                x = denoise(x, wavelet=USE_WAVELET)

            total = x if total is None else total + x
        chains[name] = np.asarray(total, dtype=np.float64)
    return chains


def spectrogram_from_eeg(ids, display=False):
    """Extract tsfresh features for each eeg id and append them to `path_for_file`.

    For every id that is not already present in the feature CSV, the EEG
    parquet file is loaded, reduced to four chain signals (LL, LP, RR, RP),
    and passed to tsfresh `extract_features` with `EfficientFCParameters`.
    The resulting row gets an `eeg_id` column and the feature CSV is rewritten
    after each id, so an interrupted run can be resumed by calling the
    function again.

    Parameters
    ----------
    ids : iterable of eeg ids to process.
    display : bool, optional
        Unused; accepted so that existing calls passing it keep working.

    Returns
    -------
    pandas.DataFrame
        One row per id processed during this call (empty if every id was
        skipped or failed to load).
    """
    ids = list(ids)

    # Load once what earlier runs produced; afterwards the in-memory copy is
    # kept in sync with the file, so the file never has to be re-read.
    all_features = _load_existing_features(path_for_file)
    if all_features is not None and 'eeg_id' in all_features.columns:
        done_ids = set(all_features['eeg_id'])
    else:
        done_ids = set()

    settings = tsfresh.feature_extraction.settings.EfficientFCParameters()
    new_rows = []

    for i, eeg_id in enumerate(tqdm(ids)):
        if eeg_id in done_ids:
            print("skip")
            continue

        log.critical(f"{i} out of {len(ids)}")
        try:
            eeg = pd.read_parquet(path + f'hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
        except Exception as e:
            # Without a recording there is nothing to extract for this id;
            # report it and move on to the next one.
            print("ERROR")
            print(e)
            continue

        chains = _chain_signals(eeg)
        n_samples = len(chains['LL'])
        long_df = pd.DataFrame({
            'time': np.arange(n_samples),
            'id': np.full(n_samples, eeg_id),
            **chains,
        })

        log.critical("Extraction started")
        output = extract_features(long_df,
                                  column_id='id', column_sort='time',
                                  default_fc_parameters=settings)
        # The id is carried by the eeg_id column, so the index is not needed.
        output = output.reset_index(drop=True)
        output.insert(0, 'eeg_id', eeg_id)

        if all_features is None:
            all_features = output
        else:
            all_features = pd.concat([all_features, output], axis=0, ignore_index=True)
        # The index is written too; it comes back as an 'Unnamed' column and
        # is dropped when the file is loaded again.
        all_features.to_csv(path_for_file)
        done_ids.add(eeg_id)
        new_rows.append(output)
        print(f"{100 * len(all_features) / len(ids):.1f}%")

    if not new_rows:
        return pd.DataFrame()
    return pd.concat(new_rows, axis=0, ignore_index=True)

Driver: run feature extraction for the batch of EEG ids selected by `offset`

In [None]:
# Number of eeg ids handled per value of `offset`.
BATCH_SIZE = 500

cleaned_train = pd.read_csv(path + 'hms-harmful-brain-activity-classification/cleaned_train.csv')
cleaned_train = cleaned_train[cleaned_train['is_center'] == True]
ids = set(cleaned_train['eeg_id'])

# Slice a sorted list rather than the set itself: set iteration order is an
# implementation detail, so positional batches taken from it are not
# guaranteed to be the same from one run or machine to the next.
# NOTE: feature files produced under a different id ordering may not line up
# with the batches defined here.
ordered_ids = sorted(ids)
batch_start = BATCH_SIZE * offset
diff = ordered_ids[batch_start:batch_start + BATCH_SIZE]

spectrogram_from_eeg(diff)