# Undersampling

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

from tqdm import tqdm

import time
import torch
import gc

# Lift the column limit so displayed DataFrames show every column instead of eliding some.
pd.set_option('display.max_columns', None)

## Daten laden

In [2]:
# Switch between the two processed splits; everything below reads from and
# writes to the folder selected here.
IS_TRAINING = True

if IS_TRAINING:
    folder = "train/"
else:
    folder = "validation/"

# Kept as a plain string ending in "/" because later cells build paths by concatenation.
file_path_prefix = "../../data/processed/transformer/" + folder

# Full overview table for the selected split.
overview_path = file_path_prefix + "overview-v2.parquet"
overview = pd.read_parquet(overview_path)

## Undersample

In [3]:
# Show the loaded overview frame as the cell output (pandas elides the middle rows).
overview

Unnamed: 0,num_series_id,step,awake,critical_event_point,series_index
0,1,0,1,0.0,0
1,1,1,1,0.0,1
2,1,2,1,0.0,2
3,1,3,1,0.0,3
4,1,4,1,0.0,4
...,...,...,...,...,...
73060068,277,589675,1,0.0,589675
73060069,277,589676,1,0.0,589676
73060070,277,589677,1,0.0,589677
73060071,277,589678,1,0.0,589678


In [4]:
# Rows whose critical_event_point exceeds 0.99999 are treated as the events.
# NOTE(review): presumably the score reaches 1.0 exactly on the event step — confirm.
is_event = overview['critical_event_point'] > 0.99999
events = overview.loc[is_event]

In [5]:
def get_is_included(series, events, window=360):
    """Flag every row of ``series`` that lies within ``window`` steps of an event.

    Parameters
    ----------
    series : pd.DataFrame
        Rows of a single series. Must contain a ``step`` column. A boolean
        ``include`` column is added to this frame in place.
    events : pd.DataFrame
        Event rows belonging to the same series. Only the ``step`` column is read.
    window : int, default 360
        Number of steps kept on each side of an event; both ends are inclusive.

    Returns
    -------
    pd.DataFrame
        The same ``series`` object, now carrying the ``include`` column.
    """
    # Cast to a signed type first: with an unsigned ``step`` column the
    # subtraction ``event_step - window`` would wrap around or overflow.
    steps = np.asarray(series['step'], dtype=np.int64)
    event_steps = np.asarray(events['step'], dtype=np.int64)

    include = np.zeros(len(series), dtype=bool)

    # Compare against the ``step`` values themselves instead of slicing ``.loc``
    # by index label. Label slicing is only correct when the index happens to
    # equal ``step``; on any other index it selects the wrong rows or none.
    # Out-of-range bounds need no clamping here: they simply match nothing.
    for event_step in event_steps:
        include |= (steps >= event_step - window) & (steps <= event_step + window)

    series['include'] = include
    return series

In [6]:
def get_merged_series(num_series_id, events):
    """Load one series from the overview file and flag its rows near events.

    Only the rows whose ``num_series_id`` matches are read, via a parquet
    predicate filter, so the full overview table is not loaded again.
    ``events`` is narrowed to the same series before the rows are flagged by
    ``get_is_included``.

    Returns the per-series frame with the added ``include`` column.
    """
    overview_file = file_path_prefix + "overview-v2.parquet"
    only_this_series = [('num_series_id', '=', num_series_id)]

    series_rows = pd.read_parquet(overview_file, filters=only_this_series)
    series_events = events.query('num_series_id == @num_series_id')

    return get_is_included(series_rows, series_events)

In [7]:
%%time

# One flagged frame per series that contains at least one event.
overview_undersample_data = []

total_len = events.num_series_id.nunique()

for i, num_series_id in enumerate(events.num_series_id.unique()):
    print(f'Step {i+1} of {total_len} ({num_series_id})')
    # Append the result directly. The frame stays referenced by the list, so
    # deleting a local name and forcing gc.collect() on every iteration would
    # release nothing and only add a full collection pass per series.
    overview_undersample_data.append(get_merged_series(num_series_id, events))

# NOTE(review): every per-series frame is kept whole here (rows with
# include == False as well), so peak memory is roughly the full overview.
# Stack the per-series frames and renumber the rows from 0.
overview_undersample = pd.concat(overview_undersample_data).reset_index(drop=True)

Step 1 of 213 (1)
Step 2 of 213 (2)
Step 3 of 213 (3)
Step 4 of 213 (4)
Step 5 of 213 (5)
Step 6 of 213 (6)
Step 7 of 213 (8)
Step 8 of 213 (9)
Step 9 of 213 (11)
Step 10 of 213 (12)
Step 11 of 213 (13)
Step 12 of 213 (14)
Step 13 of 213 (15)
Step 14 of 213 (17)
Step 15 of 213 (19)
Step 16 of 213 (20)
Step 17 of 213 (22)
Step 18 of 213 (23)
Step 19 of 213 (25)
Step 20 of 213 (28)
Step 21 of 213 (29)
Step 22 of 213 (30)
Step 23 of 213 (31)
Step 24 of 213 (33)
Step 25 of 213 (34)
Step 26 of 213 (35)
Step 27 of 213 (36)
Step 28 of 213 (37)
Step 29 of 213 (38)
Step 30 of 213 (39)
Step 31 of 213 (40)
Step 32 of 213 (41)
Step 33 of 213 (42)
Step 34 of 213 (43)
Step 35 of 213 (45)
Step 36 of 213 (46)
Step 37 of 213 (49)
Step 38 of 213 (50)
Step 39 of 213 (51)
Step 40 of 213 (52)
Step 41 of 213 (53)
Step 42 of 213 (54)
Step 43 of 213 (56)
Step 44 of 213 (57)
Step 45 of 213 (58)
Step 46 of 213 (59)
Step 47 of 213 (60)
Step 48 of 213 (61)
Step 49 of 213 (62)
Step 50 of 213 (64)
Step 51 of 213 (6

## Overview speichern

In [8]:
# Keep only the rows flagged as lying inside an event window, then renumber them from 0.
is_included = overview_undersample['include'].eq(True)
overview_undersample = overview_undersample.loc[is_included].copy().reset_index(drop=True)

In [9]:
# Show the undersampled frame as the cell output (pandas elides the middle rows).
overview_undersample

Unnamed: 0,num_series_id,step,awake,critical_event_point,series_index,include
0,1,4632,1,0.011109,4632,True
1,1,4633,1,0.011389,4633,True
2,1,4634,1,0.011676,4634,True
3,1,4635,1,0.011969,4635,True
4,1,4636,1,0.012269,4636,True
...,...,...,...,...,...,...
5093401,277,581961,1,0.012269,581961,True
5093402,277,581962,1,0.011969,581962,True
5093403,277,581963,1,0.011676,581963,True
5093404,277,581964,1,0.011389,581964,True


In [10]:
# Write the undersampled overview into the same split folder it was read from.
undersample_path = file_path_prefix + "overview-v2-undersample.parquet"
overview_undersample.to_parquet(undersample_path)