# Data Cleaning

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import gc

## Daten laden

In [2]:
train_events = pd.read_csv('../../data/raw/events.csv')

In [3]:
print(train_events.shape)

(14508, 6)


## Serien säubern

In [4]:
### PARAMS
ANGLEZ_VARIANCE_SEQUENCE_LENGTH = 6 * 60 * 12 # 8h
ANGLEZ_REPETITION_SEQUENCE_LENGTH = 4 * 60 * 12 # 4h

CLEAN_BUFFER = 0 * 60 * 12 # 3h

In [5]:
def mark_clean_anglez_too_low_variance(series):
    last_step = series.iloc[-1]['step']       
    
    for current_start_step in range(0, len(series), ANGLEZ_VARIANCE_SEQUENCE_LENGTH):
        current_end_step = current_start_step + ANGLEZ_VARIANCE_SEQUENCE_LENGTH

        series_chunk = series[current_start_step:current_end_step]

        series_chunk_anglez = series_chunk['anglez'].abs()
        if not (series_chunk_anglez > 50).any():
            clean_from = max(0, current_start_step - CLEAN_BUFFER)
            clean_to = min(last_step, current_end_step + CLEAN_BUFFER)
            
            series.loc[clean_from:clean_to, 'clean'] = 1

    return series

In [6]:
def mark_clean_repetition(series):
    last_step = series.iloc[-1]['step']       

    for current_start_step in range(0, len(series), ANGLEZ_REPETITION_SEQUENCE_LENGTH):
        current_end_step = current_start_step + ANGLEZ_REPETITION_SEQUENCE_LENGTH

        series_chunk = series[current_start_step:current_end_step].reset_index(drop=True)

        for comparing_start_step in range(current_end_step, len(series), ANGLEZ_REPETITION_SEQUENCE_LENGTH):
            comparing_end_step = comparing_start_step + ANGLEZ_REPETITION_SEQUENCE_LENGTH
            comparing_series_chunk = series[comparing_start_step:comparing_end_step].reset_index(drop=True)

            if series_chunk['anglez'].equals(comparing_series_chunk['anglez']):
                clean_from = max(0, current_start_step - CLEAN_BUFFER)
                clean_to = min(last_step, current_end_step + CLEAN_BUFFER)
                series.loc[clean_from:clean_to, 'clean'] = 1
                
                clean_from = max(0, comparing_start_step - CLEAN_BUFFER)
                clean_to = min(last_step, comparing_end_step + CLEAN_BUFFER)
                series.loc[clean_from:clean_to, 'clean'] = 1

    return series

In [7]:
def mark_clean(series):
    series['clean'] = 0

    series = mark_clean_anglez_too_low_variance(series)
    series = mark_clean_repetition(series)

    return series

In [8]:
def cleaning(series_id, events):
    events = events[events.series_id == series_id].reset_index(drop=True)
    series = pd.read_parquet('../../data/raw/series_merged.parquet', filters=[('series_id', '=', series_id)])
    
    series = mark_clean(series)
    series = series[series.clean == 0]
    
    return series

In [9]:
%%time

train_data = []

total_len = train_events.series_id.nunique()

for i, series_id in enumerate(train_events.series_id.unique()):
    print(f'Step {i+1} of {total_len}')
    train = cleaning(series_id, train_events)
    train_data.append(train)
    del train
    gc.collect()

train_series = pd.concat(train_data).reset_index(drop=True)

Step 1 of 277
Step 2 of 277
Step 3 of 277
Step 4 of 277
Step 5 of 277
Step 6 of 277
Step 7 of 277
Step 8 of 277
Step 9 of 277
Step 10 of 277
Step 11 of 277
Step 12 of 277
Step 13 of 277
Step 14 of 277
Step 15 of 277
Step 16 of 277
Step 17 of 277
Step 18 of 277
Step 19 of 277
Step 20 of 277
Step 21 of 277
Step 22 of 277
Step 23 of 277
Step 24 of 277
Step 25 of 277
Step 26 of 277
Step 27 of 277
Step 28 of 277
Step 29 of 277
Step 30 of 277
Step 31 of 277
Step 32 of 277
Step 33 of 277
Step 34 of 277
Step 35 of 277
Step 36 of 277
Step 37 of 277
Step 38 of 277
Step 39 of 277
Step 40 of 277
Step 41 of 277
Step 42 of 277
Step 43 of 277
Step 44 of 277
Step 45 of 277
Step 46 of 277
Step 47 of 277
Step 48 of 277
Step 49 of 277
Step 50 of 277
Step 51 of 277
Step 52 of 277
Step 53 of 277
Step 54 of 277
Step 55 of 277
Step 56 of 277
Step 57 of 277
Step 58 of 277
Step 59 of 277
Step 60 of 277
Step 61 of 277
Step 62 of 277
Step 63 of 277
Step 64 of 277
Step 65 of 277
Step 66 of 277
Step 67 of 277
Step

In [10]:
print(train_series.shape)

(94436069, 10)


## Serien ohne Events entfernen

In [11]:
series_id_list = train_series['series_id'].unique().tolist()

events_series_id_list = train_events.dropna()['series_id'].unique().tolist()

series_without_events = list(set(series_id_list) - set(events_series_id_list))

print('Serien ohne Events: \n', series_without_events)

Serien ohne Events: 
 ['a3e59c2ce3f6', 'c5d08fc3e040', '0f9e60a8e56d', '390b487231ce', 'c7b1283bb7eb', '2fc653ca75c7', 'e11b9d69f856', '89c7daa72eee']


In [12]:
cleaned_train_series = train_series[~train_series.series_id.isin(series_without_events)]
cleaned_train_events = train_events[~train_events.series_id.isin(series_without_events)]

## Gesäuberte Daten speichern

In [13]:
print(cleaned_train_series.shape)
print(cleaned_train_events.shape)

(93442547, 10)
(14154, 6)


In [14]:
cleaned_train_series.to_parquet('../../data/cleaned/series.parquet')
cleaned_train_events.to_csv('../../data/cleaned/events.csv', index=False)