# Datenqualitätsanalyse

In [1]:
# Base
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
import seaborn as sns
from datetime import datetime

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns


# Scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Daten laden

In [2]:
train_events = pd.read_csv('../../data/raw/train_events.csv')

##  Cleaning

In [3]:
### PARAMS
CLEAN_BUFFER = 0 * 60 * 12 # 0h

In [4]:
UNKNOWN_BUFFER = 0

def mark_not_annotated(series, events):
    series['not_annotated'] = 0

    previous_defined_event = None
    for index, current_event in events.iterrows():
        if np.isnan(current_event['step']):
            next_events = events[index:]
            next_defined_events = next_events[next_events['step'].notna()]
            next_defined_event = next_defined_events.iloc[0] if len(next_defined_events) > 0 else None

            unknown_start_step = previous_defined_event['step'] + UNKNOWN_BUFFER if previous_defined_event is not None else 0
            unknown_end_step = next_defined_event['step'] - UNKNOWN_BUFFER if next_defined_event is not None else series.iloc[-1]['step'] + 1

            series.loc[unknown_start_step:unknown_end_step, 'not_annotated'] = 1

            continue

        previous_defined_event = current_event
    
    ## Fix if after last event are many more steps
    last_event = events.iloc[-1]
    
    if not(np.isnan(last_event['step'])):
        last_step = series.iloc[-1]
        last_event_date = datetime.strptime(last_event['timestamp'][0:10], '%Y-%m-%d').date()
        last_step_date = datetime.strptime(last_step['timestamp'][0:10], '%Y-%m-%d').date()
    
        if last_event_date < last_step_date:
            series.loc[last_event['step'] + UNKNOWN_BUFFER:, 'not_annotated'] = 1
        
    return series

In [5]:
ANGLEZ_VARIANCE_SEQUENCE_LENGTH = 6 * 60 * 12 # 6h

def mark_anglez_too_little_variance(series, threshold):
    column_name = f'too_little_variance_{threshold}'
    series[column_name] = 0

    
    last_step = series.iloc[-1]['step']       
    
    for current_start_step in range(0, len(series), ANGLEZ_VARIANCE_SEQUENCE_LENGTH):
        current_end_step = current_start_step + ANGLEZ_VARIANCE_SEQUENCE_LENGTH

        series_chunk = series[current_start_step:current_end_step]

        series_chunk_anglez = series_chunk['anglez'].abs()
        if not (series_chunk_anglez > threshold).any():
            clean_from = max(0, current_start_step - CLEAN_BUFFER)
            clean_to = min(last_step, current_end_step + CLEAN_BUFFER)
            
            series.loc[clean_from:clean_to, column_name] = 1

    return series

In [6]:
def mark_anglez_repetition(series, window_size):
    column_name = f'anglez_repetition_{window_size}'
    series[column_name] = 0
    last_step = series.iloc[-1]['step']       

    for current_start_step in range(0, len(series), window_size):
        current_end_step = current_start_step + window_size

        series_chunk = series[current_start_step:current_end_step].reset_index(drop=True)

        for comparing_start_step in range(current_end_step, len(series), window_size):
            comparing_end_step = comparing_start_step + window_size
            comparing_series_chunk = series[comparing_start_step:comparing_end_step].reset_index(drop=True)

            if series_chunk['anglez'].equals(comparing_series_chunk['anglez']):
                clean_from = max(0, current_start_step - CLEAN_BUFFER)
                clean_to = min(last_step, current_end_step + CLEAN_BUFFER)
                series.loc[clean_from:clean_to, column_name] = 1
                
                clean_from = max(0, comparing_start_step - CLEAN_BUFFER)
                clean_to = min(last_step, comparing_end_step + CLEAN_BUFFER)
                series.loc[clean_from:clean_to, column_name] = 1

    return series

In [7]:
def analyse(series_id):
    events = pd.read_csv('../../data/raw/train_events.csv')
    events = events[events.series_id == series_id].reset_index(drop=True)
    series = pd.read_parquet('../../data/raw/train_series.parquet', filters=[('series_id', '=', series_id)])
    
    series = mark_not_annotated(series, events)
    series = mark_anglez_too_little_variance(series, 50)
    series = mark_anglez_repetition(series, 8 * 60 * 12) #8h
    series = mark_anglez_repetition(series, 6 * 60 * 12) #6h
    series = mark_anglez_repetition(series, 4 * 60 * 12) #4h
    series = mark_anglez_repetition(series, 2 * 60 * 12) #2h

    return series

In [8]:
series_data = []

for series_id in tqdm(train_events.series_id.unique()):
    series = analyse(series_id)
    series_data.append(series)
    
    del series
    gc.collect()

train_series = pd.concat(series_data).reset_index(drop=True)

100%|██████████████████████████████████████████████████████████████████████████████| 277/277 [2:18:43<00:00, 30.05s/it]


## Analysis

In [9]:
# train_series = pd.read_parquet('../../data/data_quality.parquet')

In [10]:
events = pd.read_csv('../../data/raw/events.csv')
events = events[events['step'].notna()].reset_index()
events['step'] = events['step'].astype('int')
train_series = pd.merge(train_series, events[['series_id', 'step', 'event']], on=['series_id', 'step'], how='left')

In [11]:
whole_size = len(train_series)
not_annotated_size = len(train_series[train_series['not_annotated'] == 1])
too_little_variance_50_size = len(train_series[train_series['too_little_variance_50'] == 1])
anglez_repetition_8_size = len(train_series[train_series[f'anglez_repetition_{8 * 60 * 12}'] == 1])
anglez_repetition_6_size = len(train_series[train_series[f'anglez_repetition_{6 * 60 * 12}'] == 1])
anglez_repetition_4_size = len(train_series[train_series[f'anglez_repetition_{4 * 60 * 12}'] == 1])
anglez_repetition_2_size = len(train_series[train_series[f'anglez_repetition_{2 * 60 * 12}'] == 1])

In [12]:
print('Anteil an nicht annotierten Daten: ', (not_annotated_size/whole_size) * 100) 
print('Anteil an daten mit keinem Wert über oder unter 50: ', (too_little_variance_50_size/whole_size) * 100) 
print('Anteil an Daten mit Wiederholungen (8h): ', (anglez_repetition_8_size/whole_size) * 100) 
print('Anteil an Daten mit Wiederholungen (6h): ', (anglez_repetition_6_size/whole_size) * 100) 
print('Anteil an Daten mit Wiederholungen (4h): ', (anglez_repetition_4_size/whole_size) * 100) 
print('Anteil an Daten mit Wiederholungen (2h): ', (anglez_repetition_2_size/whole_size) * 100)

Anteil an nicht annotierten Daten:  40.08542956367489
Anteil an daten mit keinem Wert über oder unter 50:  14.710157398797028
Anteil an Daten mit Wiederholungen (8h):  22.802553789346376
Anteil an Daten mit Wiederholungen (6h):  24.212959901783826
Anteil an Daten mit Wiederholungen (4h):  25.233880078164017
Anteil an Daten mit Wiederholungen (2h):  26.692625205222754


In [13]:
not_annotated_and_too_little_variance_50_size = len(train_series[(train_series['too_little_variance_50'] == 1) & (train_series['not_annotated'] == 1)])
not_annotated_and_anglez_repetition_8_size = len(train_series[(train_series[f'anglez_repetition_{8 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 1) ])
not_annotated_and_anglez_repetition_6_size = len(train_series[(train_series[f'anglez_repetition_{6 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 1)])
not_annotated_and_anglez_repetition_4_size = len(train_series[(train_series[f'anglez_repetition_{4 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 1)])
not_annotated_and_anglez_repetition_2_size = len(train_series[(train_series[f'anglez_repetition_{2 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 1)])

annotated_unknown_and_too_little_variance_50_size = len(train_series[(train_series['too_little_variance_50'] == 1) & (train_series['not_annotated'] == 0)])
annotated_unknown_and_anglez_repetition_8_size = len(train_series[(train_series[f'anglez_repetition_{8 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 0)])
annotated_unknown_and_anglez_repetition_6_size = len(train_series[(train_series[f'anglez_repetition_{6 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 0)])
annotated_unknown_and_anglez_repetition_4_size = len(train_series[(train_series[f'anglez_repetition_{4 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 0)])
annotated_unknown_and_anglez_repetition_2_size = len(train_series[(train_series[f'anglez_repetition_{2 * 60 * 12}'] == 1) & (train_series['not_annotated'] == 0)])

In [14]:
print('Nicht annotiert')
print('Anteil an daten mit keinem Wert über oder unter 50: ', (not_annotated_and_too_little_variance_50_size/too_little_variance_50_size) * 100) 
print('Anteil an Daten mit Wiederholungen (8h): ', (not_annotated_and_anglez_repetition_8_size/anglez_repetition_8_size) * 100) 
print('Anteil an Daten mit Wiederholungen (6h): ', (not_annotated_and_anglez_repetition_6_size/anglez_repetition_6_size) * 100) 
print('Anteil an Daten mit Wiederholungen (4h): ', (not_annotated_and_anglez_repetition_4_size/anglez_repetition_4_size) * 100) 
print('Anteil an Daten mit Wiederholungen (2h): ', (not_annotated_and_anglez_repetition_2_size/anglez_repetition_2_size) * 100)

Nicht annotiert
Anteil an daten mit keinem Wert über oder unter 50:  99.02629537007067
Anteil an Daten mit Wiederholungen (8h):  99.5073047560906
Anteil an Daten mit Wiederholungen (6h):  99.31252815562863
Anteil an Daten mit Wiederholungen (4h):  98.83097307158876
Anteil an Daten mit Wiederholungen (2h):  98.14292691866714


In [15]:
print('Annotiert')
print('Anteil an daten mit keinem Wert über oder unter 50: ', (annotated_unknown_and_too_little_variance_50_size/too_little_variance_50_size) * 100) 
print('Anteil an Daten mit Wiederholungen (8h): ', (annotated_unknown_and_anglez_repetition_8_size/anglez_repetition_8_size) * 100) 
print('Anteil an Daten mit Wiederholungen (6h): ', (annotated_unknown_and_anglez_repetition_6_size/anglez_repetition_6_size) * 100) 
print('Anteil an Daten mit Wiederholungen (4h): ', (annotated_unknown_and_anglez_repetition_4_size/anglez_repetition_4_size) * 100) 
print('Anteil an Daten mit Wiederholungen (2h): ', (annotated_unknown_and_anglez_repetition_2_size/anglez_repetition_2_size) * 100)

Annotiert
Anteil an daten mit keinem Wert über oder unter 50:  0.9737046299293326
Anteil an Daten mit Wiederholungen (8h):  0.492695243909407
Anteil an Daten mit Wiederholungen (6h):  0.6874718443713727
Anteil an Daten mit Wiederholungen (4h):  1.1690269284112476
Anteil an Daten mit Wiederholungen (2h):  1.8570730813328569


## Daten speichern für weitere Analysen

In [17]:
train_series.to_parquet('../../data/data_quality.parquet')