# Data Cleaning

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import gc

## Daten laden

In [2]:
# Load the events table from the raw data directory.
train_events = pd.read_csv('../../data/raw/events.csv')

In [3]:
# Show (rows, columns) of the loaded events table.
print(train_events.shape)

(14508, 6)


## Wiederholungen aus Serien entfernen

In [4]:
sequence_length = 24 * 60 * 12 # 17280 Steps = 1 Day
plot_series = False  # Set to True to plot anglez before and after cleaning each series


def remove_duplicates(series_id):
    """Load one series and drop day-long windows that repeat the previous day.

    The rows for ``series_id`` are read from ``../../data/raw/series.parquet``
    and scanned in consecutive windows of ``sequence_length`` rows. A window
    whose ``anglez`` values are identical to those of the window directly
    before it is removed. A trailing partial window is removed as well when it
    matches the beginning of the last full window.

    Parameters
    ----------
    series_id
        Value matched against the ``series_id`` column of the parquet file.

    Returns
    -------
    pandas.DataFrame
        The series with the repeated windows dropped. Remaining rows keep
        their original index labels. An empty series is returned unchanged.
    """
    series_to_clean = pd.read_parquet('../../data/raw/series.parquet', filters=[('series_id','=',series_id)])

    # With no rows every window is empty and compares equal, so the scan
    # below would never reach its break condition.
    if len(series_to_clean) == 0:
        return series_to_clean

    if plot_series:
        fig, ax = plt.subplots(figsize=(20, 3))
        # Plot the frame that was just loaded, not a global from another cell.
        sns.lineplot(data=series_to_clean, x="step", y="anglez", linewidth=0.5, ax=ax)
        plt.show()

    # Extract the compared column once; positions in it match row positions
    # in series_to_clean.
    anglez = series_to_clean['anglez'].reset_index(drop=True)
    total_rows = len(anglez)

    indices_to_remove = []
    window_idx = 0

    while True:
        current_start = window_idx * sequence_length
        next_start = current_start + sequence_length
        next_end = next_start + sequence_length

        # Current window (s1) and the window right after it (s2); indexes are
        # reset because Series.equals also compares the index.
        s1 = anglez.iloc[current_start:next_start].reset_index(drop=True)
        s2 = anglez.iloc[next_start:next_end].reset_index(drop=True)

        # Differing lengths mean s2 is the (possibly empty) tail of the series.
        if len(s1) != len(s2):
            # Drop the tail too if it repeats the start of the last window.
            if len(s2) > 0 and s1.iloc[:len(s2)].equals(s2):
                indices_to_remove.append((total_rows - len(s2), total_rows))
            break

        # Identical consecutive windows: mark the later one for removal.
        if s1.equals(s2):
            indices_to_remove.append((next_start, next_end))

        window_idx += 1

    cleaned_df = series_to_clean

    # Drop from the back so earlier positional ranges stay valid.
    for start_idx, end_idx in reversed(indices_to_remove):
        cleaned_df = cleaned_df.drop(index=cleaned_df.iloc[start_idx:end_idx].index)

    if plot_series:
        fig, ax = plt.subplots(figsize=(20, 3))
        sns.lineplot(data=cleaned_df, x="step", y="anglez", linewidth=0.5, ax=ax)
        plt.show()

    return cleaned_df

In [5]:
%%time

train_data = []

total_len = train_events.series_id.nunique()

for i, series_id in enumerate(train_events.series_id.unique()):
    print(f'Step {i+1} of {total_len}')
    train = remove_duplicates(series_id)
    train_data.append(train)
    del train
    gc.collect()

train_series = pd.concat(train_data).reset_index(drop=True)

Step 1 of 277
Step 2 of 277
Step 3 of 277
Step 4 of 277
Step 5 of 277
Step 6 of 277
Step 7 of 277
Step 8 of 277
Step 9 of 277
Step 10 of 277
Step 11 of 277
Step 12 of 277
Step 13 of 277
Step 14 of 277
Step 15 of 277
Step 16 of 277
Step 17 of 277
Step 18 of 277
Step 19 of 277
Step 20 of 277
Step 21 of 277
Step 22 of 277
Step 23 of 277
Step 24 of 277
Step 25 of 277
Step 26 of 277
Step 27 of 277
Step 28 of 277
Step 29 of 277
Step 30 of 277
Step 31 of 277
Step 32 of 277
Step 33 of 277
Step 34 of 277
Step 35 of 277
Step 36 of 277
Step 37 of 277
Step 38 of 277
Step 39 of 277
Step 40 of 277
Step 41 of 277
Step 42 of 277
Step 43 of 277
Step 44 of 277
Step 45 of 277
Step 46 of 277
Step 47 of 277
Step 48 of 277
Step 49 of 277
Step 50 of 277
Step 51 of 277
Step 52 of 277
Step 53 of 277
Step 54 of 277
Step 55 of 277
Step 56 of 277
Step 57 of 277
Step 58 of 277
Step 59 of 277
Step 60 of 277
Step 61 of 277
Step 62 of 277
Step 63 of 277
Step 64 of 277
Step 65 of 277
Step 66 of 277
Step 67 of 277
Step

In [6]:
# Show (rows, columns) of the concatenated, de-duplicated series table.
print(train_series.shape)

(109178100, 6)


## Serien ohne Events entfernen

In [7]:
# Ids of all series present in the de-duplicated sensor data.
series_id_list = train_series['series_id'].unique().tolist()

# Ids that still have at least one event row after rows containing any
# missing value are dropped.
events_series_id_list = train_events.dropna()['series_id'].unique().tolist()

# Sorted so the list and the printed output are identical on every run;
# plain set iteration order for strings changes with hash randomization.
series_without_events = sorted(set(series_id_list) - set(events_series_id_list))

print('Serien ohne Events: \n', series_without_events)

Serien ohne Events: 
 ['390b487231ce', 'e11b9d69f856', '89c7daa72eee', '2fc653ca75c7', 'a3e59c2ce3f6', '0f9e60a8e56d', 'c7b1283bb7eb', 'c5d08fc3e040']


In [8]:
# Keep only rows whose series id is not in the no-events list, for both tables.
cleaned_train_series = train_series.loc[~train_series['series_id'].isin(series_without_events)]
cleaned_train_events = train_events.loc[~train_events['series_id'].isin(series_without_events)]

## Gesäuberte Daten speichern

In [9]:
# Show (rows, columns) of both filtered tables before they are written out.
print(cleaned_train_series.shape)
print(cleaned_train_events.shape)

(107115120, 6)
(14154, 6)


In [10]:
# Persist the filtered tables to the cleaned data directory.
# NOTE(review): cleaned_train_series is a boolean-mask selection, so its index
# is presumably no longer a contiguous range and to_parquet (default index
# handling) would store it alongside the data, while the CSV is written
# without its index — confirm this difference is intended.
cleaned_train_series.to_parquet('../../data/cleaned/series.parquet')
cleaned_train_events.to_csv('../../data/cleaned/events.csv', index=False)