Python Imports

In [None]:
%pip install --upgrade pip
%pip install pandas
%pip install numpy
%pip install plotly
%pip install nbformat

In [85]:
import pandas as pd
import numpy as np
from itertools import groupby
from operator import itemgetter
import plotly.graph_objs as go
import plotly.express as px

In [None]:
# pandas display settings: lift the row/column/width limits so frames print untruncated
_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 0,
    'display.max_colwidth': None,
    'display.expand_frame_repr': False,
}
for _option, _value in _display_options.items():
    pd.set_option(_option, _value)

# Names of the sensor columns handled in this notebook
sensors = ["dig1", "ana1", "noise"]

# Read the raw dataset (lines starting with '#' are skipped as comments)
# and use the 'Index' column as the row index
df = pd.read_csv('1datasetCompressed.csv', comment='#', low_memory=False).set_index('Index')

# Preview the first rows of the loaded frame
print(df.head())

Remove impossible Values

In [87]:
# Accepted (lower, upper) bounds per sensor column; readings outside them become NaN
VALID_RANGES = {
    'dig1': (10, 50),
    'ana1': (10, 50),
    'noise': (0, 100),
}

for sensor_name, (lower, upper) in VALID_RANGES.items():
    # NaN compares False on both sides, so values that are already missing stay untouched
    out_of_range = (df[sensor_name] < lower) | (df[sensor_name] > upper)
    df.loc[out_of_range, sensor_name] = np.nan

Search the DataFrame for rows where some sensor values are present while others are missing at the same index

In [88]:
# Boolean frame flagging every NaN cell
missing_values = df.isna()
# Rows in which at least one column is NaN, but not all of them
some_missing = ~missing_values.all(axis=1) & missing_values.any(axis=1)

# Select the partially missing rows
filtered_df = df.loc[some_missing]

# Index labels of all rows that contain at least one NaN
missing_indices = df.index[missing_values.any(axis=1)]


def has_missing_neighbors(index):
    """Return True if the label directly before or after `index` belongs to a row with a NaN."""
    return (index - 1) in missing_indices or (index + 1) in missing_indices


# Keep only the rows whose direct neighbours (index - 1 / index + 1) contain no NaN
filtered_df = filtered_df[~filtered_df.index.map(has_missing_neighbors)]

Fill in NaN values using linear interpolation for gaps of fewer than 30 data points

In [89]:
def replace_nan_with_interpolation(df, max_gap=30):
    """Linearly interpolate short NaN runs in every column except 'Time'.

    Only runs of consecutive NaN values whose length is at most `max_gap`
    are filled. Longer runs are left completely untouched, so they can still
    be detected afterwards. The previous implementation passed
    `limit=max_gap` together with `limit_direction='both'`, which filled
    `max_gap` points from each edge of a long gap and filled further on every
    repeated call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place (columns are reassigned).
    max_gap : int, default 30
        Maximum length of a NaN run that is filled.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for convenient reassignment.
    """
    for col in df.columns:
        if col == 'Time':
            continue

        series = df[col]
        is_nan = series.isna()
        if not is_nan.any():
            continue

        # Label each run of consecutive equal NaN/non-NaN flags with its own id
        run_id = is_nan.astype(int).diff().ne(0).cumsum()
        # Length of the run each position belongs to
        run_len = is_nan.groupby(run_id.to_numpy()).transform('size')
        fillable = is_nan & (run_len <= max_gap)

        # Interpolate everything, then keep the result only inside short runs
        interpolated = series.interpolate(method='linear', limit_direction='both')
        df[col] = series.where(~fillable, interpolated)
    return df
# Interpolate the NaN values in every column except 'Time' (gap limit: 30)
df = replace_nan_with_interpolation(df, max_gap=30)

# Round the three sensor columns to two decimals after interpolation
columns_to_round = ['ana1', 'dig1', 'noise']
df[columns_to_round] = df[columns_to_round].round(2)

Fill in missing indices using linear interpolation for gaps of fewer than 30 data points

In [90]:
def find_missing_index_ranges(df):
    """Locate runs of consecutive integer labels that are absent from `df.index`.

    The expected labels are every integer from the smallest to the largest
    index label. Each run of absent labels is reported as one row.

    Returns
    -------
    pandas.DataFrame
        Columns 'from_index' and 'to_index' (both inclusive); empty when
        no label is missing.
    """
    expected = pd.RangeIndex(start=df.index.min(), stop=df.index.max() + 1)
    absent = np.asarray(expected.difference(df.index))

    if absent.size == 0:
        return pd.DataFrame([], columns=['from_index', 'to_index'])

    # A new run starts wherever two successive absent labels are not adjacent
    run_starts = np.flatnonzero(np.diff(absent) != 1) + 1
    runs = np.split(absent, run_starts)

    return pd.DataFrame(
        [{'from_index': run[0], 'to_index': run[-1]} for run in runs]
    )

def fill_missing_indices(df, max_chain_length=30):
    """Insert rows for short runs of missing index labels using linear interpolation.

    For every run of missing labels shorter than `max_chain_length`, each
    column except 'Time' is filled with values spaced evenly between the
    value just before and the value just after the run. The previous
    implementation wrote the flat average of the two neighbours into every
    inserted row, which is not a linear interpolation for runs longer than
    one row. Columns whose neighbouring value is NaN on either side are not
    filled for that run. 'Time' is never filled, so inserted rows carry NaN
    there.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an integer index. Rows are added in place.
    max_chain_length : int, default 30
        Runs with this many missing labels or more are skipped.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, sorted by index.
    """
    missing_ranges = find_missing_index_ranges(df)
    value_columns = [col for col in df.columns if col != 'Time']

    # Inserted labels always lie strictly between these bounds, so compute them once
    first_label = df.index.min()
    last_label = df.index.max()

    for gap in missing_ranges.itertuples(index=False):
        from_index = gap.from_index
        to_index = gap.to_index
        chain_length = to_index - from_index + 1

        if chain_length >= max_chain_length:
            continue
        # Both neighbours must exist to interpolate between them
        if not (from_index > first_label and to_index < last_label):
            continue

        for col in value_columns:
            prev_val = df.at[from_index - 1, col]
            next_val = df.at[to_index + 1, col]
            if pd.isna(prev_val) or pd.isna(next_val):
                continue

            # chain_length missing points split the span into chain_length + 1 equal steps
            step = (next_val - prev_val) / (chain_length + 1)
            for offset, idx in enumerate(range(from_index, to_index + 1), start=1):
                df.at[idx, col] = prev_val + step * offset

    df.sort_index(inplace=True)
    return df

# Insert rows for missing index labels (default maximum chain length of 30)
df = fill_missing_indices(df)
# Round the three sensor columns to two decimals after filling
columns_to_round = ['ana1', 'dig1', 'noise']
df[columns_to_round] = df[columns_to_round].round(2)

Find and interpolate outliers

In [91]:
def find_outliers_z_score(df, columns, threshold=3):
    """Collect, per column, the rows whose z-score lies beyond `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Columns to examine.
    threshold : float, default 3
        Rows with z-score above `threshold` or below `-threshold` are reported.

    Returns
    -------
    dict
        Maps each column name to the sub-frame of `df` with its outlier rows.
    """
    def _rows_beyond_threshold(name):
        values = df[name]
        z = (values - values.mean()) / values.std()
        return df[z.abs() > threshold]

    return {name: _rows_beyond_threshold(name) for name in columns}

def replace_outliers_with_NaN(df, outliers, max_gap=30):
    """Blank out the given outliers and interpolate the resulting gaps.

    All outlier cells are first set to NaN, then the frame is interpolated
    once. The previous implementation re-interpolated the whole frame inside
    the loop, once per column, which repeated work and applied the
    interpolation limit several times.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    outliers : dict
        Maps a column name to a frame whose index lists the outlier rows
        of that column.
    max_gap : int, default 30
        Passed on to `replace_nan_with_interpolation`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    for column, outliers_df in outliers.items():
        df.loc[outliers_df.index, column] = np.nan

    # Interpolate once, and only when outlier columns were actually processed
    if outliers:
        replace_nan_with_interpolation(df, max_gap=max_gap)
    return df

# Only 'ana1' and 'dig1' are screened for outliers; 'noise' is not checked
columns_to_check = ['ana1', 'dig1']
outliers = find_outliers_z_score(df, columns_to_check)

# Set the detected outliers to NaN and interpolate over them
df = replace_outliers_with_NaN(df, outliers)

# Round the three sensor columns to two decimals after interpolation
columns_to_round = ['ana1', 'dig1', 'noise']
df[columns_to_round] = df[columns_to_round].round(2)

Check if the Data is actually clean by

1. Checking if there are values outside a possible scope

In [None]:
# Flag readings outside the accepted bounds of each sensor (NaN is never flagged)
mask_dig1 = df['dig1'].lt(10) | df['dig1'].gt(50)
mask_ana1 = df['ana1'].lt(10) | df['ana1'].gt(50)
mask_noise = df['noise'].lt(0) | df['noise'].gt(100)

# A row is reported as soon as any one sensor is out of bounds
combined_mask = mask_dig1 | mask_ana1 | mask_noise
filtered_df = df.loc[combined_mask]

print(filtered_df)

2. Checking for ranges of NaN values

In [None]:
# Report runs of consecutive NaN values per column
def find_nan_ranges(df):
    """List every run of consecutive-index NaN values for each column except 'Time'.

    Two NaN rows belong to the same run when their index labels differ by
    exactly one.

    Returns
    -------
    pandas.DataFrame
        One row per run with 'from-index', 'to-index' (both inclusive) and
        'affectedSensor' (the column name). Empty when no NaN is found.
    """
    records = []
    for sensor in (name for name in df.columns if name != 'Time'):
        nan_labels = df.index[df[sensor].isna()]
        if not len(nan_labels):
            continue

        run_start = previous = nan_labels[0]
        for label in nan_labels[1:]:
            if label != previous + 1:
                # Gap between labels: close the current run and open a new one
                records.append({'from-index': run_start, 'to-index': previous, 'affectedSensor': sensor})
                run_start = label
            previous = label
        records.append({'from-index': run_start, 'to-index': previous, 'affectedSensor': sensor})

    return pd.DataFrame(records)

# Collect the remaining NaN runs of every column except 'Time'
nan_ranges_df = find_nan_ranges(df)

# Print the NaN runs that are still present
print(nan_ranges_df)

3. Checking for ranges of missing indices

In [None]:
def find_missing_index_ranges(df):
    """Return the runs of integer labels missing between the first and last index label.

    Returns
    -------
    pandas.DataFrame
        Columns 'from_index' and 'to_index' (both inclusive), one row per
        run of consecutive missing labels; empty when nothing is missing.
    """
    full_span = pd.RangeIndex(start=df.index.min(), stop=df.index.max() + 1)
    gaps = full_span.difference(df.index)

    ranges = []
    if not len(gaps):
        return pd.DataFrame(ranges, columns=['from_index', 'to_index'])

    run_start = previous = gaps[0]
    for current in gaps[1:]:
        if current != previous + 1:
            # Non-adjacent label: the running range ends at the previous label
            ranges.append({'from_index': run_start, 'to_index': previous})
            run_start = current
        previous = current
    ranges.append({'from_index': run_start, 'to_index': previous})

    return pd.DataFrame(ranges)

# Collect the runs of index labels that are still missing from the frame
missing_index_ranges_df = find_missing_index_ranges(df)

# Print the remaining missing index ranges
print(missing_index_ranges_df)

Check for outliers

In [None]:
def find_outliers_z_score(df, column, threshold=3.1):
    """Return the rows of `df` whose z-score in `column` lies beyond `threshold`.

    Note: this single-column variant reuses the name of the earlier
    list-based helper and replaces it once this cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Column to examine.
    threshold : float, default 3.1
        Rows with z-score above `threshold` or below `-threshold` are returned.
    """
    series = df[column]
    deviation = series - series.mean()
    z_scores = deviation / series.std()
    is_outlier = z_scores.lt(-threshold) | z_scores.gt(threshold)
    return df.loc[is_outlier]

# Re-run the z-score check (default threshold 3.1) on the two screened sensor columns
ana1_outliers = find_outliers_z_score(df, 'ana1')
dig1_outliers = find_outliers_z_score(df, 'dig1')

def check_no_outliers(outliers, column_name):
    """Print whether `outliers` is empty; if not, print the outlier rows for `column_name`."""
    if not outliers.empty:
        print(f"Outliers detected in '{column_name}':")
        print(outliers)
        return
    print(f"No outliers detected in '{column_name}'.")

# Print the outlier check result for 'ana1' and 'dig1'
check_no_outliers(ana1_outliers, 'ana1')
check_no_outliers(dig1_outliers, 'dig1')

Plot and save the cleaned DataFrame

In [None]:
# Round the three sensor columns to two decimals before plotting
columns_to_round = ['ana1', 'dig1', 'noise']
df[columns_to_round] = df[columns_to_round].round(2)

# Plot a random subset of at most 100000 rows.
# - The sample size is capped at len(df) because DataFrame.sample raises when n
#   exceeds the number of rows (sampling is without replacement).
# - A fixed random_state makes the plotted subset reproducible across runs.
downsampled_df = df.sample(n=min(100000, len(df)), random_state=42).sort_values(by=['Time'])

# Reshape to long format: one row per (Time, Sensor) pair with its Value
melted_df = downsampled_df.melt(id_vars=['Time'], value_vars=['ana1', 'dig1', 'noise'], var_name='Sensor', value_name='Value')

# One line per sensor over time
fig = px.line(melted_df, x='Time', y='Value', color='Sensor', title='Sensor Data Over Time')

# Axis titles and a shared hover label for all sensors at the same x position
fig.update_layout(
    xaxis_title='Time',
    yaxis_title='Value',
    hovermode='x unified'
)

# Show the figure
fig.show()

In [97]:
# Write the cleaned DataFrame (including its index) to '2datasetCleaned.csv'
df.to_csv('2datasetCleaned.csv')