In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datetime import timedelta

In [14]:
# Load the data
asset_number = 16
data_path = './by_asset/C/assets/%s.csv' % asset_number
events_path = './by_asset/C/assets/%s_events.csv' % asset_number
feature_description_path = './CARE_To_Compare/Wind Farm C/feature_description.csv'

# Read the sensor data
data = pd.read_csv(data_path, sep=';')

# Read the event data
events = pd.read_csv(events_path, sep=';')

# Read the feature description data
feature_description = pd.read_csv(feature_description_path, sep=';')

  data = pd.read_csv(data_path, sep=';')


In [16]:
# Ensure feature_description contains the required columns
required_columns = {'sensor_name', 'description', 'unit'}
if not required_columns.issubset(feature_description.columns):
    raise ValueError(f"The feature description file must contain the columns: {required_columns}")

# Convert time columns to datetime for proper comparison
data['time_stamp'] = pd.to_datetime(data['time_stamp'])
events['event_start'] = pd.to_datetime(events['event_start'])
events['event_end'] = pd.to_datetime(events['event_end'])

# Initialize the event column in data
data['event'] = 'other'

# Assign event labels to the data based on the event time ranges
for _, event in events.iterrows():
    mask = (data['time_stamp'] >= event['event_start']) & (data['time_stamp'] <= event['event_end'])
    data.loc[mask, 'event'] = event['event_label']

# Define shorthand to longhand mapping
shorthand_to_longhand = {
    '_avg': 'average',
    '_max': 'maximum',
    '_min': 'minimum',
    '_std': 'std_dev'
}

# Create a mapping of sensor names to their descriptions and units
sensor_mapping = {}
for _, row in feature_description.iterrows():
    base_sensor = row['sensor_name']
    description = row['description']
    unit = row['unit']
    for shorthand, longhand in shorthand_to_longhand.items():
        full_sensor_name = f"{base_sensor}{shorthand}"
        full_description = f"{base_sensor}: {description} [{unit}] ({longhand})"
        sensor_mapping[full_sensor_name] = full_description

# Filter the sensor columns and map their names to descriptions
sensor_columns = [col for col in data.columns if col.startswith('sensor_') or col.startswith('power_') or col.startswith('wind_speed_')]
sensor_options = {sensor_mapping[sensor]: sensor for sensor in sensor_columns if sensor in sensor_mapping}

# Ensure the dropdown is not empty
if not sensor_options:
    raise ValueError("No matching sensors found between the data and the feature description file.")

# Function to plot selected sensor data with event-based coloring
def plot_sensor(sensor_id):
    if sensor_id not in data.columns:
        raise ValueError(f"Sensor '{sensor_id}' not found in the data.")
    
    plt.figure(figsize=(12, 6))
    
    # Filter data based on event type
    anomaly = data[data['event'] == 'anomaly']
    normal = data[data['event'] == 'normal']
    other = data[data['event'] == 'other']
    
    # Plot data with different colors, ensuring red and green are on top
    plt.plot(other['time_stamp'], other[sensor_id], 'b.', label='Other', alpha=0.5)
    plt.plot(normal['time_stamp'], normal[sensor_id], 'g.', label='Normal', alpha=0.8)
    plt.plot(anomaly['time_stamp'], anomaly[sensor_id], 'r.', label='Anomaly', alpha=0.8)
    
    plt.title(f'Plot of {sensor_mapping[sensor_id]}')
    plt.xlabel('Time Stamp')
    plt.ylabel(sensor_mapping[sensor_id])
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid()
    plt.show()

# Create a dropdown for sensor selection
interact(plot_sensor, sensor_id=Dropdown(options=sensor_options, description='Select Sensor:'))

interactive(children=(Dropdown(description='Select Sensor:', options={'sensor_0: ABB-LS Input K1, IL1 [A] (ave…

<function __main__.plot_sensor(sensor_id)>

In [25]:
# Configurable variables
sequence_length = 30  # Number of consecutive datapoints
train_event_ids = [79, 46]  # Event IDs for training
test_event_ids = [30, 65]  # Event IDs for testing
time_step = timedelta(minutes=10)  # Expected time difference between consecutive rows

# Load the data
asset_number = 16
data_path = './by_asset/C/assets/%s.csv' % asset_number
events_path = './by_asset/C/assets/%s_events.csv' % asset_number

data = pd.read_csv(data_path, sep=';')
events = pd.read_csv(events_path, sep=';')

# Convert time columns to datetime
data['time_stamp'] = pd.to_datetime(data['time_stamp'])
events['event_start'] = pd.to_datetime(events['event_start'])
events['event_end'] = pd.to_datetime(events['event_end'])

# Assign event labels to the data
data['event'] = 'other'
for _, event in events.iterrows():
    mask = (data['time_stamp'] >= event['event_start']) & (data['time_stamp'] <= event['event_end'])
    data.loc[mask, 'event'] = event['event_label']

# Map event labels to binary classes
data['event_class'] = data['event'].apply(lambda x: 1 if x == 'anomaly' else 0)


# Function to prepare data for training/testing based on event IDs
def prepare_data(data, events, event_ids, sequence_length, time_step):
    filtered_data = pd.DataFrame()
    for event_id in event_ids:
        event = events[events['event_id'] == event_id]
        for _, row in event.iterrows():
            event_data = data[(data['time_stamp'] >= row['event_start']) & (data['time_stamp'] <= row['event_end'])]
            filtered_data = pd.concat([filtered_data, event_data])
    
    # Sort by timestamp to ensure proper sequencing
    filtered_data = filtered_data.sort_values(by='time_stamp')
    return create_sequences(filtered_data, sequence_length, time_step)


# Function to create sequences of 30 consecutive datapoints
def create_sequences(data, sequence_length, time_step):
    sequences = []
    labels = []
    timestamps = []
    
    # Identify gaps in time
    data['time_diff'] = data['time_stamp'].diff().fillna(pd.Timedelta(seconds=0))
    data['is_continuous'] = data['time_diff'] <= time_step
    
    # Reset the index whenever there is a gap
    data['group'] = (~data['is_continuous']).cumsum()
    
    # Select only numeric columns for the model
    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ['event_class', 'time_diff', 'is_continuous', 'group']]
    
    # Process each group separately
    for _, group_data in data.groupby('group'):
        group_data = group_data.reset_index(drop=True)
        for i in range(len(group_data) - sequence_length + 1):
            sequence = group_data.iloc[i:i + sequence_length]
            sequences.append(sequence[numeric_columns].values.flatten())  # Use only numeric columns
            labels.append(sequence['event_class'].mean())
            timestamps.append(sequence['time_stamp'].iloc[0])  # Keep track of the start time of the sequence
    
    return np.array(sequences), np.array(labels), timestamps

# Prepare training and testing data
X_train, y_train, train_timestamps = prepare_data(data, events, train_event_ids, sequence_length, time_step)
X_test, y_test, test_timestamps = prepare_data(data, events, test_event_ids, sequence_length, time_step)

# Train the XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Example output of timestamps for debugging
print("Train timestamps:", train_timestamps[:5])
print("Test timestamps:", test_timestamps[:5])

  data = pd.read_csv(data_path, sep=';')
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

         0.0       0.37      0.88      0.52      2033
         1.0       0.39      0.05      0.09      3234

    accuracy                           0.37      5267
   macro avg       0.38      0.46      0.30      5267
weighted avg       0.38      0.37      0.25      5267

Train timestamps: [Timestamp('2024-05-09 06:50:00'), Timestamp('2024-05-09 07:00:00'), Timestamp('2024-05-09 07:10:00'), Timestamp('2024-05-09 07:20:00'), Timestamp('2024-05-09 07:30:00')]
Test timestamps: [Timestamp('2019-11-06 04:20:00'), Timestamp('2019-11-06 04:30:00'), Timestamp('2019-11-06 04:40:00'), Timestamp('2019-11-06 04:50:00'), Timestamp('2019-11-06 05:00:00')]


In [None]:
#%pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl (914 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m914.9/914.9 kB[0m [31m20.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m28.3 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [ipywidgets]3[0m [ipywidgets]
[1A[2KSuccessfully installed ipywidgets