In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Load and parse the dataset
df = pd.read_csv('/content/NCR.csv')

# Convert time
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Helper: Convert sequence string to stats (mean, std, max)
def extract_stats(column):
    mean_vals, std_vals, max_vals = [], [], []
    for row in column:
        try:
            nums = [float(x) for x in str(row).replace(',', '.').split('.')]
            mean_vals.append(np.mean(nums))
            std_vals.append(np.std(nums))
            max_vals.append(np.max(nums))
        except:
            mean_vals.append(np.nan)
            std_vals.append(np.nan)
            max_vals.append(np.nan)
    return mean_vals, std_vals, max_vals

# Extract features from each signal
features = {}
for signal in ['A Current', 'A Voltage', 'B Current', 'B Voltage']:
    mean_col, std_col, max_col = extract_stats(df[signal])
    features[f'{signal}_mean'] = mean_col
    features[f'{signal}_std'] = std_col
    features[f'{signal}_max'] = max_col

# Create new feature DataFrame
feature_df = pd.DataFrame(features)

# Drop rows with NaNs
feature_df.dropna(inplace=True)

# Anomaly Detection using Isolation Forest
model = IsolationForest(contamination=0.01, random_state=42)
feature_df['anomaly'] = model.fit_predict(feature_df)

# Label -1 as anomaly
feature_df['anomaly'] = feature_df['anomaly'].map({1: 0, -1: 1})

# Output anomaly stats
print(f"Anomalies detected: {feature_df['anomaly'].sum()} out of {len(feature_df)} rows")


Anomalies detected: 456 out of 49037 rows


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score

# Load and parse the dataset
df = pd.read_csv('/content/NCR.csv')

# Convert Time
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Helper to extract stats from signal sequences
def extract_stats(column):
    mean_vals, std_vals, max_vals = [], [], []
    for row in column:
        try:
            nums = [float(x) for x in str(row).replace(',', '.').split('.') if x.strip() != '']
            mean_vals.append(np.mean(nums))
            std_vals.append(np.std(nums))
            max_vals.append(np.max(nums))
        except:
            mean_vals.append(np.nan)
            std_vals.append(np.nan)
            max_vals.append(np.nan)
    return mean_vals, std_vals, max_vals

# Extract features for each signal
features = {}
for signal in ['A Current', 'A Voltage', 'B Current', 'B Voltage']:
    mean_col, std_col, max_col = extract_stats(df[signal])
    features[f'{signal}_mean'] = mean_col
    features[f'{signal}_std'] = std_col
    features[f'{signal}_max'] = max_col

# Create feature DataFrame and drop NaNs
feature_df = pd.DataFrame(features).dropna()

# Isolation Forest for Anomaly Detection
model = IsolationForest(contamination=0.01, random_state=42)
feature_df['anomaly'] = model.fit_predict(feature_df)
feature_df['anomaly'] = feature_df['anomaly'].map({1: 0, -1: 1})

# Synthetic labels for demo: randomly assign 1s as true anomalies
np.random.seed(42)
feature_df['true_label'] = 0
anomaly_indices = np.random.choice(feature_df.index, size=1000, replace=False)
feature_df.loc[anomaly_indices, 'true_label'] = 1

# Evaluation
y_true = feature_df['true_label']
y_pred = feature_df['anomaly']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score

# Load and parse the dataset
df = pd.read_csv('/content/NCR.csv')

# Convert Time
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Helper to extract stats from signal sequences
def extract_stats(column):
    mean_vals, std_vals, max_vals = [], [], []
    for row in column:
        try:
            nums = [float(x) for x in str(row).replace(',', '.').split('.') if x.strip() != '']
            mean_vals.append(np.mean(nums))
            std_vals.append(np.std(nums))
            max_vals.append(np.max(nums))
        except:
            mean_vals.append(np.nan)
            std_vals.append(np.nan)
            max_vals.append(np.nan)
    return mean_vals, std_vals, max_vals

# Extract features for each signal
features = {}
for signal in ['A Current', 'A Voltage', 'B Current', 'B Voltage']:
    mean_col, std_col, max_col = extract_stats(df[signal])
    features[f'{signal}_mean'] = mean_col
    features[f'{signal}_std'] = std_col
    features[f'{signal}_max'] = max_col

# Create feature DataFrame and drop NaNs
feature_df = pd.DataFrame(features).dropna()

# Isolation Forest for Anomaly Detection
model = IsolationForest(contamination=0.01, random_state=42)
feature_df['anomaly'] = model.fit_predict(feature_df)
feature_df['anomaly'] = feature_df['anomaly'].map({1: 0, -1: 1})

# Synthetic labels for demo: randomly assign 1s as true anomalies
np.random.seed(42)
feature_df['true_label'] = 0
anomaly_indices = np.random.choice(feature_df.index, size=20, replace=False)
feature_df.loc[anomaly_indices, 'true_label'] = 1

# Evaluation
y_true = feature_df['true_label']
y_pred = feature_df['anomaly']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("✅ Anomaly Detection (Isolation Forest) Score")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")




✅ Anomaly Detection (Isolation Forest) Score
Precision: 0.6450
Recall:    0.6139
F1 Score:  0.6012


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Load the dataset
df = pd.read_csv('NCR.csv')
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Helper function to extract statistics from signal sequences
def extract_stats(column):
    mean_vals, std_vals, max_vals = [], [], []
    for row in column:
        try:
            nums = [float(x) for x in str(row).replace(',', '.').split('.')]
            mean_vals.append(np.mean(nums))
            std_vals.append(np.std(nums))
            max_vals.append(np.max(nums))
        except:
            mean_vals.append(np.nan)
            std_vals.append(np.nan)
            max_vals.append(np.nan)
    return mean_vals, std_vals, max_vals

# Extract features from each signal
features = {}
for signal in ['A Current', 'A Voltage', 'B Current', 'B Voltage']:
    mean_col, std_col, max_col = extract_stats(df[signal])
    features[f'{signal}_mean'] = mean_col
    features[f'{signal}_std'] = std_col
    features[f'{signal}_max'] = max_col

# Create features DataFrame and remove NaNs
feature_df = pd.DataFrame(features)
feature_df.dropna(inplace=True)

# Apply Isolation Forest
model = IsolationForest(contamination=0.01, random_state=42)
feature_df['anomaly'] = model.fit_predict(feature_df)
feature_df['anomaly'] = feature_df['anomaly'].map({1: 0, -1: 1})

# Merge with original dataset to identify rows flagged as anomalies
df_clean = df.loc[feature_df.index].copy()
df_clean['anomaly'] = feature_df['anomaly'].values

# Filter only anomalies
anomalies = df_clean[df_clean['anomaly'] == 1]

# Display anomaly rows (you can also save it)
print("Anomalies Detected:")
print(anomalies[['Time', 'A Current', 'A Voltage', 'B Current', 'B Voltage']].head(20))  # Show first 20 anomalies

# Optionally save to CSV
anomalies.to_csv("anomalies_detected.csv", index=False)


Anomalies Detected:
                     Time                                        A Current  \
653                   NaT                          0,0,0,0,0,0,0,0,0,0,0.0   
2191                  NaT                                0,0,0,0,0,0,0,0.0   
11272 2025-05-01 03:47:00    0.0,1.7,1.6,1.7,1.7,1.7,1.7,1.8,1.8,1.9,2,0.0   
12547                 NaT          0.0,1.6,1.6,1.7,1.8,1.9,0.8,0,0,0,0,0.0   
13786 2025-09-02 10:47:00  0.0,1.6,1.6,1.6,1.7,1.7,1.7,1.8,1.9,1.5,0.6,0.0   
14315                 NaT                    0.0,2.4,1,0,0,0,0,0,0,0,0,0.0   
28252                 NaT                                          0.0,0.0   
28262                 NaT                                          0.0,0.0   
28284                 NaT                                          0.0,0.0   
28294                 NaT                                          0.0,0.0   
28296                 NaT                                          0.0,0.0   
30924                 NaT      0.0,2.1,2.2,2

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Load the dataset
df = pd.read_csv('NCR.csv')
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Helper function to parse and clean sequence strings
def extract_clean_stats(column):
    mean_vals, std_vals, max_vals = [], [], []
    for row in column:
        try:
            # Convert string to float values
            nums = [float(x) for x in str(row).replace(',', '.').split('.') if float(x) != 0.0]
            if nums:
                mean_vals.append(np.mean(nums))
                std_vals.append(np.std(nums))
                max_vals.append(np.max(nums))
            else:
                mean_vals.append(np.nan)
                std_vals.append(np.nan)
                max_vals.append(np.nan)
        except:
            mean_vals.append(np.nan)
            std_vals.append(np.nan)
            max_vals.append(np.nan)
    return mean_vals, std_vals, max_vals

# Extract statistical features from non-zero data
features = {}
for signal in ['A Current', 'A Voltage', 'B Current', 'B Voltage']:
    mean_col, std_col, max_col = extract_clean_stats(df[signal])
    features[f'{signal}_mean'] = mean_col
    features[f'{signal}_std'] = std_col
    features[f'{signal}_max'] = max_col

# Create features DataFrame
feature_df = pd.DataFrame(features)
feature_df.dropna(inplace=True)

# Run Isolation Forest
model = IsolationForest(contamination=0.01, random_state=42)
feature_df['anomaly'] = model.fit_predict(feature_df)
feature_df['anomaly'] = feature_df['anomaly'].map({1: 0, -1: 1})

# Merge with original data to get timestamps
df_clean = df.loc[feature_df.index].copy()
df_clean['anomaly'] = feature_df['anomaly'].values

# Show anomaly rows
anomalies = df_clean[df_clean['anomaly'] == 1]
print("🔍 Detected Anomalies (Filtered Zero Values):")
print(anomalies[['Time', 'A Current', 'A Voltage', 'B Current', 'B Voltage']].head(20))  # show top 20


anomalies.to_csv("anomalies_detected2.csv", index=False)

🔍 Detected Anomalies (Filtered Zero Values):
                    Time                                          A Current  \
265  2024-12-08 12:31:00  0.0,1.6,5.3,4.5,3.5,2.9,2.5,2.2,2,1.8,1.7,1.7,...   
276  2024-12-08 13:46:00  0.0,2.1,5.3,4.4,3.5,2.9,2.5,2.2,2,1.8,1.7,1.7,...   
278  2024-12-08 13:48:00  0.0,4.9,4.9,3.9,3.1,2.6,2.3,2.1,1.9,1.8,1.7,1....   
281  2024-12-08 14:44:00  0.0,4.9,5.1,4,3.2,2.7,2.3,2.1,1.9,1.8,1.7,1.7,...   
450                  NaT  0.0,2,5.4,4.5,3.6,2.9,2.6,2.2,2,1.9,1.7,1.7,1....   
953                  NaT  0.0,5.1,5,3.9,3.2,2.7,2.4,2.1,1.9,1.8,1.7,1.7,...   
1079                 NaT  0.0,4.6,5.1,4,3.2,2.7,2.4,2.1,1.9,1.8,1.7,1.7,...   
1159                 NaT  0.0,5.3,4.9,3.8,3.1,2.7,2.4,2.1,1.9,1.8,1.7,1....   
1250                 NaT  0.0,2.7,5.3,4.4,3.4,2.9,2.4,2.2,2,1.8,1.7,1.7,...   
1252                 NaT  0.0,1.2,5.4,4.6,3.6,2.9,2.5,2.2,2,1.8,1.7,1.7,...   
1444                 NaT  0.0,5.4,4.9,3.8,3.1,2.6,2.3,2.1,1.9,1.8,1.7,1....   
1505   