In [1]:
import pandas as pd
import os

In [2]:
# Load the dataset
df = pd.read_csv('../data/raw/synthetic_semicon_50k.csv')

In [3]:
df.head(10)

Unnamed: 0,run_id,timestamp,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,...,sensor_92,sensor_93,sensor_94,sensor_95,sensor_96,sensor_97,sensor_98,sensor_99,sensor_100,label
0,1,2024-01-01 00:00:00,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1.579213,0.767435,...,0.968645,-0.702053,-0.327662,-0.392108,-1.463515,0.29612,0.261055,0.005113,-0.234587,1
1,2,2024-01-01 00:10:00,,-0.420645,-0.342715,,-0.161286,0.404051,1.886186,0.174578,...,0.856399,0.214094,-1.245739,0.173181,0.385317,-0.883857,0.153725,0.058209,-1.14297,1
2,3,2024-01-01 00:20:00,0.357787,0.560785,1.083051,1.053802,-1.377669,-0.937825,0.515035,0.513786,...,-0.493001,-0.589365,,0.357015,-0.69291,0.8996,0.3073,0.812862,0.629629,-1
3,4,2024-01-01 00:30:00,-0.828995,-0.560181,,0.61037,-0.020902,0.117327,1.277665,-0.591571,...,0.491919,-1.320233,1.831459,1.17944,-0.469176,-1.713135,1.353872,-0.11454,1.237816,-1
4,5,2024-01-01 00:40:00,-1.594428,-0.599375,0.005244,,-0.450065,0.62285,-1.06762,-0.142379,...,1.479944,0.077368,-0.861284,1.523124,,-1.037246,-0.190339,-0.875618,,1
5,6,2024-01-01 00:50:00,0.926178,,-1.398568,,-0.650643,-0.487125,,-0.863991,...,-0.477657,0.47898,0.333662,1.03754,-0.510016,-0.269875,-0.978764,-0.444293,0.3773,1
6,7,2024-01-01 01:00:00,0.756989,-0.922165,0.869606,1.355638,0.413435,1.876796,-0.773789,-1.244655,...,1.15933,-1.081063,,0.593101,-0.309546,0.326133,-1.251114,0.924027,-0.184902,1
7,8,2024-01-01 01:10:00,,1.049009,-0.704344,,-1.556629,0.60601,-1.280429,1.754794,...,0.207688,0.271579,-1.276749,-1.081057,1.053153,-0.039555,0.681501,0.028318,0.029756,1
8,9,2024-01-01 01:20:00,0.938284,,0.096121,-0.462275,-0.434496,,0.222134,-0.478749,...,,,1.169296,1.382159,0.64871,-0.167118,0.146714,1.206509,-0.816936,1
9,10,2024-01-01 01:30:00,0.368673,-0.393339,0.028745,1.278452,0.191099,0.046437,-1.359856,0.746254,...,-2.041735,,-0.681984,-1.00162,-0.2811,1.797687,0.640843,-0.571179,,1


In [4]:
df.shape

(42679, 103)

In [5]:
# check for missing values
df.isnull().sum()

run_id           0
timestamp        0
sensor_1      4327
sensor_2      4341
sensor_3      4245
              ... 
sensor_97     4270
sensor_98     4188
sensor_99     4310
sensor_100    4250
label            0
Length: 103, dtype: int64

In [6]:
# 2. Profile missingness
missing_pct = df.isna().mean().sort_values(ascending=False)
print(missing_pct.head(10))

sensor_26    0.103658
sensor_53    0.102556
sensor_87    0.102416
sensor_21    0.102299
sensor_5     0.102111
sensor_13    0.102111
sensor_76    0.102064
sensor_63    0.101971
sensor_86    0.101971
sensor_81    0.101947
dtype: float64


In [7]:
# 3 & 4. Drop high-missing features (>50%) and median-impute numeric columns
to_drop = missing_pct[missing_pct > 0.5].index.tolist()
df_clean = df.drop(columns=to_drop)

# Impute only numeric columns
num_cols = df_clean.select_dtypes(include='number').columns
df_clean[num_cols] = df_clean[num_cols].fillna(df_clean[num_cols].median())

# Verify
print("Cleaned shape:", df_clean.shape)
print("Remaining missing values:", df_clean.isna().sum().sum())


Cleaned shape: (42679, 103)
Remaining missing values: 0


In [8]:
# 5. Save processed file
df_clean.to_csv('../data/processed/defects_imputed.csv', index=False)

## Metadata Mapping

In [9]:
# 1. Ensure metadata directory exists
os.makedirs('../data/metadata', exist_ok=True)

# 2. Define sensor ranges and types
sensor_ranges = {
    (1, 10): ("temperature_sensor", "°C"),
    (11, 20): ("pressure_sensor", "bar"),
    (21, 40): ("vibration_sensor", "m/s²"),
    (41, 60): ("gas_flow_sensor", "L/min"),
    (61, 70): ("humidity_sensor", "%"),
    (71, 80): ("voltage_sensor", "V"),
    (81, 90): ("current_sensor", "A"),
    (91, 100): ("custom_feature_sensor", "unitless")
}

In [10]:
# 3. Build metadata list
rows = []
for (start, end), (stype, unit) in sensor_ranges.items():
    for i in range(start, end+1):
        rows.append({
            "feature_name": f"sensor_{i}",
            "sensor_type": stype,
            "unit": unit
        })

In [11]:
# 4. Create DataFrame and save to CSV
sensor_metadata_df = pd.DataFrame(rows)
sensor_metadata_df.to_csv('../data/metadata/sensor_metadata.csv', index=False)

In [12]:
meta = pd.read_csv('../data/metadata/sensor_metadata.csv')
meta.head(12)

Unnamed: 0,feature_name,sensor_type,unit
0,sensor_1,temperature_sensor,°C
1,sensor_2,temperature_sensor,°C
2,sensor_3,temperature_sensor,°C
3,sensor_4,temperature_sensor,°C
4,sensor_5,temperature_sensor,°C
5,sensor_6,temperature_sensor,°C
6,sensor_7,temperature_sensor,°C
7,sensor_8,temperature_sensor,°C
8,sensor_9,temperature_sensor,°C
9,sensor_10,temperature_sensor,°C


In [13]:
# check for missing values in metadata
meta.isnull().sum()

feature_name    0
sensor_type     0
unit            0
dtype: int64

In [14]:
def add_rolling_mean(df, cols, window):
    """Adds mean over last `window` runs for each column in `cols`."""
    for c in cols:
        df[f"{c}_roll_mean_{window}"] = df[c].rolling(window).mean()
    return df

def add_rolling_std(df, cols, window):
    """Adds std dev over last `window` runs for each column in `cols`."""
    for c in cols:
        df[f"{c}_roll_std_{window}"] = df[c].rolling(window).std()
    return df

In [15]:
sensor_cols = [c for c in df_clean.columns if c.startswith('sensor_')]

In [16]:
import pandas as pd

# Load the cleaned data from notebooks/
df = pd.read_csv('../data/processed/defects_imputed.csv')

print("Loaded shape:", df.shape)
df.head(3)


Loaded shape: (42679, 103)


Unnamed: 0,run_id,timestamp,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,...,sensor_92,sensor_93,sensor_94,sensor_95,sensor_96,sensor_97,sensor_98,sensor_99,sensor_100,label
0,1,2024-01-01 00:00:00,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1.579213,0.767435,...,0.968645,-0.702053,-0.327662,-0.392108,-1.463515,0.29612,0.261055,0.005113,-0.234587,1
1,2,2024-01-01 00:10:00,0.008416,-0.420645,-0.342715,0.008362,-0.161286,0.404051,1.886186,0.174578,...,0.856399,0.214094,-1.245739,0.173181,0.385317,-0.883857,0.153725,0.058209,-1.14297,1
2,3,2024-01-01 00:20:00,0.357787,0.560785,1.083051,1.053802,-1.377669,-0.937825,0.515035,0.513786,...,-0.493001,-0.589365,0.00537,0.357015,-0.69291,0.8996,0.3073,0.812862,0.629629,-1


In [17]:
# Ensure timestamp is datetime and sorted
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp')

# Define your helper functions inline
def add_rolling_mean(df, cols, window):
    for c in cols:
        df[f"{c}_roll_mean_{window}"] = df[c].rolling(window, min_periods=1).mean()
    return df

def add_rolling_std(df, cols, window):
    for c in cols:
        df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
    return df

# Identify sensor columns and apply
sensor_cols = [c for c in df.columns if c.startswith('sensor_')]
df = add_rolling_mean(df, sensor_cols, window=5)
df = add_rolling_std(df, sensor_cols, window=5)

# Save feature‐enhanced data
df.to_csv('../data/processed/defects_features.csv', index=False)
print("Features added and saved:", df.filter(like='roll_mean_5').shape)


  df[f"{c}_roll_mean_{window}"] = df[c].rolling(window, min_periods=1).mean()
  df[f"{c}_roll_mean_{window}"] = df[c].rolling(window, min_periods=1).mean()
  df[f"{c}_roll_mean_{window}"] = df[c].rolling(window, min_periods=1).mean()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f"{c}_roll_std_{window}"] = df[c].rolling(window, min_periods=1).std()
  df[f

Features added and saved: (42679, 100)


In [18]:
# 1. Load the feature-enhanced data
df = pd.read_csv('../data/processed/defects_features.csv')

# 2. Quick sanity check
print("Shape:", df.shape)
df.head(3)

Shape: (42679, 303)


Unnamed: 0,run_id,timestamp,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,...,sensor_91_roll_std_5,sensor_92_roll_std_5,sensor_93_roll_std_5,sensor_94_roll_std_5,sensor_95_roll_std_5,sensor_96_roll_std_5,sensor_97_roll_std_5,sensor_98_roll_std_5,sensor_99_roll_std_5,sensor_100_roll_std_5
0,1,2024-01-01 00:00:00,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1.579213,0.767435,...,,,,,,,,,,
1,2,2024-01-01 00:10:00,0.008416,-0.420645,-0.342715,0.008362,-0.161286,0.404051,1.886186,0.174578,...,0.384378,0.07937,0.647814,0.649178,0.39972,1.307322,0.83437,0.075894,0.037544,0.642324
2,3,2024-01-01 00:20:00,0.357787,0.560785,1.083051,1.053802,-1.377669,-0.937825,0.515035,0.513786,...,0.27248,0.813417,0.499595,0.647952,0.390413,0.928672,0.907125,0.078786,0.451807,0.886391


In [26]:
import warnings
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score

# Silence fragmentation warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# Prepare features & labels (df already has your rolling features)
feature_cols = [c for c in df.columns if c.startswith('sensor_')]
X = df[feature_cols]
y_true = (df['label'] == -1).astype(int)

# Fit & predict
iso = IsolationForest(contamination=0.05, random_state=42)
iso.fit(X)
flags = (iso.predict(X) == -1).astype(int)

# Print metrics
print("Precision:", precision_score(y_true, flags))
print("Recall:   ", recall_score(y_true, flags))


Precision: 0.056232427366447985
Recall:    0.056179775280898875


## Prepare Features & True Labels

In [19]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score

# Select only numeric feature columns (raw sensors + rolling stats)
feature_cols = [c for c in df.columns 
                if c.startswith('sensor_')]

# True labels: 1 for defect (label == -1), 0 for normal
y_true = (df['label'] == -1).astype(int)

X = df[feature_cols]


## Fit the Model

In [20]:
# 5% contamination matches your injected anomaly rate
iso = IsolationForest(contamination=0.05, random_state=42)
iso.fit(X)

# Add anomaly scores and flags to your DataFrame
df['anomaly_score'] = iso.decision_function(X)   # higher = more “normal”
df['anomaly_flag']  = (iso.predict(X) == -1).astype(int)  # 1 = anomaly


## Evaluate Performance

In [21]:
prec = precision_score(y_true, df['anomaly_flag'])
rec  = recall_score(y_true, df['anomaly_flag'])

print(f"IsolationForest Precision: {prec:.2f}")
print(f"IsolationForest Recall:    {rec:.2f}")

IsolationForest Precision: 0.06
IsolationForest Recall:    0.06


## Inspect Flagged Runs

In [22]:
# View the first few runs marked as anomalies
df[df['anomaly_flag'] == 1].head()

Unnamed: 0,run_id,timestamp,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,...,sensor_93_roll_std_5,sensor_94_roll_std_5,sensor_95_roll_std_5,sensor_96_roll_std_5,sensor_97_roll_std_5,sensor_98_roll_std_5,sensor_99_roll_std_5,sensor_100_roll_std_5,anomaly_score,anomaly_flag
1,2,2024-01-01 00:10:00,0.008416,-0.420645,-0.342715,0.008362,-0.161286,0.404051,1.886186,0.174578,...,0.647814,0.649178,0.39972,1.307322,0.83437,0.075894,0.037544,0.642324,-0.010073,1
2,3,2024-01-01 00:20:00,0.357787,0.560785,1.083051,1.053802,-1.377669,-0.937825,0.515035,0.513786,...,0.499595,0.647952,0.390413,0.928672,0.907125,0.078786,0.451807,0.886391,-0.004654,1
30,31,2024-01-01 05:00:00,-1.907808,-0.002401,-0.413606,1.887688,0.556553,-1.335482,0.486036,-1.547304,...,0.435643,0.432061,0.674756,1.79894,1.307897,1.495228,0.363753,0.505643,-0.002705,1
76,77,2024-01-01 12:40:00,-0.128877,-0.509094,0.48536,-0.849105,0.305666,0.008771,-0.897659,-0.637487,...,1.074465,0.4136,0.610793,1.181399,1.261415,1.376969,0.461692,0.635173,-0.003814,1
84,85,2024-01-01 14:00:00,0.233865,1.164991,0.625318,-0.169274,1.352363,0.008771,2.052073,-1.705959,...,0.395442,1.152735,0.765285,0.777735,0.572272,0.600534,1.44512,0.531451,-0.000419,1


In [23]:
df.to_csv('../data/processed/with_anomalies.csv', index=False)

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score

# 1. Extract your feature matrix X and true labels y_true (assuming df is loaded)
feature_cols = [c for c in df.columns if c.startswith('sensor_')]
X = df[feature_cols]
y_true = (df['label'] == -1).astype(int)

# 2. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Fit IsolationForest on scaled data
iso = IsolationForest(contamination=0.05, random_state=42)
iso.fit(X_scaled)

# 4. Generate flags and evaluate
flags = (iso.predict(X_scaled) == -1).astype(int)
print("Scaled Precision:", precision_score(y_true, flags))
print("Scaled Recall:   ", recall_score(y_true, flags))


Scaled Precision: 0.056232427366447985
Scaled Recall:    0.056179775280898875


In [25]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score

# We already have X_scaled and y_true from the previous cell

for c in [0.01, 0.03, 0.05, 0.10]:
    iso_c = IsolationForest(contamination=c, random_state=42)
    iso_c.fit(X_scaled)
    flags_c = (iso_c.predict(X_scaled) == -1).astype(int)
    p = precision_score(y_true, flags_c)
    r = recall_score(y_true, flags_c)
    print(f"contamination={c:.2f} -> Precision: {p:.3f}, Recall: {r:.3f}")


contamination=0.01 -> Precision: 0.061, Recall: 0.012
contamination=0.03 -> Precision: 0.056, Recall: 0.034
contamination=0.05 -> Precision: 0.056, Recall: 0.056
contamination=0.10 -> Precision: 0.054, Recall: 0.108
