In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
df = pd.read_csv('iot_sensor.csv')
print("before data processing \n",df.head())

# 1. Convert timestamp to datetime and sort the data
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by='timestamp')

# 2. Handle missing values using forward fill
df['temperature'] = df['temperature'].ffill()
df['humidity'] = df['humidity'].ffill()

# 3. Remove sensor drift by subtracting rolling mean (24-point window)
window_size = 24
df['temperature_no_drift'] = df['temperature'] - df['temperature'].rolling(window=window_size, min_periods=1).mean()
df['humidity_no_drift'] = df['humidity'] - df['humidity'].rolling(window=window_size, min_periods=1).mean()

# 4. Normalize readings using Standard Scaler
scaler = StandardScaler()
df[['temperature_scaled', 'humidity_scaled']] = scaler.fit_transform(
    df[['temperature_no_drift', 'humidity_no_drift']]
)

# 5. Encode sensor IDs
encoder = LabelEncoder()
df['sensor_id_encoded'] = encoder.fit_transform(df['sensor_id'])

# Final optimized dataset for anomaly detection
processed_df = df[['timestamp', 'sensor_id_encoded', 'temperature_scaled', 'humidity_scaled']]
print("After data processing \n",processed_df.head())


before data processing 
              timestamp sensor_id  temperature  humidity
0  2025-02-01 00:00:00        S2         24.0      40.0
1  2025-02-01 01:00:00        S3         30.0       NaN
2  2025-02-01 02:00:00        S1         24.0      50.0
3  2025-02-01 03:00:00        S2         24.0       NaN
4  2025-02-01 04:00:00        S3         23.0      42.0
After data processing 
             timestamp  sensor_id_encoded  temperature_scaled  humidity_scaled
0 2025-02-01 00:00:00                  1            0.005921         0.035574
1 2025-02-01 01:00:00                  2            1.155562         0.035574
2 2025-02-01 02:00:00                  0           -0.760506         1.765490
3 2025-02-01 03:00:00                  1           -0.568899         1.333011
4 2025-02-01 04:00:00                  2           -0.760506        -0.587196
