In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw data from 'sensor.csv' (path is relative to the working directory).
data = pd.read_csv('sensor.csv')

In [3]:
# List the distinct labels present in the 'machine_status' column.
data['machine_status'].unique()

array(['NORMAL', 'BROKEN', 'RECOVERING'], dtype=object)

In [10]:
# Drop the columns that are not used as model inputs: 'Unnamed: 0',
# 'timestamp' and 'sensor_15'. The result is a new frame; `data` is untouched.
# NOTE(review): the reason for excluding 'sensor_15' is not shown in this
# notebook -- presumably it holds no usable readings; confirm against the raw data.
data_drop = data.drop(columns=['Unnamed: 0', 'timestamp','sensor_15'])

In [11]:
# Display the frame after the column drop (rendered as a truncated preview).
data_drop

Unnamed: 0,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,2.465394,47.09201,53.211800,46.310760,634.375000,76.45975,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,243.0556,201.3889,NORMAL
1,2.465394,47.09201,53.211800,46.310760,634.375000,76.45975,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,243.0556,201.3889,NORMAL
2,2.444734,47.35243,53.211800,46.397570,638.888900,73.54598,13.32465,16.03733,15.61777,15.01013,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL
3,2.460474,47.09201,53.168400,46.397568,628.125000,76.98898,13.31742,16.24711,15.69734,15.08247,...,40.88541,39.062500,64.81481,51.21528,38.194440,155.9606,66.84028,240.4514,203.1250,NORMAL
4,2.445718,47.13541,53.211800,46.397568,636.458300,76.58897,13.35359,16.21094,15.69734,15.08247,...,41.40625,38.773150,65.10416,51.79398,38.773150,158.2755,66.55093,242.1875,201.3889,NORMAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220315,2.407350,47.69965,50.520830,43.142361,634.722229,64.59095,15.11863,16.65220,15.65393,15.16204,...,38.28125,68.287030,52.37268,48.32176,41.087960,212.3843,153.64580,,231.1921,NORMAL
220316,2.400463,47.69965,50.564240,43.142361,630.902771,65.83363,15.15480,16.70284,15.65393,15.11863,...,38.28125,66.840280,50.63657,48.03241,40.798610,213.8310,156.25000,,231.1921,NORMAL
220317,2.396528,47.69965,50.520830,43.142361,625.925903,67.29445,15.08970,16.70284,15.69734,15.11863,...,39.06250,65.393520,48.90046,48.03241,40.798610,217.3032,155.38190,,232.0602,NORMAL
220318,2.406366,47.69965,50.520832,43.142361,635.648100,65.09175,15.11863,16.56539,15.74074,15.11863,...,40.62500,64.236110,47.74306,48.32176,40.509258,222.5116,153.93520,,234.0856,NORMAL


In [14]:
# Fill missing values by linear interpolation down each column (axis=0).
# With limit_direction='forward', consecutive NaNs are filled in the forward
# direction only, so NaNs at the very start of a column (before the first
# valid reading) are left in place.
data_interpolated = data_drop.interpolate(method='linear', limit_direction='forward', axis=0)

In [15]:
# Count the missing values remaining in each column after interpolation.
data_interpolated.isnull().sum()

sensor_00         0
sensor_01         0
sensor_02         0
sensor_03         0
sensor_04         0
sensor_05         0
sensor_06         0
sensor_07         0
sensor_08         0
sensor_09         0
sensor_10         0
sensor_11         0
sensor_12         0
sensor_13         0
sensor_14         0
sensor_16         0
sensor_17         0
sensor_18         0
sensor_19         0
sensor_20         0
sensor_21         0
sensor_22         0
sensor_23         0
sensor_24         0
sensor_25         0
sensor_26         0
sensor_27         0
sensor_28         0
sensor_29         0
sensor_30         0
sensor_31         0
sensor_32         0
sensor_33         0
sensor_34         0
sensor_35         0
sensor_36         0
sensor_37         0
sensor_38         0
sensor_39         0
sensor_40         0
sensor_41         0
sensor_42         0
sensor_43         0
sensor_44         0
sensor_45         0
sensor_46         0
sensor_47         0
sensor_48         0
sensor_49         0
sensor_50         0


In [16]:
# Separate the model inputs from the 'machine_status' label.
# X keeps every remaining column; y holds the label column only.
X = data_interpolated.drop(columns=['machine_status'])  # Features
y = data_interpolated['machine_status']  # Target

In [17]:
# Hold out 20% of the rows as a test set. random_state=42 makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so test rows do not influence
# the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Fit an Isolation Forest on the scaled training features only; no labels are
# passed to fit(). contamination=0.1 is the assumed proportion of outliers in
# the data, and random_state=42 makes the fitted model reproducible.
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
isolation_forest.fit(X_train_scaled)

In [20]:
# Score every test row with the fitted model. This yields continuous scores,
# not anomaly labels: the lower the score, the more abnormal the row.
anomaly_scores = isolation_forest.decision_function(X_test_scaled)

In [21]:
# Inspect the array of test-set scores.
anomaly_scores

array([ 0.17664353,  0.15347338,  0.16002662, ...,  0.15024213,
       -0.01142855,  0.14404402])

In [22]:
# Cut-off below which a test row is treated as an anomaly: the 10th percentile
# of the test-set scores (100 * 0.1), so roughly the lowest-scoring 10% of
# test rows fall under it.
threshold = np.percentile(anomaly_scores, 100 * 0.1)

In [23]:
# Inspect the computed cut-off value.
threshold

0.001368078632460469

In [24]:
# Select the test rows whose score is below the threshold. The rows are taken
# from X_test (original units), not from the scaled array.
anomalies = X_test[anomaly_scores < threshold]

In [25]:
# Pick the single column whose mean over the flagged rows is the largest.
# NOTE(review): the means are taken in each sensor's original units, so a
# sensor with a large value range can win regardless of how unusual its
# readings are -- confirm this ranking is what is intended.
anomalous_sensors = anomalies.mean().idxmax()

In [26]:
# Report the column name selected above.
print("Anomalous sensor:", anomalous_sensors)

Anomalous sensor: sensor_31


In [28]:
# Re-select the flagged test rows via an explicit boolean mask
# (True where the score is below the threshold).
anomaly_indices = anomaly_scores < threshold  # or any other condition based on threshold
anomalies = X_test[anomaly_indices]

# Rank the columns by their mean over the flagged rows and keep the top
# `num_sensors`. This rebinds `anomalous_sensors` from a single column name to
# a pandas Index of names, ordered from largest mean to smallest.
num_sensors = 5  # Define the number of top anomalous sensors you want to identify
anomalous_sensors = anomalies.mean().nlargest(num_sensors).index
print("Anomalous sensors:", anomalous_sensors)


Anomalous sensors: Index(['sensor_31', 'sensor_28', 'sensor_32', 'sensor_23', 'sensor_21'], dtype='object')


In [None]:
def get_anomalous_sensors(file_path, num_sensors=5, contamination=0.1,
                          test_size=0.2, random_state=42):
    """Return the sensors with the largest mean reading among flagged test rows.

    Steps: read the CSV, drop the non-feature columns, linearly interpolate
    missing values, split into train/test, standardize, fit an Isolation
    Forest on the training split, score the test split, flag the test rows
    scoring below the ``100 * contamination`` percentile, and rank the
    columns by their mean over those flagged rows.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read. It must contain the columns 'Unnamed: 0',
        'timestamp', 'sensor_15' and 'machine_status' (all are dropped);
        every remaining column is used as a feature.
    num_sensors : int, default 5
        Number of top-ranked columns to return.
    contamination : float, default 0.1
        Passed to ``IsolationForest`` and also used as the fraction of test
        rows to flag. Must be numeric; the string 'auto' is not supported
        here because the value is also used to compute a percentile.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the train/test split and the Isolation Forest.

    Returns
    -------
    list of str
        The selected column names, sorted alphabetically (not by rank).

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from the file.
    """
    data = pd.read_csv(file_path)

    # Drop the label together with the other non-feature columns *before*
    # interpolating: the label is not numeric, and interpolating a frame that
    # contains an object-dtype column is deprecated in recent pandas. The
    # numeric columns are interpolated exactly as before. The label itself is
    # not needed afterwards because the model is fitted without labels.
    features = data.drop(
        columns=['Unnamed: 0', 'timestamp', 'sensor_15', 'machine_status']
    )

    # Forward-only linear interpolation: NaNs before a column's first valid
    # reading are left in place.
    features = features.interpolate(
        method='linear', limit_direction='forward', axis=0
    )

    # The split depends only on the row count and the seed, so splitting the
    # features alone yields the same partition as splitting (X, y) together.
    X_train, X_test = train_test_split(
        features, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only, then reuse it for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    isolation_forest = IsolationForest(
        contamination=contamination, random_state=random_state
    )
    isolation_forest.fit(X_train_scaled)

    # Lower decision_function scores mean more abnormal rows.
    anomaly_scores = isolation_forest.decision_function(X_test_scaled)
    threshold = np.percentile(anomaly_scores, 100 * contamination)

    # Flagged rows are taken from the unscaled test frame.
    anomalies = X_test[anomaly_scores < threshold]

    # NOTE(review): columns are ranked by their mean in original units, so
    # sensors with a large value range rank first regardless of how unusual
    # their readings are. Kept as-is for backward compatibility; consider
    # ranking on standardized values instead.
    top_sensors = anomalies.mean().nlargest(num_sensors).index
    return sorted(top_sensors.tolist())