# Huging Face Zero-shot Model

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
train_import = 'train_FD001.txt'
test_import = 'test_FD001.txt'
RUL_import = 'RUL_FD001.txt'

In [6]:
data = pd.read_csv(train_import, sep='\s+', header=None)

new_column_names = ['unit number', 'cycle number', 'operational setting 1', 'operational setting 2', 'operational setting 3',
                   'sensor measurement 1', 'sensor measurement 2', 'sensor measurement 3', 'sensor measurement 4',
                   'sensor measurement 5', 'sensor measurement 6', 'sensor measurement 7', 'sensor measurement 8',
                   'sensor measurement 9', 'sensor measurement 10', 'sensor measurement 11', 'sensor measurement 12',
                   'sensor measurement 13', 'sensor measurement 14', 'sensor measurement 15', 'sensor measurement 16',
                   'sensor measurement 17', 'sensor measurement 18', 'sensor measurement 19', 'sensor measurement 20',
                   'sensor measurement 21']

data.columns = new_column_names

data

Unnamed: 0,unit number,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,...,sensor measurement 12,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


In [7]:
data.isna().sum()

Unnamed: 0,0
unit number,0
cycle number,0
operational setting 1,0
operational setting 2,0
operational setting 3,0
sensor measurement 1,0
sensor measurement 2,0
sensor measurement 3,0
sensor measurement 4,0
sensor measurement 5,0


In [8]:
data.describe()

Unnamed: 0,unit number,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,...,sensor measurement 12,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,6.537152e-11,0.500053,6.13115,9.000605,3.3947e-12,...,0.737553,0.071919,19.076176,0.037505,1.556432e-14,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [9]:
sentence = []
for index, row in data.iterrows():
    sentence_row = (f"unit number:{row['unit number']} at cycle number:{row['cycle number']}, "
                   f"operational setting 1:{row['operational setting 1']}, "
                   f"operational setting 2:{row['operational setting 2']}, "
                   f"operational setting 3:{row['operational setting 3']}, "
                   f"sensor measurement 1:{row['sensor measurement 1']}, "
                   f"sensor measurement 2:{row['sensor measurement 2']}, "
                   f"sensor measurement 3:{row['sensor measurement 3']}, "
                   f"sensor measurement 4:{row['sensor measurement 4']}, "
                   f"sensor measurement 5:{row['sensor measurement 5']}, "
                   f"sensor measurement 6:{row['sensor measurement 6']}, "
                   f"sensor measurement 7:{row['sensor measurement 7']}, "
                   f"sensor measurement 8:{row['sensor measurement 8']}, "
                   f"sensor measurement 9:{row['sensor measurement 9']}, "
                   f"sensor measurement 10:{row['sensor measurement 10']}, "
                   f"sensor measurement 11:{row['sensor measurement 11']}, "
                   f"sensor measurement 12:{row['sensor measurement 12']}, "
                   f"sensor measurement 13:{row['sensor measurement 13']}, "
                   f"sensor measurement 14:{row['sensor measurement 14']}, "
                   f"sensor measurement 15:{row['sensor measurement 15']}, "
                   f"sensor measurement 16:{row['sensor measurement 16']}, "
                   f"sensor measurement 17:{row['sensor measurement 17']}, "
                   f"sensor measurement 18:{row['sensor measurement 18']}, "
                   f"sensor measurement 19:{row['sensor measurement 19']}, "
                   f"sensor measurement 20:{row['sensor measurement 20']}, "
                   f"sensor measurement 21:{row['sensor measurement 21']}")
    sentence.append(sentence_row)

In [10]:
!pip install transformers



In [11]:
from transformers import pipeline
# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Example of engine data (you can replace these values with actual data from the dataset)

engine_data = sentence[0]

# Define the possible candidate labels (categories)
candidate_labels = ["Normal", "Early Warning", "Moderate Anomaly", "Critical Anomaly"]

# Perform zero-shot classification on the engine data
result = classifier(engine_data, candidate_labels)

# Print the result
print(f"Input data: {engine_data}")
print(f"Predicted Label: {result['labels'][0]}")
print(f"Confidence Score: {result['scores'][0]:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Input data: unit number:1.0 at cycle number:1.0, operational setting 1:-0.0007, operational setting 2:-0.0004, operational setting 3:100.0, sensor measurement 1:518.67, sensor measurement 2:641.82, sensor measurement 3:1589.7, sensor measurement 4:1400.6, sensor measurement 5:14.62, sensor measurement 6:21.61, sensor measurement 7:554.36, sensor measurement 8:2388.06, sensor measurement 9:9046.19, sensor measurement 10:1.3, sensor measurement 11:47.47, sensor measurement 12:521.66, sensor measurement 13:2388.02, sensor measurement 14:8138.62, sensor measurement 15:8.4195, sensor measurement 16:0.03, sensor measurement 17:392.0, sensor measurement 18:2388.0, sensor measurement 19:100.0, sensor measurement 20:39.06, sensor measurement 21:23.419
Predicted Label: Moderate Anomaly
Confidence Score: 0.4406


# Predicting Remaining Useful Cycle (RUL)

In [12]:
data = pd.read_csv(train_import, sep='\s+', header=None)

new_column_names = ['unit number', 'cycle number', 'operational setting 1', 'operational setting 2', 'operational setting 3',
                   'sensor measurement 1', 'sensor measurement 2', 'sensor measurement 3', 'sensor measurement 4',
                   'sensor measurement 5', 'sensor measurement 6', 'sensor measurement 7', 'sensor measurement 8',
                   'sensor measurement 9', 'sensor measurement 10', 'sensor measurement 11', 'sensor measurement 12',
                   'sensor measurement 13', 'sensor measurement 14', 'sensor measurement 15', 'sensor measurement 16',
                   'sensor measurement 17', 'sensor measurement 18', 'sensor measurement 19', 'sensor measurement 20',
                   'sensor measurement 21']

data.columns = new_column_names

data

Unnamed: 0,unit number,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,...,sensor measurement 12,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


In [13]:
data['label'] = 0

In [14]:
# Find the maximum cycle number for each unit
max_cycle_by_unit = data.groupby('unit number')['cycle number'].max()

# Iterate through each unit and assign "system failed" to the last cycle
for unit, max_cycle in max_cycle_by_unit.items():
    data.loc[(data['unit number'] == unit) & (data['cycle number'] == max_cycle), 'label'] = 1

In [15]:
max_cycle_by_unit

Unnamed: 0_level_0,cycle number
unit number,Unnamed: 1_level_1
1,192
2,287
3,179
4,189
5,269
...,...
96,336
97,202
98,156
99,185


In [16]:
# Calculate RUL for each row
data['RUL'] = data.groupby('unit number')['cycle number'].transform(lambda x: x.max() - x)

data

Unnamed: 0,unit number,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,...,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,label,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,0,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,0,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,0,189
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,0,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,0,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,0,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,0,3
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,0,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,0,1


In [17]:
data

Unnamed: 0,unit number,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,...,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,label,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,0,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,0,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,0,189
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,0,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,0,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,0,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,0,3
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,0,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,0,1


In [18]:
data.shape

(20631, 28)

## Autoencoder building

In [19]:

failed_data = data[data['label'] == 1]  # 'system failed' samples
failed_data =  failed_data.drop(['label','unit number', 'RUL'], axis=1)
# Scale the data
scaler = MinMaxScaler()
failed_data_scaled = scaler.fit_transform(failed_data)


In [20]:
failed_data_scaled.shape[1]

25

In [21]:


# Autoencoder architecture
input_dim = failed_data_scaled.shape[1]  # Number of features
encoding_dim = 5  # Latent space size (adjust as necessary)

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)  # Reconstruct the input

autoencoder = Model(input_layer, decoded)

# Compile and train the autoencoder
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(failed_data_scaled, failed_data_scaled, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 150ms/step - loss: 0.1182 - val_loss: 0.1202
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.1181 - val_loss: 0.1198
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 0.1178 - val_loss: 0.1194
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0.1174 - val_loss: 0.1190
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.1171 - val_loss: 0.1186
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 0.1166 - val_loss: 0.1182
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.1161 - val_loss: 0.1178
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.1157 - val_loss: 0.1173
Epoch 9/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x790b62660670>

In [22]:
# Train normal data
normal_data = data[data['label'] == 0]
normal_data = normal_data.drop(['label','unit number', 'RUL'], axis=1)
# Scale the normal data
normal_data_scaled = scaler.transform(normal_data)

# Get the reconstruction for the new data
reconstructed = autoencoder.predict(normal_data_scaled)

# Compute the reconstruction error (Mean Squared Error)
reconstruction_error = np.mean(np.square(normal_data_scaled - reconstructed), axis=1)
print(f"Reconstruction Error for Normal Data: {reconstruction_error}")


[1m642/642[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Reconstruction Error for Normal Data: [0.73097169 0.66747345 0.77202539 ... 0.08205953 0.08351597 0.0860955 ]


In [23]:
# Scale Reconstruction error to probabilities
prob_scaler = MinMaxScaler()
prob_scores = prob_scaler.fit_transform(reconstruction_error.reshape(-1, 1))
prob_scores = 1 - prob_scores

In [24]:
normal_data

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 12,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21
0,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,21.61,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,21.61,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20625,195,-0.0002,-0.0001,100.0,518.67,643.41,1600.04,1431.90,14.62,21.61,...,519.71,2388.28,8142.90,8.5519,0.03,394,2388,100.0,38.14,23.1923
20626,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,21.61,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,21.61,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,21.61,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333


In [25]:
updated_normal_data = data[data['label'] == 0]
updated_normal_data['prob_score'] = prob_scores
updated_normal_data = updated_normal_data.drop(['unit number','label'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_normal_data['prob_score'] = prob_scores


In [26]:
updated_normal_data[updated_normal_data['prob_score'] > 0.9]

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,RUL,prob_score
161,162,-0.0005,0.0004,100.0,518.67,643.15,1592.22,1423.48,14.62,21.61,...,8123.77,8.5015,0.03,394,2388,100.0,38.78,23.1538,30,0.905198
162,163,0.0003,-0.0004,100.0,518.67,642.85,1600.54,1421.09,14.62,21.61,...,8124.06,8.5129,0.03,393,2388,100.0,38.65,23.1419,29,0.926781
163,164,0.0005,-0.0002,100.0,518.67,643.17,1598.96,1416.76,14.62,21.61,...,8124.63,8.4803,0.03,394,2388,100.0,38.62,23.1761,28,0.907694
165,166,-0.0022,-0.0003,100.0,518.67,643.34,1596.72,1422.37,14.62,21.61,...,8119.14,8.4663,0.03,395,2388,100.0,38.62,23.1450,26,0.936338
168,169,-0.0017,0.0004,100.0,518.67,643.20,1590.16,1418.05,14.62,21.61,...,8125.30,8.4767,0.03,394,2388,100.0,38.58,23.1870,23,0.901492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20625,195,-0.0002,-0.0001,100.0,518.67,643.41,1600.04,1431.90,14.62,21.61,...,8142.90,8.5519,0.03,394,2388,100.0,38.14,23.1923,5,0.968046
20626,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,21.61,...,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,4,0.985831
20627,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,21.61,...,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,3,0.978292
20628,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,21.61,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2,0.977114


In [27]:


# Prepare dataset
X = updated_normal_data.drop(columns=["RUL"])
y = updated_normal_data["RUL"]


# Apply PCA
pca = PCA(n_components=10)  # Retain 10 features
updated_normal_data_reduced = pca.fit_transform(X)

print(f"Number of components selected: {updated_normal_data_reduced.shape[1]}")

# Add a constant to the independent variables (intercept term)
updated_normal_data_reduced = sm.add_constant(updated_normal_data_reduced)

# Fit the OLS model
model_reduced = sm.OLS(y, updated_normal_data_reduced).fit()

# Print the regression results
print(model_reduced.summary())

Number of components selected: 10
                            OLS Regression Results                            
Dep. Variable:                    RUL   R-squared:                       0.652
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     3837.
Date:                Wed, 27 Nov 2024   Prob (F-statistic):               0.00
Time:                        07:48:50   Log-Likelihood:            -1.0513e+05
No. Observations:               20531   AIC:                         2.103e+05
Df Residuals:                   20520   BIC:                         2.104e+05
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        108.3

In [28]:


# Prepare the data for regression
X = updated_normal_data.drop('RUL', axis=1)
y = updated_normal_data['RUL']

# Add a constant to the independent variables (intercept term)
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Print the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    RUL   R-squared:                       0.656
Model:                            OLS   Adj. R-squared:                  0.656
Method:                 Least Squares   F-statistic:                     2057.
Date:                Wed, 27 Nov 2024   Prob (F-statistic):               0.00
Time:                        07:48:50   Log-Likelihood:            -1.0500e+05
No. Observations:               20531   AIC:                         2.100e+05
Df Residuals:                   20511   BIC:                         2.102e+05
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
cycle number             -0.38

## Fit Test data (Autoencoder Model)


In [29]:
test_data = pd.read_csv(test_import, sep='\s+', header=None)

new_column_names = ['unit number', 'cycle number', 'operational setting 1', 'operational setting 2', 'operational setting 3',
                   'sensor measurement 1', 'sensor measurement 2', 'sensor measurement 3', 'sensor measurement 4',
                   'sensor measurement 5', 'sensor measurement 6', 'sensor measurement 7', 'sensor measurement 8',
                   'sensor measurement 9', 'sensor measurement 10', 'sensor measurement 11', 'sensor measurement 12',
                   'sensor measurement 13', 'sensor measurement 14', 'sensor measurement 15', 'sensor measurement 16',
                   'sensor measurement 17', 'sensor measurement 18', 'sensor measurement 19', 'sensor measurement 20',
                   'sensor measurement 21']

test_data.columns = new_column_names

test_data =  test_data.drop(['unit number'], axis=1)
test_data

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 12,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21
0,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,...,521.97,2388.03,8130.10,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,...,521.38,2388.05,8132.90,8.3917,0.03,391,2388,100.0,39.00,23.3737
4,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.4130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,21.61,...,520.69,2388.00,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974
13092,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,21.61,...,521.05,2388.09,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771
13093,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,21.61,...,521.18,2388.04,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051
13094,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,21.61,...,521.33,2388.08,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699


In [30]:
test_data.isna().sum()

Unnamed: 0,0
cycle number,0
operational setting 1,0
operational setting 2,0
operational setting 3,0
sensor measurement 1,0
sensor measurement 2,0
sensor measurement 3,0
sensor measurement 4,0
sensor measurement 5,0
sensor measurement 6,0


In [31]:
# Scale the test data
test_data_scaled = scaler.transform(test_data)

# Get the reconstruction for the new data
reconstructed_test = autoencoder.predict(test_data_scaled)

# Compute the reconstruction error (Mean Squared Error)
reconstruction_error_test = np.mean(np.square(test_data_scaled - reconstructed_test), axis=1)
print(f"Reconstruction Error for Normal Data: {reconstruction_error_test}")

[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Reconstruction Error for Normal Data: [0.71990878 0.86640339 0.66091553 ... 0.30333069 0.26049569 0.17847528]


In [32]:
# Scale Reconstruction error to probabilities
prob_scaler = MinMaxScaler()
prob_scores_test = prob_scaler.fit_transform(reconstruction_error_test.reshape(-1, 1))
prob_scores_test = 1 - prob_scores_test

In [33]:
updated_test_data =  test_data
updated_test_data['prob_score'] = prob_scores_test
updated_test_data

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 13,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,prob_score
0,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,...,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,0.438842
1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,...,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,0.313316
2,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,...,2388.03,8130.10,8.4441,0.03,393,2388,100.0,39.08,23.4166,0.489391
3,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,...,2388.05,8132.90,8.3917,0.03,391,2388,100.0,39.00,23.3737,0.444417
4,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,...,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.4130,0.377922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,21.61,...,2388.00,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974,0.848199
13092,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,21.61,...,2388.09,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771,0.844893
13093,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,21.61,...,2388.04,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051,0.795791
13094,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,21.61,...,2388.08,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699,0.832495


In [34]:
# Predict RUL using the trained OLS model
updated_test_data_with_constant = sm.add_constant(updated_test_data)
rul_predictions = model.predict(updated_test_data_with_constant)

updated_test_data['RUL'] = rul_predictions
updated_test_data

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,prob_score,RUL
0,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,...,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,0.438842,186.665053
1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,...,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,0.313316,196.259355
2,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,...,8130.10,8.4441,0.03,393,2388,100.0,39.08,23.4166,0.489391,177.044338
3,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,...,8132.90,8.3917,0.03,391,2388,100.0,39.00,23.3737,0.444417,184.150876
4,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,...,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.4130,0.377922,193.280034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,21.61,...,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974,0.848199,27.073974
13092,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,21.61,...,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771,0.844893,27.509325
13093,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,21.61,...,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051,0.795791,27.989304
13094,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,21.61,...,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699,0.832495,25.455646


In [35]:
updated_test_data[updated_test_data['prob_score'] > 0.9]

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 14,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,prob_score,RUL
2418,168,0.0005,0.0003,100.0,518.67,643.36,1599.19,1415.20,14.62,21.61,...,8143.22,8.4651,0.03,394,2388,100.0,38.50,23.2343,0.908102,43.405898
2419,169,-0.0014,-0.0004,100.0,518.67,642.55,1593.43,1423.55,14.62,21.61,...,8139.38,8.5198,0.03,395,2388,100.0,38.74,23.0718,0.920232,44.527016
2420,170,0.0011,0.0003,100.0,518.67,643.16,1599.33,1418.72,14.62,21.61,...,8144.74,8.5027,0.03,394,2388,100.0,38.71,23.2006,0.911081,47.524441
2422,172,0.0036,-0.0004,100.0,518.67,643.25,1597.93,1423.52,14.62,21.61,...,8138.73,8.4624,0.03,393,2388,100.0,38.51,23.2388,0.909770,45.701826
2423,173,0.0029,0.0003,100.0,518.67,643.61,1599.27,1421.49,14.62,21.61,...,8142.99,8.4736,0.03,397,2388,100.0,38.44,23.1382,0.914448,46.201707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11982,150,-0.0010,-0.0004,100.0,518.67,644.03,1597.06,1414.07,14.62,21.61,...,8122.89,8.4804,0.03,394,2388,100.0,38.53,23.1211,0.922935,44.623999
12215,233,0.0029,0.0001,100.0,518.67,643.26,1596.52,1419.09,14.62,21.61,...,8147.02,8.4792,0.03,394,2388,100.0,38.80,23.2335,0.901954,21.922190
12224,242,-0.0002,0.0004,100.0,518.67,643.06,1592.51,1418.32,14.62,21.61,...,8155.46,8.5153,0.03,394,2388,100.0,38.58,23.1203,0.906697,18.069915
12225,243,-0.0011,-0.0003,100.0,518.67,642.80,1596.39,1421.40,14.62,21.61,...,8151.52,8.4909,0.03,394,2388,100.0,38.60,23.2553,0.913967,16.421502


In [36]:
# Re-adding unit number
unit_number = pd.read_csv(test_import, sep='\s+', header=None)[0]
updated_test_data['unit_number'] = unit_number

In [37]:
updated_test_data

Unnamed: 0,cycle number,operational setting 1,operational setting 2,operational setting 3,sensor measurement 1,sensor measurement 2,sensor measurement 3,sensor measurement 4,sensor measurement 5,sensor measurement 6,...,sensor measurement 15,sensor measurement 16,sensor measurement 17,sensor measurement 18,sensor measurement 19,sensor measurement 20,sensor measurement 21,prob_score,RUL,unit_number
0,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,...,8.4052,0.03,392,2388,100.0,38.86,23.3735,0.438842,186.665053,1
1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,...,8.3803,0.03,393,2388,100.0,39.02,23.3916,0.313316,196.259355,1
2,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,...,8.4441,0.03,393,2388,100.0,39.08,23.4166,0.489391,177.044338,1
3,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,21.61,...,8.3917,0.03,391,2388,100.0,39.00,23.3737,0.444417,184.150876,1
4,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,21.61,...,8.4031,0.03,390,2388,100.0,38.99,23.4130,0.377922,193.280034,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,21.61,...,8.4715,0.03,394,2388,100.0,38.65,23.1974,0.848199,27.073974,100
13092,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,21.61,...,8.4512,0.03,395,2388,100.0,38.57,23.2771,0.844893,27.509325,100
13093,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,21.61,...,8.4569,0.03,395,2388,100.0,38.62,23.2051,0.795791,27.989304,100
13094,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,21.61,...,8.4711,0.03,395,2388,100.0,38.66,23.2699,0.832495,25.455646,100


In [38]:
RUL_real  = pd.read_csv(RUL_import, sep='\s+', header=None)
RUL_real.columns = ['RUL']
# Convert RUL_real to an array
RUL_real_array = RUL_real['RUL'].values

### Option 1 for predicting RUL: Predict using only the last data point

In [39]:
last_rul_by_unit = updated_test_data.groupby('unit_number')['RUL'].last()
last_rul_by_unit = last_rul_by_unit.values

In [40]:
option_1 = last_rul_by_unit - RUL_real_array
option_1 = option_1.astype(int)
np.mean(option_1)

15.55

### Option 2 for predicting RUL: Predict using the smallest RUL found in each unit number

In [41]:
# Group by 'unit_number' and get the minimum RUL for each group
min_rul_by_unit = updated_test_data.groupby('unit_number')['RUL'].min()

min_rul_by_unit

Unnamed: 0_level_0,RUL
unit_number,Unnamed: 1_level_1
1,169.627561
2,131.303469
3,75.957419
4,94.217529
5,101.335749
...,...
96,135.889329
97,98.180029
98,84.338581
99,148.775192


In [42]:
# Convert min_rul_by_unit to an array
min_rul_by_unit_array = min_rul_by_unit.values

In [43]:
option_2 = min_rul_by_unit_array - RUL_real_array
option_2 = option_2.astype(int)
np.mean(option_2)

10.49