In [7]:
import pandas as pd
import numpy as np

# Setting a random seed for reproducibility
np.random.seed(42)

# Generate 100 instances with hourly timestamps
timestamps = pd.date_range("2023-01-01", periods=100, freq="H")

# Synthetic Data Generation
data = {
    "Timestamp": timestamps,
    "Reallocated_Sector_Count": np.random.poisson(lam=5, size=100),  # Based on a Poisson distribution
    "Reported_Uncorrectable_Errors": np.random.poisson(lam=1, size=100),
    "Command_Timeout": np.random.poisson(lam=2, size=100),
    "Current_Pending_Sector_Count": np.random.poisson(lam=1, size=100),
    "Offline_Uncorrectable": np.random.poisson(lam=1, size=100),
    "Temperature": np.clip(np.random.normal(loc=35, scale=3, size=100), 20, 50),  # Normal distribution for temperature
    "Power_on_hours": np.arange(1000, 1100),  # Incremental by hour
}

# Simulate failure based on threshold triggers for demo purposes
failure_flags = []
for i in range(100):
    if (data["Reallocated_Sector_Count"][i] > 5 or
        data["Reported_Uncorrectable_Errors"][i] > 5 or
        data["Current_Pending_Sector_Count"][i] > 5 or
        data["Temperature"][i] > 45):  # Arbitrary failure criteria for demo
        failure_flags.append(1)
    else:
        failure_flags.append(0)

data["Failure_Flag"] = failure_flags

# Create the DataFrame
synthetic_df = pd.DataFrame(data)
synthetic_df.head(10)


  timestamps = pd.date_range("2023-01-01", periods=100, freq="H")


Unnamed: 0,Timestamp,Reallocated_Sector_Count,Reported_Uncorrectable_Errors,Command_Timeout,Current_Pending_Sector_Count,Offline_Uncorrectable,Temperature,Power_on_hours,Failure_Flag
0,2023-01-01 00:00:00,5,2,1,3,3,38.86927,1000,0
1,2023-01-01 01:00:00,4,0,1,1,1,34.2158,1001,0
2,2023-01-01 02:00:00,4,0,2,0,0,35.3615,1002,0
3,2023-01-01 03:00:00,5,0,2,0,1,31.913977,1003,0
4,2023-01-01 04:00:00,5,0,3,0,1,31.165932,1004,0
5,2023-01-01 05:00:00,3,0,1,0,2,30.068778,1005,0
6,2023-01-01 06:00:00,5,0,3,0,0,32.34451,1006,0
7,2023-01-01 07:00:00,4,0,2,0,1,39.657685,1007,0
8,2023-01-01 08:00:00,6,0,2,0,0,30.341819,1008,1
9,2023-01-01 09:00:00,7,1,6,0,0,34.089392,1009,1


### LSTM

In [2]:
!pip install tensorflow



In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the synthetic data
data = synthetic_df.copy()  # Replace with actual data if available

# Data preprocessing
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop(columns=["Timestamp", "Failure_Flag"]))
data_scaled = pd.DataFrame(scaled_features, columns=data.columns[1:-1])

# Parameters for sequence-based data
sequence_length = 24  # Use the last 24 hours as input for each prediction
X = []
y = []

# Create sequences for LSTM
for i in range(sequence_length, len(data_scaled)):
    X.append(data_scaled.iloc[i-sequence_length:i].values)  # Last 24 hours
    y.append(data["Failure_Flag"].iloc[i])  # Current label

X = np.array(X)
y = np.array(y)

# Train-test split (80-20 split)
split_index = int(0.8 * len(X))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# LSTM Model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(1, activation="sigmoid")  # Output layer for binary classification
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predict on test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

# Evaluate the model
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


  super().__init__(**kwargs)


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 521ms/step - accuracy: 0.5243 - loss: 0.6952 - val_accuracy: 0.5833 - val_loss: 0.6917
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.7604 - loss: 0.6804 - val_accuracy: 0.3333 - val_loss: 0.6967
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.6875 - loss: 0.6720 - val_accuracy: 0.2500 - val_loss: 0.7025
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.7118 - loss: 0.6601 - val_accuracy: 0.2500 - val_loss: 0.7087
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.7153 - loss: 0.6466 - val_accuracy: 0.2500 - val_loss: 0.7170
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.7153 - loss: 0.6353 - val_accuracy: 0.2500 - val_loss: 0.7280
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
