### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from sklearn.ensemble import AdaBoostClassifier

### Reading the data

In [5]:
downtime_data = pd.read_excel('../Raw Data/Equipment downtime data (202308).xlsx')
hierarchy_data = pd.read_excel('../Raw Data/Operation level of equipment.xlsx')
downtimeNew = pd.read_excel('../Raw Data/Equipment downtime data (202310).xlsx')
downtime_data.to_csv('../Raw Data/Equipment downtime data (202308).csv', index=False)
hierarchy_data.to_csv('../Raw Data/Operation level of equipment.csv', index=False)
downtimeNew.to_csv('../Raw Data/Equipment downtime data (202310).csv', index=False)

### Reformatting date and time

In [6]:
downtime_data['FaultDate'] = pd.to_datetime(downtime_data['FaultDate'], errors='coerce').dt.date
downtimeNew['FaultDate'] = pd.to_datetime(downtimeNew['FaultDate'], errors='coerce').dt.date
downtime_data['FaultTime'] = downtime_data['FaultTime'].apply(lambda x: x.strftime('%H:%M:%S') if len(str(x)) > 8 else x)
downtimeNew['FaultTime'] = downtimeNew['FaultTime'].apply(lambda x: x.strftime('%H:%M:%S') if len(str(x)) > 8 else x)
downtimeNew = downtimeNew.dropna(subset=['FaultDate', 'FaultTime'])

### Dropping data that is not needed

In [7]:
pattern_data = downtime_data.iloc[:, :14]
pattern_new = downtimeNew.iloc[:, :14]
pattern_data = pattern_data.drop(['ID', 'DutyOfficer', 'Manager email address'], axis=1)
pattern_new = pattern_new.drop(['ID', 'DutyOfficer', 'Manager email address'], axis=1)
pattern_data['FaultDateTime'] = pd.to_datetime(pattern_data['FaultDate'].astype(str) + ' ' + pattern_data['FaultTime'].astype(str))
pattern_new['FaultDateTime'] = pd.to_datetime(pattern_new['FaultDate'].astype(str) + ' ' + pattern_new['FaultTime'].astype(str))
dt2010 = pd.to_datetime('2010-01-01 00:00:00')
dtnew = pd.to_datetime('2023-08-04 06:00:00')
pattern_data.drop(['FaultDate', 'FaultTime'], axis=1, inplace=True)
pattern_new.drop(['FaultDate', 'FaultTime'], axis=1, inplace=True)
pattern_data.sort_values(by=['FaultDateTime'], inplace=True)
pattern_new.sort_values(by=['FaultDateTime'], inplace=True)
pattern_data = pattern_data[pattern_data['FaultDateTime'] >= dt2010]
pattern_new = pattern_new[pattern_new['FaultDateTime'] >= dtnew]

In [8]:
pattern_data = pattern_data.drop(['LogEntry', 'DutyOfficer comments', 'Managerscomments', 'FaultRepair', 'FaultDescription', 'Group', 'Downtime', 'User Run'], axis=1)
pattern_new = pattern_new.drop(['LogEntry', 'DutyOfficer comments', 'Managerscomments', 'FaultRepair', 'FaultDescription', 'Group', 'Downtime', 'User Run'], axis=1)

### Preparing for labeling and encoding

In [9]:
pattern_data['Equipment'] = pattern_data['Equipment'].str.lower()
pattern_data['Equipment'] = pattern_data['Equipment'].str.replace('[^\w\s]', '')

  pattern_data['Equipment'] = pattern_data['Equipment'].str.replace('[^\w\s]', '')


In [10]:
pattern_new['Equipment'] = pattern_new['Equipment'].str.lower()
pattern_new['Equipment'] = pattern_new['Equipment'].str.replace('[^\w\s]', '')

  pattern_new['Equipment'] = pattern_new['Equipment'].str.replace('[^\w\s]', '')


### Creating the data frame for backwards tracking from failure

In [11]:
dfprec = pd.DataFrame(columns=set(pattern_data['Equipment'].tolist()))
for label in dfprec:
    dfprec[label] = [0]*pattern_data.shape[0]
labels = pd.DataFrame()
labels['Label'] = [0]*pattern_data.shape[0]

### Filling out the data frame and creating labels

In [12]:
twindow = pd.Timedelta(days=5)
for i in range(pattern_data.shape[0]):
    cur_eq = pattern_data.iloc[i]['Equipment']
    cur_dt = pattern_data.iloc[i]['FaultDateTime']
    if cur_eq == 'ion source':
        labels.iloc[i]['Label'] = 1
        dfprec[cur_eq][i] -= 1
    for j in range(i, -1, -1):
        inner_eq = pattern_data.iloc[j]['Equipment']
        inner_dt = pattern_data.iloc[j]['FaultDateTime']
        if (cur_dt - inner_dt) <= twindow:
            if j != i:
                dfprec[inner_eq][i] += 1
        else:
            break

KeyboardInterrupt: 

### Train-test split

In [None]:
training_x, testing_x, training_y, testing_y = train_test_split(dfprec, labels, test_size=0.2, shuffle=False)

### Models

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
)
training_x.fillna(-1, inplace=True)
testing_x.fillna(-1, inplace=True)

rf.fit(training_x, training_y['Label'])
rf_pred = rf.predict(testing_x)

accuracy = accuracy_score(testing_y, rf_pred)
precision = precision_score(testing_y, rf_pred, zero_division=1)
recall = recall_score(testing_y, rf_pred, zero_division=1)
f1 = f1_score(testing_y, rf_pred, zero_division=1)
conf = confusion_matrix(testing_y, rf_pred)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.9674713584288053, Precision: 0.7883211678832117, Recall: 0.453781512605042, F1-Score: 0.576
Confusion Matrix: 
 [[4621   29]
 [ 130  108]]


In [None]:
dtrain = xgb.DMatrix(training_x, label=training_y)

dtest = xgb.DMatrix(testing_x)

num_round = 100

params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'eta': 0.1,
    'eval_metric': 'logloss',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': len(training_y[training_y == 0]) / len(training_y[training_y == 1])
}

model_labels = xgb.train(params, dtrain, num_round)

model_pred = model_labels.predict(dtest)

model_pred = [int(round(value)) for value in model_pred]

accuracy = accuracy_score(testing_y, model_pred)
precision = precision_score(testing_y, model_pred)
recall = recall_score(testing_y, model_pred)
f1 = f1_score(testing_y, model_pred)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")

Accuracy: 0.9615384615384616, Precision: 0.9166666666666666, Recall: 0.23109243697478993, F1-Score: 0.3691275167785235


### Look at n number of failures and predict if fail will happen in X days (3 for now)

In [None]:
set1 = set(pattern_data['Equipment'].tolist())
set2 = set(pattern_new['Equipment'].tolist())
combined = set1.union(set2)

In [None]:
dfprec1 = pd.DataFrame(columns=combined)
for label in dfprec1:
    dfprec1[label] = [0]*pattern_data.shape[0]
labels1 = pd.DataFrame()
labels1['Label'] = [0]*pattern_data.shape[0]

### For verification purposes for latest user run

In [None]:
dfprecnew = pd.DataFrame(columns=combined)
for label in dfprecnew:
    dfprecnew[label] = [0]*pattern_new.shape[0]
labelsnew = pd.DataFrame()
labelsnew['Label'] = [0]*pattern_new.shape[0]

### Data frames and labels

In [None]:
twindow = pd.Timedelta(days=3)
for i in range(pattern_data.shape[0]):
    temp = 0
    for j in range(i, i+40):
        if j>=pattern_data.shape[0]:
            break
        inner_eq = pattern_data.iloc[j]['Equipment']
        dfprec1[inner_eq][i] += 1
        temp = j
    cur_dt = pattern_data.iloc[temp]['FaultDateTime']
    for j in range(temp, pattern_data.shape[0]):
        inner_eq = pattern_data.iloc[j]['Equipment']
        inner_dt = pattern_data.iloc[j]['FaultDateTime']
        if inner_eq == 'ion source':
            labels1.iloc[i]['Label'] = 1
            break
        if (inner_dt - cur_dt) >= twindow:
            break

In [None]:
twindow = pd.Timedelta(days=3)
for i in range(pattern_new.shape[0]):
    temp = 0
    for j in range(i, i+40):
        if j>=pattern_new.shape[0]:
            break
        inner_eq = pattern_new.iloc[j]['Equipment']
        dfprecnew[inner_eq][i] += 1
        temp = j
    cur_dt = pattern_new.iloc[temp]['FaultDateTime']
    for j in range(temp, pattern_new.shape[0]):
        inner_eq = pattern_new.iloc[j]['Equipment']
        inner_dt = pattern_new.iloc[j]['FaultDateTime']
        if inner_eq == 'ion source':
            labelsnew.iloc[i]['Label'] = 1
            break
        if (inner_dt - cur_dt) >= twindow:
            break

In [None]:
pattern_combined = pd.concat([pattern_data, pattern_new], axis=0)

In [None]:
dfpreccomb = pd.DataFrame(columns=combined)
for label in dfpreccomb:
    dfpreccomb[label] = [0]*pattern_combined.shape[0]
labelscombined = pd.DataFrame()
labelscombined['Label'] = [0]*pattern_combined.shape[0]

In [None]:
twindow = pd.Timedelta(days=3)
for i in range(pattern_combined.shape[0]):
    temp = 0
    for j in range(i, i+40):
        if j>=pattern_combined.shape[0]:
            break
        inner_eq = pattern_combined.iloc[j]['Equipment']
        dfpreccomb[inner_eq][i] += 1
        temp = j
    cur_dt = pattern_combined.iloc[temp]['FaultDateTime']
    for j in range(temp, pattern_combined.shape[0]):
        inner_eq = pattern_combined.iloc[j]['Equipment']
        inner_dt = pattern_combined.iloc[j]['FaultDateTime']
        if inner_eq == 'ion source':
            labelscombined.iloc[i]['Label'] = 1
            break
        if (inner_dt - cur_dt) >= twindow:
            break

In [None]:
training_x, testing_x, training_y, testing_y = train_test_split(dfprec1, labels1, test_size=0.2, shuffle=False)

### Models

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,

)
training_x.fillna(-1, inplace=True)
testing_x.fillna(-1, inplace=True)

rf.fit(training_x, training_y['Label'])
rf_pred = rf.predict(testing_x)

accuracy = accuracy_score(testing_y, rf_pred)
precision = precision_score(testing_y, rf_pred, zero_division=1)
recall = recall_score(testing_y, rf_pred, zero_division=1)
f1 = f1_score(testing_y, rf_pred, zero_division=1)
conf = confusion_matrix(testing_y, rf_pred)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.978518821603928, Precision: 0.9759463722397477, Recall: 0.982532751091703, F1-Score: 0.9792284866468843
Confusion Matrix: 
 [[2308   61]
 [  44 2475]]


In [None]:
rf_prednew = rf.predict(dfprecnew)
accuracy = accuracy_score(labelsnew, rf_prednew)
precision = precision_score(labelsnew, rf_prednew, zero_division=1)
recall = recall_score(labelsnew, rf_prednew, zero_division=1)
f1 = f1_score(labelsnew, rf_prednew, zero_division=1)
conf = confusion_matrix(labelsnew, rf_prednew)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.6092436974789915, Precision: 0.6420454545454546, Recall: 0.7902097902097902, F1-Score: 0.7084639498432602
Confusion Matrix: 
 [[ 64 126]
 [ 60 226]]


In [None]:
dtrain = xgb.DMatrix(training_x, label=training_y)

dtest = xgb.DMatrix(testing_x)

num_round = 100

params = {
    'max_depth': 3,
    'eta': 0.1
}

model_labels = xgb.train(params, dtrain, num_round)

model_pred = model_labels.predict(dtest)

model_pred = [int(round(value)) for value in model_pred]

accuracy = accuracy_score(testing_y, model_pred)
precision = precision_score(testing_y, model_pred)
recall = recall_score(testing_y, model_pred)
f1 = f1_score(testing_y, model_pred)
conf = confusion_matrix(testing_y, model_pred)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.7780278232405892, Precision: 0.7858851674641149, Recall: 0.7824533545057563, F1-Score: 0.7841655062661628
Confusion Matrix: 
 [[1832  537]
 [ 548 1971]]


In [None]:
svc = SVC(
    C=1,
    kernel='rbf',
    gamma=0.001,
    max_iter=-1
)

svc.fit(training_x, training_y['Label'])
svc_pred = svc.predict(testing_x)

accuracy = accuracy_score(testing_y, svc_pred)
precision = precision_score(testing_y, svc_pred)
recall = recall_score(testing_y, svc_pred)
f1 = f1_score(testing_y, svc_pred)
conf = confusion_matrix(testing_y, svc_pred)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.7375204582651391, Precision: 0.7634271099744245, Recall: 0.7109964271536324, F1-Score: 0.7362795477903392
Confusion Matrix: 
 [[1814  555]
 [ 728 1791]]


In [None]:
svc_prednew = svc.predict(dfprecnew)
accuracy = accuracy_score(labelsnew, svc_prednew)
precision = precision_score(labelsnew, svc_prednew)
recall = recall_score(labelsnew, svc_prednew)
f1 = f1_score(labelsnew, svc_prednew)
conf = confusion_matrix(labelsnew, svc_prednew)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.5861344537815126, Precision: 0.6072289156626506, Recall: 0.8811188811188811, F1-Score: 0.7189728958630528
Confusion Matrix: 
 [[ 27 163]
 [ 34 252]]


In [None]:
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1)

model = keras.Sequential([
    keras.layers.Dense(512, input_dim=training_x.shape[1], activation='relu'),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(training_x, training_y, epochs=10, batch_size=32)

y_pred = (model.predict(testing_x) > 0.5).astype(int)

recall = recall_score(testing_y, y_pred)
print(recall)
# Summary of the model architecture

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.9722111949186185


In [None]:
y_prednew = (model.predict(dfprecnew) > 0.5).astype(int)

recall = recall_score(labelsnew, y_prednew)
print(recall)

0.6678321678321678


In [None]:
ada1 = AdaBoostClassifier(
    estimator=RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        criterion='entropy',
        min_samples_split=2,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
    ),
    n_estimators=100,
    learning_rate=0.1
)
ada1.fit(training_x, training_y['Label'])


In [None]:
ada_pred1 = ada1.predict(testing_x)

accuracy = accuracy_score(testing_y, ada_pred1)
precision = precision_score(testing_y, ada_pred1)
recall = recall_score(testing_y, ada_pred1)
f1 = f1_score(testing_y, ada_pred1)
conf = confusion_matrix(testing_y, ada_pred1)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.9195990180032734, Precision: 0.9289749798224375, Recall: 0.9138547042477173, F1-Score: 0.9213528116870122
Confusion Matrix: 
 [[2193  176]
 [ 217 2302]]


In [None]:
ada_prednew1 = ada1.predict(dfprecnew)
accuracy = accuracy_score(labelsnew, ada_prednew1)
precision = precision_score(labelsnew, ada_prednew1)
recall = recall_score(labelsnew, ada_prednew1)
f1 = f1_score(labelsnew, ada_prednew1)
conf = confusion_matrix(labelsnew, ada_prednew1)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.6638655462184874, Precision: 0.6730769230769231, Recall: 0.8566433566433567, F1-Score: 0.7538461538461538
Confusion Matrix: 
 [[ 71 119]
 [ 41 245]]


### Changing approach, instead of having a dataframe where equipment pieces are rows and the values are number 
### of fails to having a each column be a specific failed piece

In [None]:
encoded = pattern_data.copy()
encoded['Equipment'] = LabelEncoder().fit_transform(encoded['Equipment'])

In [None]:
column_names = []
for i in range(20):
    column_names.append(f"Failed Piece {i+1}")
newTactic = pd.DataFrame(columns = column_names)
for column in newTactic:
    newTactic[column] = [-1]*encoded.shape[0]
newTacticLabel = [0]*encoded.shape[0]

In [None]:
twindow = pd.Timedelta(days=3)

for i in range(encoded.shape[0]):
    temp = 0
    for j in range(i, i+20):
        if j>=encoded.shape[0]:
            break
        inner_eq = encoded.iloc[j]['Equipment']
        newTactic[f"Failed Piece {j-i+1}"][i] = inner_eq
        temp = j
    cur_dt = encoded.iloc[temp]['FaultDateTime']
    for j in range(temp, encoded.shape[0]):
        inner_eq = pattern_data.iloc[j]['Equipment']
        inner_dt = encoded.iloc[j]['FaultDateTime']
        if inner_eq == 'ion source':
            newTacticLabel[i] = 1
            break
        if (inner_dt - cur_dt) >= twindow:
            break

In [None]:
training_x, testing_x, training_y, testing_y = train_test_split(newTactic, newTacticLabel, test_size=0.2, shuffle=True)

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,

)
training_x.fillna(-1, inplace=True)
testing_x.fillna(-1, inplace=True)

rf.fit(training_x, training_y)
rf_pred = rf.predict(testing_x)

accuracy = accuracy_score(testing_y, rf_pred)
precision = precision_score(testing_y, rf_pred, zero_division=1)
recall = recall_score(testing_y, rf_pred, zero_division=1)
f1 = f1_score(testing_y, rf_pred, zero_division=1)
conf = confusion_matrix(testing_y, rf_pred)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
print(f"Confusion Matrix: \n {conf}")

Accuracy: 0.661620294599018, Precision: 0.6768935762224353, Recall: 0.9022364217252397, F1-Score: 0.7734867159682279
Confusion Matrix: 
 [[ 410 1348]
 [ 306 2824]]


Change the zeros table to a time window instead of n components

Change the second approach to include timestamps
