In [9]:
import pandas as pd
import numpy as np 
import sklearn as sk 
from scipy.stats import kurtosis, skew

In [10]:
df_failure = pd.read_csv('PdM_failures.csv')
df_errors = pd.read_csv('PdM_errors.csv')
df_machines = pd.read_csv('PdM_machines.csv')
df_maint = pd.read_csv('PdM_maint.csv')
df_telemetry = pd.read_csv('PdM_telemetry.csv')
collated = pd.read_csv('collated.csv')
df = df_telemetry.copy()
df['datetime'] = pd.to_datetime(df['datetime'])
df['date'] = df['datetime'].dt.date
df['hour'] = df['datetime'].dt.strftime('%H:%M:%S')

In [11]:
# Function to calculate Mean Absolute Percentage Error (MAPE)
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Function to calculate Root Mean Squared Error (RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

# Function to calculate cross-correlation between two signals
def cross_correlation(x, y):
    return np.correlate(x, y, mode='same')

# Function to calculate energy of the signal
def energy(x):
    return np.sum(x**2)/100



In [12]:
df['fft_rotate'] = np.fft.fft(df['rotate'])
df['fft_rotate_magnitude'] = np.abs(df['fft_rotate'])
df['fft_rotate_phase'] = np.angle(df['fft_rotate'])
df['fft_vibration'] = np.fft.fft(df['vibration'])
df['fft_vibration_magnitude'] = np.abs(df['fft_vibration'])
df['fft_vibration_phase'] = np.angle(df['fft_vibration'])
df['fft_pressure'] = np.fft.fft(df['pressure'])
df['fft_pressure_magnitude'] = np.abs(df['fft_pressure'])
df['fft_pressure_phase'] = np.angle(df['fft_pressure'])
df['fft_volt'] = np.fft.fft(df['volt'])
df['fft_volt_magnitude'] = np.abs(df['fft_volt'])
df['fft_volt_phase'] = np.angle(df['fft_volt'])


df['signal_mag_area'] = (df['fft_rotate_magnitude'] + df['fft_vibration_magnitude'] + df['fft_pressure_magnitude'] + df['fft_volt_magnitude'])/100


In [13]:
# Step 1: Group the DataFrame by 'date' and 'machineID'
grouped_data = df.groupby(['date', 'machineID'])

# Step 2: Apply aggregation functions on sensor columns for each group
aggregated_data = grouped_data.agg({
    'rotate': ['min', 'max', lambda x: kurtosis(x), lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'volt': ['min', 'max', lambda x: kurtosis(x), lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'pressure': ['min', 'max', lambda x: kurtosis(x), lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'vibration': ['min', 'max', lambda x: kurtosis(x), lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'fft_rotate_magnitude': ['min', 'max',lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'fft_pressure_magnitude': ['min', 'max',lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'fft_vibration_magnitude': ['min', 'max',lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count'],
    'fft_volt_magnitude': ['min', 'max',lambda x: skew(x), lambda x: x.quantile(0.75) - x.quantile(0.25), 'median', 'mean','count']
})

# Add point MAPE, RMSE, cross-correlation, energy, and frequency domain features
for sensor in ['rotate','volt','pressure','vibration','fft_rotate_magnitude','fft_pressure_magnitude','fft_vibration_magnitude','fft_volt_magnitude','signal_mag_area']:
    aggregated_data[sensor + '_mape'] = grouped_data[sensor].apply(lambda x: mape(x, x.median()))
    aggregated_data[sensor + '_rmse'] = grouped_data[sensor].apply(lambda x: rmse(x, x.median()))
    aggregated_data[sensor + '_energy'] = grouped_data[sensor].apply(lambda x: energy(x))

# Step 3: Flatten the multi-level column index
aggregated_data.columns = ['_'.join(col).strip() for col in aggregated_data.columns.values]

# Optionally, you can reset the index to get 'date' and 'machineID' as separate columns
aggregated_data.reset_index(inplace=True)

aggregated_data


Unnamed: 0,date,machineID,rotate_min,rotate_max,rotate_<lambda_0>,rotate_<lambda_1>,rotate_<lambda_2>,rotate_median,rotate_mean,rotate_count,...,fft_pressure_magnitude_energy_,fft_vibration_magnitude_mape_,fft_vibration_magnitude_rmse_,fft_vibration_magnitude_energy_,fft_volt_magnitude_mape_,fft_volt_magnitude_rmse_,fft_volt_magnitude_energy_,signal_mag_area_mape_,signal_mag_area_rmse_,signal_mag_area_energy_
0,2015-01-01,1,346.149335,527.349825,-0.790617,0.156417,78.750562,432.850118,440.515328,18,...,7.807908e+13,34.686905,8.337336e+06,1.251839e+13,50.344818,3.526230e+07,2.238566e+14,31.095042,1.566259e+06,4.417369e+11
1,2015-01-01,2,369.738792,543.802540,0.093926,0.364777,47.779860,444.667492,445.200094,18,...,8.793219e+07,78.885919,4.052344e+03,1.461031e+07,59.472938,1.233967e+04,1.327563e+08,21.840449,3.890529e+02,2.483783e+05
2,2015-01-01,3,382.648588,531.139800,-1.240765,0.072763,70.039863,461.496434,454.152365,18,...,1.933841e+07,52.362296,2.165231e+03,3.243161e+06,51.215365,6.231189e+03,4.040704e+07,26.900778,2.764719e+02,1.199865e+05
3,2015-01-01,4,327.243866,551.327283,0.721164,0.150727,43.058493,444.503545,447.758764,18,...,3.488463e+07,157.143177,3.017703e+03,4.651890e+06,43.115204,5.029131e+03,2.867365e+07,22.249581,2.267437e+02,1.382671e+05
4,2015-01-01,5,308.578855,539.732729,-0.368387,-0.589313,94.737470,461.502012,450.183235,18,...,1.594540e+07,90.424001,2.451706e+03,4.864141e+06,65.013985,5.540018e+03,3.057701e+07,25.863185,2.079213e+02,1.065060e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36595,2016-01-01,96,360.998546,520.313613,-1.145422,-0.703820,78.128461,487.628145,458.058551,7,...,7.920819e+06,138.299589,2.620230e+03,1.947443e+06,105.411223,5.738522e+03,1.126734e+07,28.126394,1.757182e+02,2.781661e+04
36596,2016-01-01,97,392.702026,522.348411,-1.148606,0.216379,63.044880,458.143799,449.519362,7,...,2.064831e+07,88.794129,2.270597e+03,6.343661e+05,51.068321,5.682096e+03,1.064753e+07,14.330537,1.305454e+02,4.452638e+04
36597,2016-01-01,98,389.828191,526.828641,-0.930038,-0.102140,53.929362,450.198921,461.853080,7,...,9.068055e+06,39.005014,1.100892e+03,4.642791e+05,73.961903,5.745381e+03,1.249095e+07,21.871390,3.041467e+02,5.399709e+04
36598,2016-01-01,99,416.284422,491.390537,-1.500171,0.009017,44.860950,462.373730,450.518923,7,...,2.070457e+07,113.770082,4.705816e+03,6.433879e+06,28.340584,1.119315e+04,5.542648e+07,20.784016,5.197250e+02,1.217503e+05


In [15]:
#aggregated_data.to_csv('aggregated.csv')
df_agg = aggregated_data.copy()


def merger_with_duplicate_row_remover(df1 , df2 ):
    print("*"*200)
    if ("date" in df2.columns):
        merged_df =pd.merge(df1, df2, on=['date','machineID'],how='left')
        merged_df = merged_df.replace(np.NaN,0)
        print("Shape of left dataset:                             ",df1.shape)
        print("Shape of the right dataset:                        ",df2.shape)
        print("Shape of merged dataset before checking duplicates:",merged_df.shape)

        #creating an extra column that will have unique datetime+machineID
        merged_df['combo'] = merged_df['machineID'].astype(str) +"~"+ merged_df['date'].astype(str) 
        # merged_df['combo'].value_counts() to check duplicates Anything greater than 1 will be duplicated
        li = merged_df['combo'].value_counts()
        valids = li[li > 1].index  
        print("Duplicate rows found:", len(valids))

        merged_df[merged_df['combo'].isin(valids)] #create a dataframe To get rows of deficit indices
        # Here dropping the duplicate rows becomes essential 
        merged_df = merged_df.drop_duplicates(subset=['combo']) 
        print("Duplicates rows removed:", len(valids)/2 )
        print("Shape of merged dataset after removing duplicate columns:", merged_df.shape)
    else:
        # Machine dataframe has no datatime plus no duplicates
        merged_df =pd.merge(df1, df2, on=['machineID'],how='left')
        merged_df = merged_df.replace(np.NaN,0)
        print("Shape of left dataset:                             ",df1.shape)
        print("Shape of the right dataset:                        ",df2.shape)
        print("Shape of merged dataset before checking duplicates:",merged_df.shape)
        
    return merged_df

err = df_errors.copy()
err['datetime'] = pd.to_datetime(err['datetime'])
err['date'] = err['datetime'].dt.date
maint = df_maint.copy()
maint['datetime'] = pd.to_datetime(maint['datetime'])
maint['date'] = maint['datetime'].dt.date

fail = df_failure.copy()
fail['datetime'] = pd.to_datetime(fail['datetime'])
fail['date'] = fail['datetime'].dt.date

data = pd.merge(df_agg, df_machines, how= 'left', on='machineID')
data = merger_with_duplicate_row_remover(data,err)
data = merger_with_duplicate_row_remover(data,maint)
data = merger_with_duplicate_row_remover(data,fail)
data.head()

********************************************************************************************************************************************************************************************************
Shape of left dataset:                              (36600, 91)
Shape of the right dataset:                         (3919, 4)
Shape of merged dataset before checking duplicates: (37078, 93)
Duplicate rows found: 422
Duplicates rows removed: 211.0
Shape of merged dataset after removing duplicate columns: (36600, 94)
********************************************************************************************************************************************************************************************************
Shape of left dataset:                              (36600, 94)
Shape of the right dataset:                         (3286, 4)
Shape of merged dataset before checking duplicates: (37323, 96)
Duplicate rows found: 723
Duplicates rows removed: 361.5
Shape of merged dataset after remo

Unnamed: 0,date,machineID,rotate_min,rotate_max,rotate_<lambda_0>,rotate_<lambda_1>,rotate_<lambda_2>,rotate_median,rotate_mean,rotate_count,...,signal_mag_area_energy_,model,age,datetime_x,errorID,combo,datetime_y,comp,datetime,failure
0,2015-01-01,1,346.149335,527.349825,-0.790617,0.156417,78.750562,432.850118,440.515328,18,...,441736900000.0,model3,18,0,0,1~2015-01-01,0,0,0,0
1,2015-01-01,2,369.738792,543.80254,0.093926,0.364777,47.77986,444.667492,445.200094,18,...,248378.3,model4,7,0,0,2~2015-01-01,0,0,0,0
2,2015-01-01,3,382.648588,531.1398,-1.240765,0.072763,70.039863,461.496434,454.152365,18,...,119986.5,model3,8,0,0,3~2015-01-01,0,0,0,0
3,2015-01-01,4,327.243866,551.327283,0.721164,0.150727,43.058493,444.503545,447.758764,18,...,138267.1,model3,7,0,0,4~2015-01-01,0,0,0,0
4,2015-01-01,5,308.578855,539.732729,-0.368387,-0.589313,94.73747,461.502012,450.183235,18,...,106506.0,model3,2,0,0,5~2015-01-01,0,0,0,0


In [16]:
#Further target column will be transformed with label encoding ,and other categorical columns with dummy encoding 

data_encoded = pd.get_dummies(data, columns=['comp', 'errorID', 'model'], drop_first=True)
data_encoded = data_encoded.replace({'failure' : 0}, '0')
print(data_encoded.failure.unique()) #To verify the change 
data_encoded.head(2)
data_encoded.drop(["combo","datetime_x", 'datetime_y','datetime'] , axis=1 , inplace=True)

['0' 'comp1' 'comp4' 'comp3' 'comp2']


In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Get X_train and y_train : original training data
X_train, X_test, y_train, y_test = train_test_split(data_encoded.drop(['failure','date'], axis=1), data_encoded['failure'], test_size=0.2, random_state=42)

# Split the data into training set and validation set (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.18, random_state=42)

# Now we have X_train and y_train as the new training set, and X_val and y_val as the validation set.

# Fit the encoder on y_train to learn the mapping
enc = LabelEncoder()
enc.fit(y_train)
enc.fit(y_val)

y_train = enc.transform(y_train)
y_test = enc.transform(y_test)
y_val_encoded = enc.transform(y_val)

#Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
# Create and train the Logistic Regression model
mdl = LogisticRegression(max_iter=8000)
mdl.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = mdl.predict(X_val) 
y_val_pred_prob = mdl.predict_proba(X_val)[:,1]    # Predicts the probability of the validation data

# Evaluate the model on the validation set
print("Validation Set Evaluation:")
print("Accuracy: ", accuracy_score(y_val_encoded, y_val_pred))
print("Precision: ", precision_score(y_val_encoded, y_val_pred, average='macro'))
print("Recall: ", recall_score(y_val_encoded, y_val_pred, average='macro'))
print("F1: ", f1_score(y_val_encoded, y_val_pred, average='macro'))
print("Confusion Matrix:")
print(confusion_matrix(y_val_encoded, y_val_pred))
print("Classification Report:")
print(classification_report(y_val_encoded, y_val_pred))

# Predict on the test set
y_test_pred = mdl.predict(X_test) 
y_test_pred_prob = mdl.predict_proba(X_test)[:,1]    # Predicts the probability of the test data

# Evaluate the model on the test set
print("Test Set Evaluation:")
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print("Precision: ", precision_score(y_test, y_test_pred, average='macro'))
print("Recall: ", recall_score(y_test, y_test_pred, average='macro'))
print("F1: ", f1_score(y_test, y_test_pred, average='macro'))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("Classification Report:")
print(classification_report(y_test, y_test_pred))


Validation Set Evaluation:
Accuracy:  0.9893758300132802
Precision:  0.7629240169764507
Recall:  0.6915949178151
F1:  0.7182985952330304
Confusion Matrix:
[[5159    6    8    4    1]
 [  10    9    1    2    0]
 [  12    0   17    0    0]
 [   3    0    0   13    0]
 [   7    1    1    0   17]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5178
           1       0.56      0.41      0.47        22
           2       0.63      0.59      0.61        29
           3       0.68      0.81      0.74        16
           4       0.94      0.65      0.77        26

    accuracy                           0.99      5271
   macro avg       0.76      0.69      0.72      5271
weighted avg       0.99      0.99      0.99      5271

Test Set Evaluation:
Accuracy:  0.9901639344262295
Precision:  0.7812758397932816
Recall:  0.7340519549271931
F1:  0.7518573959815164
Confusion Matrix:
[[7162    8   16    2    1]
 [  19   16 

In [18]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Assuming you have the features (X_train) and labels (y_train) ready in a DataFrame format
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define the parameters for XGBoost
params = {
    'objective': 'binary:logistic',  # For binary classification
    'eval_metric': ['logloss', 'error', 'auc'],  # Define multiple evaluation metrics
    'max_depth': 3,
    'eta': 0.1,
    'seed': 42
}

# Train the XGBoost model
num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_rounds)

# Make predictions
y_pred_probs = xgb_model.predict(dtest)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_probs]

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_probs)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


ModuleNotFoundError: No module named 'xgboost'