<a href="https://colab.research.google.com/github/ShivamG0897/Predictive-Maintenance/blob/ShivamG0897-patch-5/Log_reg_both_methods_PdM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive at /content/drive; the CSV files read later live underneath it
from google.colab import drive
drive.mount('/content/drive')

# GPU check: switch the runtime to GPU first (Runtime menu > change runtime type),
# then confirm here that TensorFlow can see the device
import tensorflow as tf
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

Mounted at /content/drive
GPU is available


----------
***Method 1 (Traditional Approach)***

In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
from scipy.stats import kurtosis, skew

In [None]:
# Raw tables, read from the mounted Drive folder
df_telemetry = pd.read_csv('/content/drive/MyDrive/data files/PdM_telemetry.csv')
df_errors = pd.read_csv('/content/drive/MyDrive/data files/PdM_errors.csv')
df_maint = pd.read_csv('/content/drive/MyDrive/data files/PdM_maint.csv')
df_failure = pd.read_csv('/content/drive/MyDrive/data files/PdM_failures.csv')
df_machines = pd.read_csv('/content/drive/MyDrive/data files/PdM_machines.csv')

# Working copy of the telemetry with a parsed timestamp, its calendar date and a
# time-of-day string; df_telemetry itself is left untouched (assign returns a new frame).
telemetry_timestamps = pd.to_datetime(df_telemetry['datetime'])
df = df_telemetry.assign(
    datetime=telemetry_timestamps,
    date=telemetry_timestamps.dt.date,
    hour=telemetry_timestamps.dt.strftime('%H:%M:%S'),
)

In [None]:
def mape(y_true, y_pred):
    """Mean Absolute Percentage Error, expressed in percent.

    Args:
        y_true: observed values (array-like supporting element-wise arithmetic).
        y_pred: predictions, either the same shape as ``y_true`` or a scalar.

    Returns:
        Mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.

    Note:
        Entries of ``y_true`` equal to zero are divided by as-is, so they yield
        inf/nan in the result.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100

def rmse(y_true, y_pred):
    """Root Mean Squared Error between observations and predictions.

    Args:
        y_true: observed values (array-like supporting element-wise arithmetic).
        y_pred: predictions, either the same shape as ``y_true`` or a scalar.

    Returns:
        Square root of the mean squared difference.
    """
    residuals = y_true - y_pred
    mean_squared_error = np.mean(residuals**2)
    return np.sqrt(mean_squared_error)

def energy(x):
    """Signal energy: the sum of squared samples, divided by a fixed factor of 100.

    Args:
        x: samples (array-like supporting ``**``).

    Returns:
        ``sum(x**2) / 100``.
    """
    sum_of_squares = np.sum(x**2)
    return sum_of_squares / 100

# Frequency-domain columns: for every sensor keep the complex FFT of the column
# together with its magnitude and phase.
# NOTE(review): the transform is taken over the entire column in one go (presumably all
# machines concatenated), not per machine or per day — confirm this is intended.
for sensor in ['rotate', 'vibration', 'pressure', 'volt']:
    spectrum = np.fft.fft(df[sensor])
    df['fft_' + sensor] = spectrum
    df['fft_' + sensor + '_magnitude'] = np.abs(spectrum)
    df['fft_' + sensor + '_phase'] = np.angle(spectrum)

In [None]:
# Step 1: one group per (calendar day, machine)
grouped_data = df.groupby(['date', 'machineID'])

# Step 2: summary statistics per group.
# The lambdas stay anonymous on purpose: pandas labels them <lambda_0>, <lambda_1>, ...
# by their position in the list, which keeps the column names rotate_<lambda_0> etc.
sensors = ['rotate', 'volt', 'pressure', 'vibration']
raw_stats = ['min', 'max',
             lambda x: kurtosis(x),                           # <lambda_0>
             lambda x: skew(x),                               # <lambda_1>
             lambda x: x.quantile(0.75) - x.quantile(0.25),   # <lambda_2>: interquartile range
             'median', 'mean', 'count']
fft_stats = ['min', 'max',
             lambda x: skew(x),                               # <lambda_0>
             lambda x: x.quantile(0.75) - x.quantile(0.25),   # <lambda_1>: interquartile range
             'median', 'mean', 'count']

# Insertion order fixes the output column order: raw sensors first, then FFT magnitudes
agg_spec = {sensor: raw_stats for sensor in sensors}
agg_spec.update({'fft_' + sensor + '_magnitude': fft_stats
                 for sensor in ['rotate', 'pressure', 'vibration', 'volt']})
aggregated_data = grouped_data.agg(agg_spec)

# Add per-group MAPE and RMSE (both measured against the group median) and signal energy
for sensor in sensors:
    readings = grouped_data[sensor]
    aggregated_data[sensor + '_mape'] = readings.apply(lambda x: mape(x, x.median()))
    aggregated_data[sensor + '_rmse'] = readings.apply(lambda x: rmse(x, x.median()))
    aggregated_data[sensor + '_energy'] = readings.apply(lambda x: energy(x))

# Step 3: flatten the two-level column index.
# The columns added in the loop have an empty second level; skipping empty parts avoids
# the trailing underscore ('rotate_mape_') that a plain '_'.join would leave behind.
aggregated_data.columns = ['_'.join(part for part in col if part)
                           for col in aggregated_data.columns.values]

# Turn the 'date' / 'machineID' group keys back into ordinary columns
aggregated_data = aggregated_data.reset_index()

aggregated_data # Runtime using T4 GPU 15 mins

Unnamed: 0,date,machineID,rotate_min,rotate_max,rotate_<lambda_0>,rotate_<lambda_1>,rotate_<lambda_2>,rotate_median,rotate_mean,rotate_count,...,rotate_energy_,volt_mape_,volt_rmse_,volt_energy_,pressure_mape_,pressure_rmse_,pressure_energy_,vibration_mape_,vibration_rmse_,vibration_energy_
0,2015-01-01,1,346.149335,527.349825,-0.790617,0.156417,78.750562,432.850118,440.515328,18,...,35347.738788,4.575802,9.302293,5069.445363,8.604186,10.548155,1766.257442,11.743593,5.704275,294.314920
1,2015-01-01,2,369.738792,543.802540,0.093926,0.364777,47.779860,444.667492,445.200094,18,...,35974.143458,5.363811,12.454714,5458.087050,8.239716,10.058523,1844.523251,8.772652,4.556427,312.779934
2,2015-01-01,3,382.648588,531.139800,-1.240765,0.072763,70.039863,461.496434,454.152365,18,...,37532.339188,4.774495,10.525317,5156.185616,8.394769,10.250305,1831.949912,11.791365,5.089333,241.610255
3,2015-01-01,4,327.243866,551.327283,0.721164,0.150727,43.058493,444.503545,447.758764,18,...,36543.303589,7.806184,16.013438,5150.537714,5.826959,7.953796,1744.591585,9.420820,4.935726,315.492786
4,2015-01-01,5,308.578855,539.732729,-0.368387,-0.589313,94.737470,461.502012,450.183235,18,...,37149.047008,6.147190,14.366183,5430.741624,10.299086,12.004830,1738.507780,7.941181,4.015031,280.726882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36595,2016-01-01,96,360.998546,520.313613,-1.145422,-0.703820,78.128461,487.628145,458.058551,7,...,14922.819557,10.335148,19.816359,1903.101990,6.149300,7.904610,687.349701,8.110237,3.908547,102.033080
36596,2016-01-01,97,392.702026,522.348411,-1.148606,0.216379,63.044880,458.143799,449.519362,7,...,14278.171812,7.620247,15.388704,2166.255084,7.379155,9.183053,708.633796,9.740416,5.066905,117.978487
36597,2016-01-01,98,389.828191,526.828641,-0.930038,-0.102140,53.929362,450.198921,461.853080,7,...,15059.535978,7.503993,18.609660,2141.609528,5.883867,8.264347,1021.755656,9.183485,5.863154,127.128267
36598,2016-01-01,99,416.284422,491.390537,-1.500171,0.009017,44.860950,462.373730,450.518923,7,...,14259.507392,6.280918,13.993774,2017.681858,3.266363,4.356264,695.480434,9.846259,5.106796,126.531449


In [None]:
def merger_with_duplicate_row_remover(df1 , df2 ):
    """Left-join ``df2`` onto ``df1`` and keep at most one row per machine and day.

    When ``df2`` has a 'date' column the join keys are ('date', 'machineID'); a helper
    column 'combo' ("<machineID>~<date>") is added and only the first row of each
    'combo' value is kept. Otherwise the join is on 'machineID' alone and no
    de-duplication is performed. Missing values in the merged frame are replaced by 0.
    Shapes and duplicate counts are printed along the way.

    Args:
        df1: left frame; must contain 'machineID' (and 'date' when ``df2`` has it).
        df2: right frame; must contain 'machineID'.

    Returns:
        The merged frame. It includes the 'combo' column when ``df2`` has 'date'.
    """
    print("*"*200)
    has_date = "date" in df2.columns
    join_keys = ['date', 'machineID'] if has_date else ['machineID']

    merged_df = pd.merge(df1, df2, on=join_keys, how='left')
    # np.nan rather than np.NaN: the capitalised alias no longer exists in NumPy 2.0
    merged_df = merged_df.replace(np.nan, 0)
    print("Shape of left dataset:                             ",df1.shape)
    print("Shape of the right dataset:                        ",df2.shape)
    print("Shape of merged dataset before checking duplicates:",merged_df.shape)

    if not has_date:
        # No date key (e.g. machine metadata): nothing to de-duplicate
        return merged_df

    # One key per machine and day; a key occurring more than once marks duplicated rows
    merged_df['combo'] = merged_df['machineID'].astype(str) +"~"+ merged_df['date'].astype(str)
    key_counts = merged_df['combo'].value_counts()
    duplicated_keys = key_counts[key_counts > 1].index
    print("Duplicate rows found:", len(duplicated_keys))

    # Keep the first row per key. Any further events for the same machine and day are
    # discarded, so report the real number of rows dropped rather than an estimate.
    rows_before = len(merged_df)
    merged_df = merged_df.drop_duplicates(subset=['combo'])
    print("Duplicates rows removed:", rows_before - len(merged_df))
    print("Shape of merged dataset after removing duplicate columns:", merged_df.shape)

    return merged_df

def _stamp_with_date(events):
    """Return a copy of ``events`` with 'datetime' parsed and a derived 'date' column appended."""
    parsed = pd.to_datetime(events['datetime'])
    return events.assign(datetime=parsed, date=parsed.dt.date)

# Day-stamped copies of the event tables (the raw frames stay unchanged)
err = _stamp_with_date(df_errors)
maint = _stamp_with_date(df_maint)
fail = _stamp_with_date(df_failure)

# Attach machine metadata first, then errors, maintenance and failures in that order
data = aggregated_data.merge(df_machines, how='left', on='machineID')
for events in (err, maint, fail):
    data = merger_with_duplicate_row_remover(data, events)
data.head()

********************************************************************************************************************************************************************************************************
Shape of left dataset:                              (36600, 76)
Shape of the right dataset:                         (3919, 4)
Shape of merged dataset before checking duplicates: (37078, 78)
Duplicate rows found: 422
Duplicates rows removed: 211.0
Shape of merged dataset after removing duplicate columns: (36600, 79)
********************************************************************************************************************************************************************************************************
Shape of left dataset:                              (36600, 79)
Shape of the right dataset:                         (3286, 4)
Shape of merged dataset before checking duplicates: (37323, 81)
Duplicate rows found: 723
Duplicates rows removed: 361.5
Shape of merged dataset after remo

Unnamed: 0,date,machineID,rotate_min,rotate_max,rotate_<lambda_0>,rotate_<lambda_1>,rotate_<lambda_2>,rotate_median,rotate_mean,rotate_count,...,vibration_energy_,model,age,datetime_x,errorID,combo,datetime_y,comp,datetime,failure
0,2015-01-01,1,346.149335,527.349825,-0.790617,0.156417,78.750562,432.850118,440.515328,18,...,294.31492,model3,18,0,0,1~2015-01-01,0,0,0,0
1,2015-01-01,2,369.738792,543.80254,0.093926,0.364777,47.77986,444.667492,445.200094,18,...,312.779934,model4,7,0,0,2~2015-01-01,0,0,0,0
2,2015-01-01,3,382.648588,531.1398,-1.240765,0.072763,70.039863,461.496434,454.152365,18,...,241.610255,model3,8,0,0,3~2015-01-01,0,0,0,0
3,2015-01-01,4,327.243866,551.327283,0.721164,0.150727,43.058493,444.503545,447.758764,18,...,315.492786,model3,7,0,0,4~2015-01-01,0,0,0,0
4,2015-01-01,5,308.578855,539.732729,-0.368387,-0.589313,94.73747,461.502012,450.183235,18,...,280.726882,model3,2,0,0,5~2015-01-01,0,0,0,0


In [None]:
# Dummy-encode the categorical feature columns. The target column 'failure' stays a
# single column (it is label-encoded further down); its numeric 0 placeholder becomes
# the string '0' so that every label is a string.
data_encoded = (
    pd.get_dummies(data, columns=['comp', 'errorID', 'model'], drop_first=True)
    .replace({'failure' : 0}, '0')
)
print(data_encoded.failure.unique()) #To verify the change

# Helper key and raw timestamp columns are not features
data_encoded = data_encoded.drop(columns=["combo","datetime_x", 'datetime_y','datetime'])

['0' 'comp1' 'comp4' 'comp3' 'comp2']


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(data_encoded.drop(['failure','date'], axis=1), data_encoded['failure'], test_size=0.2, random_state=42)

# Carve a validation set (18% of the remaining rows) out of the training portion
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.18, random_state=42)

# Now we have X_train and y_train as the new training set, and X_val and y_val as the validation set.

# Learn the label -> integer mapping once, from the complete label column.
# Calling fit() a second time discards the first fit, so fitting on y_train and then on
# y_val would leave an encoder that only knows the validation labels; transform() would
# then raise for any class that happens to be absent from the validation split.
enc = LabelEncoder()
enc.fit(data_encoded['failure'])

y_train = enc.transform(y_train)
y_test = enc.transform(y_test)
y_val_encoded = enc.transform(y_val)

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer
from sklearn.linear_model import LogisticRegression
# confusion_matrix and classification_report are called inside the loop below; without
# importing them here the cell stops with a NameError on a fresh kernel.
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report)

# Candidate preprocessors, compared with the same logistic-regression model
scalers = [StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler(), Normalizer()]
scaler_names = ['StandardScaler', 'RobustScaler', 'MinMaxScaler', 'MaxAbsScaler', 'Normalizer']

# Validation-set scores per scaler, collected for the summary table at the end
results = {'Scaler': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}

for scaler, name in zip(scalers, scaler_names):
    # Fit the scaler on the training split only, then apply it to validation and test
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    mdl = LogisticRegression(max_iter=8000)
    mdl.fit(X_train_scaled, y_train)

    # Make predictions
    y_val_pred = mdl.predict(X_val_scaled)
    y_test_pred = mdl.predict(X_test_scaled)

    # Validation metrics (macro average: every class counts equally)
    accuracy = accuracy_score(y_val_encoded, y_val_pred)
    precision = precision_score(y_val_encoded, y_val_pred, average='macro')
    recall = recall_score(y_val_encoded, y_val_pred, average='macro')
    f1 = f1_score(y_val_encoded, y_val_pred, average='macro')

    # Store results
    results['Scaler'].append(name)
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1'].append(f1)

    # Print the results
    print(f"Results for {name}:")
    print("Validation Set Evaluation:")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_val_encoded, y_val_pred))
    print("Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred))

    # Test metrics are printed only; they are not stored in `results`
    print("Test Set Evaluation:")
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='macro')
    test_recall = recall_score(y_test, y_test_pred, average='macro')
    test_f1 = f1_score(y_test, y_test_pred, average='macro')
    print("Accuracy: ", test_accuracy)
    print("Precision: ", test_precision)
    print("Recall: ", test_recall)
    print("F1: ", test_f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

# Convert results to DataFrame
results_df = pd.DataFrame(results)
display(results_df)


----------------
***Method 2 (Window and lag method)***
----------------

In [None]:
# Failures, errors and maintenance: parse the timestamps and store each table's
# label column as a categorical
for events, label_col in ((df_failure, 'failure'), (df_errors, 'errorID'), (df_maint, 'comp')):
    events['datetime'] = pd.to_datetime(events['datetime'], format="%Y-%m-%d %H:%M:%S")
    events[label_col] = events[label_col].astype('category')

# Machines Meta
df_machines['model'] = df_machines['model'].astype('category')

# Telemetry: parse the timestamps and make them the index (needed for resampling below)
df_telemetry['datetime'] = pd.to_datetime(df_telemetry['datetime'])
df_telemetry.set_index('datetime', inplace=True)

In [None]:
# Telemetry window features: 3-hour mean/std and 24-hour rolling mean/std per machine
# https://www.youtube.com/watch?v=l4dvMiSDBzs # how resample works
# https://www.youtube.com/watch?v=KuT2n1w0Ixc&list=LL&index=1 ; explains pivot shift is good for reliability
fields = ['volt', 'rotate', 'pressure', 'vibration']

# One wide table per sensor (rows: datetime, columns: machineID), pivoted once and
# reused by all four feature families instead of re-pivoting for each of them.
hourly_by_sensor = {
    col: pd.pivot_table(df_telemetry, index='datetime', columns='machineID', values=col)
    for col in fields
}


def _every_3h(table):
    """Group ``table`` into 3-hour bins (closed on the left, labelled by the right edge)."""
    return table.resample('3H', closed='left', label='right')


def _as_feature_table(per_sensor, suffix):
    """Put one unstacked Series per sensor side by side and name the columns '<sensor><suffix>'.

    The (machineID, datetime) index becomes the first two columns of the result.
    """
    table = pd.concat(per_sensor, axis=1)
    table.columns = [col + suffix for col in fields]
    return table.reset_index()


# Mean and standard deviation within each 3-hour bin
telemetry_mean_3h = _as_feature_table(
    [_every_3h(hourly_by_sensor[col]).mean().unstack() for col in fields], 'mean_3h')
telemetry_sd_3h = _as_feature_table(
    [_every_3h(hourly_by_sensor[col]).std().unstack() for col in fields], 'sd_3h')

# 24-row rolling mean / standard deviation, sampled at the first row of each 3-hour bin
telemetry_mean_24h = _as_feature_table(
    [_every_3h(hourly_by_sensor[col].rolling(window=24).mean()).first().unstack() for col in fields],
    'mean_24h')
telemetry_sd_24h = _as_feature_table(
    [_every_3h(hourly_by_sensor[col].rolling(window=24).std()).first().unstack() for col in fields],
    'sd_24h')

# Drop the rows where the rolling window has not produced a value
telemetry_mean_24h = telemetry_mean_24h.loc[telemetry_mean_24h['voltmean_24h'].notna()]
telemetry_sd_24h = telemetry_sd_24h.loc[telemetry_sd_24h['voltsd_24h'].notna()]

# Merge the feature sets column-wise. iloc[:, 2:6] takes the four feature columns and skips
# the leading machineID/datetime pair, which is already supplied by telemetry_mean_3h.
# Rows are aligned on the index; dropna() removes the rows filtered out just above.
telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis=1).dropna()
telemetry_feat.keys()

In [None]:

# Component-age features: days since each component was last replaced

comp_rep = pd.get_dummies(df_maint.set_index('datetime')).reset_index() # create a column for each maintenance type
comp_rep.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']

# combine repairs for a given machine in a given hour
comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

# add timepoints where no components were replaced
# The telemetry timestamps are needed as a column from here on. The guard lets this
# cell be re-run without resetting the index a second time.
if 'datetime' not in df_telemetry.columns:
    df_telemetry.reset_index(inplace=True)
comp_rep = (df_telemetry[['datetime', 'machineID']]
            .merge(comp_rep, on=['datetime', 'machineID'], how='outer')
            .fillna(0)
            .sort_values(by=['machineID', 'datetime']))

components = ['comp1', 'comp2', 'comp3', 'comp4']
for comp in components:
    # Timestamp of the replacement where one happened, NaT elsewhere. Building a fresh
    # datetime column avoids writing timestamps into the numeric indicator column.
    replaced_at = comp_rep['datetime'].where(comp_rep[comp] >= 1)

    # forward-fill the most-recent date of component change
    # NOTE(review): the fill runs down the whole machineID/datetime-sorted frame, so the
    # first rows of a machine can inherit the previous machine's last date — confirm the
    # 2014 cut-off below is enough to remove those rows.
    comp_rep[comp] = replaced_at.ffill()

# remove dates in 2014 (may have NaN or future component change dates)
comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]

comp_rep = comp_rep.sort_values(by=['machineID', 'datetime']).copy()

# replace dates of most recent component change with days since most recent component change
for comp in components:
    comp_rep[comp] = (comp_rep['datetime'] - comp_rep[comp]) / np.timedelta64(1, 'D')

In [None]:
# One indicator column per error type
hourly_errors = pd.get_dummies(df_errors.set_index('datetime')).reset_index()
hourly_errors.columns = ['datetime', 'machineID', 'error1', 'error2', 'error3', 'error4', 'error5']

# combine errors for a given machine in a given hour
hourly_errors = hourly_errors.groupby(['machineID','datetime']).sum().reset_index()

# Align with every telemetry timepoint; timepoints without an error count as 0
hourly_errors = (
    df_telemetry[['datetime', 'machineID']]
    .merge(hourly_errors, on=['machineID', 'datetime'], how='left')
    .fillna(0.0)
)
#display(error_count.describe())

# compute the total number of errors of each type over the last 24 hours, for timepoints taken every three hours
fields = ['error%d' % i for i in range(1, 6)] # creates strings like "error1"..."error5".

rolling_totals = [
    pd.pivot_table(hourly_errors, index='datetime', columns='machineID', values=col)
    .rolling(window=24)
    .sum()
    .resample('3H', closed='left', label='right')
    .first()
    .unstack()
    for col in fields
]

error_count = pd.concat(rolling_totals, axis=1)
error_count.columns = [col + 'count' for col in fields]
error_count = error_count.reset_index().dropna()

# Assemble the feature table: telemetry windows + error counts + component ages + machine metadata
final_feat = (
    telemetry_feat
    .merge(error_count, on=['datetime', 'machineID'], how='left')
    .merge(comp_rep, on=['datetime', 'machineID'], how='left')
    .merge(df_machines, on=['machineID'], how='left')
)

In [None]:
# Attach the failure label to each feature row; rows without a failure record get NaN
labeled_features = final_feat.merge(df_failure, on=['datetime', 'machineID'], how='left')

# Spread each value to the 7 rows before it and then to the 7 rows after it.
# DataFrame.bfill / DataFrame.ffill replace the deprecated fillna(method=...).
# NOTE(review): both fills act on every column and on the frame as a whole, not per
# machine — confirm that labels cannot leak across machine boundaries.
labeled_features = labeled_features.bfill(limit=7)
labeled_features = labeled_features.ffill(limit=7)

# 'none' is not one of the existing failure categories, so register it first and then use
# it for all rows that are still unlabeled. The result is assigned back explicitly: an
# in-place fillna on a selected column is not guaranteed to update the parent frame.
labeled_features['failure'] = (
    labeled_features['failure'].cat.add_categories(['none']).fillna('none')
)

# Dummy-encode the machine model
labeled_features = pd.get_dummies(data = labeled_features, columns=['model'])

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(labeled_features.drop(['failure','datetime'], axis=1), labeled_features['failure'], test_size=0.2, random_state=42)

# Carve a validation set (18% of the remaining rows) out of the training portion
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.18, random_state=42)

# Now we have X_train and y_train as the new training set, and X_val and y_val as the validation set.

# Learn the label -> integer mapping once, from the complete label column.
# Calling fit() a second time discards the first fit, so fitting on y_train and then on
# y_val would leave an encoder that only knows the validation labels; transform() would
# then raise for any class that happens to be absent from the validation split.
enc = LabelEncoder()
enc.fit(labeled_features['failure'])

y_train = enc.transform(y_train)
y_test = enc.transform(y_test)
y_val_encoded = enc.transform(y_val)

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer
from sklearn.linear_model import LogisticRegression
# confusion_matrix and classification_report are called inside the loop below; without
# importing them here the cell stops with a NameError on a fresh kernel.
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report)

# Candidate preprocessors, compared with the same logistic-regression model
scalers = [StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler(), Normalizer()]
scaler_names = ['StandardScaler', 'RobustScaler', 'MinMaxScaler', 'MaxAbsScaler', 'Normalizer']

# Validation-set scores per scaler, collected for the summary table at the end
results = {'Scaler': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}

for scaler, name in zip(scalers, scaler_names):
    # Fit the scaler on the training split only, then apply it to validation and test
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    mdl = LogisticRegression(max_iter=8000)
    mdl.fit(X_train_scaled, y_train)

    # Make predictions
    y_val_pred = mdl.predict(X_val_scaled)
    y_test_pred = mdl.predict(X_test_scaled)

    # Validation metrics (macro average: every class counts equally)
    accuracy = accuracy_score(y_val_encoded, y_val_pred)
    precision = precision_score(y_val_encoded, y_val_pred, average='macro')
    recall = recall_score(y_val_encoded, y_val_pred, average='macro')
    f1 = f1_score(y_val_encoded, y_val_pred, average='macro')

    # Store results
    results['Scaler'].append(name)
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1'].append(f1)

    # Print the results
    print(f"Results for {name}:")
    print("Validation Set Evaluation:")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_val_encoded, y_val_pred))
    print("Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred))

    # Test metrics are printed only; they are not stored in `results`
    print("Test Set Evaluation:")
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='macro')
    test_recall = recall_score(y_test, y_test_pred, average='macro')
    test_f1 = f1_score(y_test, y_test_pred, average='macro')
    print("Accuracy: ", test_accuracy)
    print("Precision: ", test_precision)
    print("Recall: ", test_recall)
    print("F1: ", test_f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

# Convert results to DataFrame
results_df = pd.DataFrame(results)
display(results_df)

--------------------------------------------------

***LightGBM on Method 1***

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Method 1 features and the raw (string) failure labels; the labels are passed to
# LightGBM without a separate encoding step
features = data_encoded.drop(['failure','date'], axis=1)
target = data_encoded['failure']

# 80/20 train/test split, then 18% of the training part becomes the validation set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.18, random_state=42)

# Gradient-boosted trees with the multiclass objective; verbose=2 prints debug-level logs
lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='multiclass', random_state=42, verbose=2)
lgb_model.fit(X_train, y_train)

# Predictions for the validation and test splits
y_val_pred = lgb_model.predict(X_val)
y_test_pred = lgb_model.predict(X_test)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.988294
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.202040
[LightGBM] [Debug] init for col-wise cost 0.002957 seconds, init for row-wise cost 0.013710 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16497
[LightGBM] [Info] Number of data points in the train set: 24009, number of used features: 86
[LightGBM] [Info] Start training from score -0.020790
[LightGBM] [Info] Start training from score -5.188344
[LightGBM] [Info] Start training from score -4.893227
[LightGBM] [Info] Start training from score -5.755451
[LightGBM] [Info] Start training from score -5.441793
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 24 and depth = 7
[LightGBM] [Debug] Trained 

In [None]:
def _macro_scores(y_true, y_pred):
    """Return [accuracy, macro precision, macro recall, macro F1] for one split."""
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='macro'),
        recall_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='macro'),
    ]

# Same four metrics for the validation and the test split
val_accuracy, val_precision, val_recall, val_f1 = _macro_scores(y_val, y_val_pred)
test_accuracy, test_precision, test_recall, test_f1 = _macro_scores(y_test, y_test_pred)

# Side-by-side summary table
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1'],
    'Validation': [val_accuracy, val_precision, val_recall, val_f1],
    'Test': [test_accuracy, test_precision, test_recall, test_f1]
})

display(results_df)

