In [1]:
pip install tensorflow pandas numpy matplotlib scikit-learn

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.2.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-n

In [None]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import skew, kurtosis, mode

print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
print("scipy version:", scipy.__version__)
print("scikit-learn version:", sklearn.__version__)

numpy version: 2.0.2
pandas version: 2.2.3
scipy version: 1.15.2
scikit-learn version: 1.6.1


In [2]:
# Load the datasets
data_18_noleak = pd.read_csv('Accelerometer_readings/LO_NL_0.18 LPS_A2.csv')
data_noFlow_noleak = pd.read_csv('Accelerometer_readings/LO_NL_ND_A2.csv')
data_47_noleak = pd.read_csv('Accelerometer_readings/LO_NL_0.47 LPS_A2.csv')


data_18_gasket = pd.read_csv('Accelerometer_readings/LO_GL_0.18 LPS_A2.csv')
data_gasket_noFlow = pd.read_csv('Accelerometer_readings/LO_GL_ND_A2.csv')
data_47_nd = pd.read_csv('Accelerometer_readings/LO_GL_0.47 LPS_A2.csv')

data_18_CC = pd.read_csv('Accelerometer_readings/LO_CC_0.18 LPS_A2.csv')
data_CC_noFlow = pd.read_csv('Accelerometer_readings/LO_CC_ND_A2.csv')
data_47_nd_CC = pd.read_csv('Accelerometer_readings/LO_CC_0.47 LPS_A2.csv')

data_18_OL = pd.read_csv('Accelerometer_readings/LO_OL_0.18 LPS_A2.csv')
data_OL_noFlow = pd.read_csv('Accelerometer_readings/LO_OL_ND_A2.csv')
data_47_nd_OL = pd.read_csv('Accelerometer_readings/LO_OL_0.47 LPS_A2.csv')
# Combine No Leak and Leak data
no_leak_data = pd.concat([data_18_noleak , data_noFlow_noleak , data_47_noleak])
no_leak_data['Category'] = 'No Leak'

gasket_leak_data = pd.concat([data_18_gasket , data_gasket_noFlow , data_47_nd])
gasket_leak_data['Category'] = 'Gasket Leak'

CC_leak_data = pd.concat([data_18_CC , data_CC_noFlow , data_47_nd_CC])
CC_leak_data['Category'] = 'circumferential leak'

OL_leak_data = pd.concat([data_18_OL , data_OL_noFlow , data_47_nd_OL])
OL_leak_data['Category'] = 'orifice leak'

# Combine all data
combined_data = pd.concat([no_leak_data ,gasket_leak_data , CC_leak_data , OL_leak_data]).reset_index(drop=True)

In [3]:
num_rows = combined_data.shape[0]
print(f"Number of rows: {num_rows}")

Number of rows: 11100621


In [10]:
# Shell 3: Windowing and Feature Extraction (Memory-Efficient with 0.5-Second Windows)
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

# Load data with system features from Shell 2


# Debug: Check the sampling rate
sample_diff = combined_data['Sample'].diff().dropna().mean()
sampling_rate = 1 / sample_diff  # Samples per second
window_duration_seconds = 0.004  # Desired window duration in seconds
window_size = int(window_duration_seconds * sampling_rate)  # Number of rows per 0.5-second window
print(f"Sampling rate: {sampling_rate:.2f} Hz")
print(f"Time difference between samples: {sample_diff:.6f} seconds")
print(f"Window size for 0.004 seconds: {window_size} rows")

# Windowing function
def extract_windows(df, window_size):
    windows = [df.iloc[i:i + window_size] for i in range(0, len(df) - window_size + 1, window_size)]
    return windows

# Feature extraction function
def extract_features_per_window(window):
    features = {}
    values = window['Value'].values

    # Time-Domain Features
    features['Mean'] = np.mean(values)
    features['Std'] = np.std(values)
    features['Min'] = np.min(values)
    features['Max'] = np.max(values)
    features['Skewness'] = skew(values)
    features['Kurtosis'] = kurtosis(values)
    features['PeakToPeak'] = np.ptp(values)
    features['Median'] = np.median(values)
    features['Mode'] = pd.Series(values).mode()[0] if not pd.Series(values).mode().empty else np.nan
    features['FirstQuartile'] = np.percentile(values, 25)
    features['ThirdQuartile'] = np.percentile(values, 75)
    features['RootMeanSquare'] = np.sqrt(np.mean(values ** 2))
    features['RootSumSquares'] = np.sqrt(np.sum(values ** 2))
    features['PeakToRMS'] = features['PeakToPeak'] / features['RootMeanSquare'] if features['RootMeanSquare'] != 0 else np.nan
    features['Variance'] = np.var(values)
    features['PeakPosition'] = np.argmax(values)

    # Frequency-Domain Features
    fft_values = np.abs(np.fft.fft(values))
    freqs = np.fft.fftfreq(len(values))
    features['FFT_Mean'] = np.mean(fft_values)
    features['FFT_Std'] = np.std(fft_values)
    features['FFT_Min'] = np.min(fft_values)
    features['FFT_Max'] = np.max(fft_values)
    features['FFT_Skewness'] = skew(fft_values)
    features['FFT_Kurtosis'] = kurtosis(fft_values)
    features['FFT_PeakToPeak'] = np.ptp(fft_values)
    features['FFT_Median'] = np.median(fft_values)
    features['FFT_Mode'] = pd.Series(fft_values).mode()[0] if not pd.Series(fft_values).mode().empty else np.nan
    features['FFT_PeakPosition'] = np.argmax(fft_values)
    features['MeanFrequency'] = np.mean(freqs)

  

   
    features['Category'] = window['Category'].iloc[0]

    return pd.Series(features)

# Process data in chunks to avoid MemoryError
chunk_size = 10000  # Number of windows to process at a time (adjust based on your RAM)
windows = extract_windows(combined_data, window_size)
total_windows = len(windows)

# Write features incrementally to CSV
first_chunk = True
for i in range(0, total_windows, chunk_size):
    chunk_windows = windows[i:i + chunk_size]
    chunk_features = pd.DataFrame([extract_features_per_window(window) for window in chunk_windows])
    chunk_features = chunk_features.dropna()  # Drop NaN rows within the chunk
    
    # Write to CSV: append mode for subsequent chunks, header only for first chunk
    if first_chunk:
        chunk_features.to_csv('features_extracted.csv', mode='w', index=False)
        first_chunk = False
    else:
        chunk_features.to_csv('features_extracted.csv', mode='a', header=False, index=False)
    
    print(f"Processed windows {i} to {min(i + chunk_size, total_windows)} of {total_windows}")

print("Features extracted and saved to 'features_extracted.csv'")

Sampling rate: 305899.97 Hz
Time difference between samples: 0.000003 seconds
Window size for 0.004 seconds: 1223 rows
Processed windows 0 to 9076 of 9076
Features extracted and saved to 'features_extracted.csv'


In [14]:
# Shell 4: Prepare Data for Modeling (with Categorical Encoding)
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib

# Load features from Shell 3
features = pd.read_csv('features_extracted.csv')

# Separate features and labels
X = features.drop(columns=['Category'])
y = features['Category']

# Feature scaling (only numeric columns)
scaler = MinMaxScaler()
numeric_columns = X.select_dtypes(include=[np.number]).columns
X_scaled_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), 
                                columns=numeric_columns)

# Save the scaler
joblib.dump(scaler, 'Accelerometer_classifiedscaler.pkl')
print("Scaler saved successfully!")



# Combine scaled numeric and encoded categorical features
X_scaled = pd.concat([X_scaled_numeric], axis=1)

# Save scaled features and labels
X_scaled.to_csv('X_scaled.csv', index=False)
y.to_csv('y_multiclass.csv', index=False)
print("Scaled features and labels saved to 'X_scaled.csv' and 'y_multiclass.csv'")

Scaler saved successfully!
Scaled features and labels saved to 'X_scaled.csv' and 'y_multiclass.csv'


In [12]:
# Shell 5: Model Training and Evaluation
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load scaled features and labels from Shell 4
X_scaled = pd.read_csv('X_scaled.csv')
y = pd.read_csv('y_multiclass.csv')['Category']  # Assuming 'Category' is the column name

# Base estimator
base_tree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced'
)

# Bagging ensemble
model = BaggingClassifier(
    estimator=base_tree,
    n_estimators=600,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1,
    random_state=42
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy:", accuracy_score(y_test, y_pred))

Classification Report:
                      precision    recall  f1-score   support

         Gasket Leak       0.76      0.73      0.75       456
             No Leak       0.86      0.74      0.79       454
circumferential leak       0.74      0.77      0.75       453
        orifice leak       0.77      0.88      0.82       453

            accuracy                           0.78      1816
           macro avg       0.78      0.78      0.78      1816
        weighted avg       0.78      0.78      0.78      1816


Confusion Matrix:
[[334   0 122   0]
 [  0 335   0 119]
 [106   0 347   0]
 [  0  54   0 399]]

Accuracy: 0.7791850220264317


In [13]:
import joblib

# Save the model
joblib.dump(model, 'Accerometer_classified_model.pkl')

print("Model saved successfully!")


Model saved successfully!
