# OpenIE-MachineAnalytics Data Preprocessing

### 0x00 Read Data with Pandas

In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
machineData = pd.read_csv('../machineData/1.csv', skiprows=0)

# Display the first few rows of the DataFrame to check if the data is loaded correctly
print(machineData.head())

   index     smcAC     smcDC  vib_table  vib_spindle  AE_table  AE_spindle  \
0      1 -0.017090  0.625000   0.078125     0.314941  0.087280    0.103760   
1      2  0.263672  0.810547   0.085449     0.301514  0.098267    0.123291   
2      3  0.207520  0.781250   0.078125     0.303955  0.092163    0.104980   
3      4  0.302734  0.849609   0.073242     0.300293  0.095215    0.111084   
4      5  0.239258  1.098633   0.083008     0.299072  0.083008    0.092163   

   Unnamed: 7  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  


### 0x01 Obtain the Data Feature Values

In [6]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.fftpack import fft
from scipy.signal import find_peaks
from scipy.stats import entropy


# Assuming 'smcAC' is the column you are interested in
selected_columns = ['smcAC', 'smcDC', 'vib_table',
                    'vib_spindle', 'AE_table', 'AE_spindle']


def extract_features(signal):
    features = {}

    # Convert the column to numeric, handling non-numeric values
    signal_numeric = pd.to_numeric(signal, errors='coerce')

    # Replace or remove specific non-numeric values
    signal_numeric.replace('ALIGNED', np.nan, inplace=True)

    # Remove NaN values or use interpolation as needed
    signal_numeric.dropna(inplace=True)

    # Convert to NumPy array
    signal_array = signal_numeric.to_numpy()

    # Check if the array is not empty
    if len(signal_array) > 0:
        # Time-domain features
        features['mean'] = np.mean(signal_array)
        features['std_dev'] = np.std(signal_array)
        features['rms'] = np.sqrt(np.mean(np.square(signal_array)))
        features['skewness'] = skew(signal_array)
        features['kurtosis'] = kurtosis(signal_array)
        features['max_value'] = np.max(signal_array)
        features['min_value'] = np.min(signal_array)
        features['median'] = np.median(signal_array)
        features['iqr'] = np.percentile(
            signal_array, 75) - np.percentile(signal_array, 25)
        features['zero_crossing_rate'] = np.sum(
            np.diff(np.sign(signal_array)) != 0) / len(signal_array)

        # Frequency-domain features using FFT
        try:
            fft_result = fft(signal_array)
            magnitude_spectrum = np.abs(fft_result)

            features['dominant_frequency'] = np.argmax(magnitude_spectrum)
            features['max_frequency_magnitude'] = np.max(magnitude_spectrum)

            # Statistical measures
            features['auto_correlation'] = np.correlate(
                signal_array, signal_array, mode='full')

            # Frequency-domain features
            features['power_spectral_density'] = np.mean(
                np.square(magnitude_spectrum))
            features['total_power'] = np.sum(np.square(signal_array))
            features['spectral_entropy'] = entropy(magnitude_spectrum)
            features['centroid_frequency'] = np.sum(np.arange(
                len(magnitude_spectrum)) * magnitude_spectrum) / np.sum(magnitude_spectrum)

            # Time-frequency features
            # Add your wavelet transform code here
            features['wavelet_coefficients'] = []
            features['stft'] = np.abs(np.fft.fftshift(
                np.fft.fft(signal_array)))  # Example using FFT for STFT

            # Other features
            features['peaks_count'], _ = find_peaks(signal_array)

        except ValueError as e:
            print(f"Error in FFT calculation: {e}")

    return features


# Create a dictionary to store features for each column
features_dict = {}

# Iterate through specific signal columns and extract features
for column in selected_columns:
    if column in machineData.columns:
        selected_data = machineData[column]
        features_column = extract_features(selected_data)
        features_dict[column] = features_column
    else:
        print(f"Column '{column}' not found in the DataFrame.")

# Display the extracted features
for column, features_column in features_dict.items():
    print(f"\nFeatures of '{column}':")
    for key, value in features_column.items():
        print(f"{key}: {value}")


Features of 'smcAC':
mean: -0.16425537222222222
std_dev: 1.5233353020574607
rms: 1.5321652227479763
skewness: 8.654939892560849e-05
kurtosis: -0.7222014745508978
max_value: 3.823242
min_value: -4.20166
median: -0.15625
iqr: 2.29553225
zero_crossing_rate: 0.09888888888888889
dominant_frequency: 452
max_frequency_magnitude: 6029.391694199132
auto_correlation: [ 0.00984678 -0.13940333 -0.29887587 ... -0.29887587 -0.13940333
  0.00984678]
power_spectral_density: 21127.7724281852
total_power: 21127.772428185202
spectral_entropy: 7.324490121628972
centroid_frequency: 4465.431450506171
wavelet_coefficients: []
stft: [364.885244   212.31427726  46.775917   ...  80.96999416  46.775917
 212.31427726]
peaks_count: [   1    3    5 ... 8985 8988 8995]

Features of 'smcDC':
mean: 5.438449434333333
std_dev: 2.243695728324307
rms: 5.883103158291658
skewness: -0.9346041934008765
kurtosis: -0.8547176365840343
max_value: 7.817383
min_value: 0.625
median: 6.801758
iqr: 3.4448244999999993
zero_crossing_ra