#     Code to segment vibration signals and create a labeled dataset

In [2]:
import glob
from scipy.io import loadmat
from numpy import asarray
import numpy as np
import re
import os
import pandas as pd

In [2]:
# @@@ Function to create dataset from raw vibration signals @@@
# The function processes .mat vibration data files and segments the vibration signals according to the
# desired length of the input segment "num_samples".

# @@@@ Inputs: 
#data_src: path to dataset folders
#req_Key: key to extract vibration data from .mat files
#num_samples:  required length of input vibration samples "number of data points per sample".
#class_: label/class of vibration data

# @@@@ The function returns:
# reference vibration samples "baselines" and their labels "baselines_labels"
# test vibration samples "segmented_data" and their labels "labels"

def make_dataset(data_src, req_Key, num_samples, class_):
    """Segment the vibration signals of one class into fixed-length samples.

    Every file matched by ``data_src`` is loaded with ``scipy.io.loadmat``.
    For each variable whose name matches ``req_Key``, the first column of the
    signal is cut into consecutive, non-overlapping windows of ``num_samples``
    points; a trailing remainder shorter than ``num_samples`` is discarded.
    The first window of each signal becomes a reference sample ("baseline"),
    every following window becomes a test sample.

    Parameters
    ----------
    data_src : str
        Glob pattern selecting the .mat files to read.
    req_Key : str
        Regular expression searched (``re.search``) in the variable names of
        each .mat file to pick the signals to segment.
    num_samples : int
        Number of data points per window; must be positive.
    class_ : int or float
        Numeric label assigned to every returned sample.

    Returns
    -------
    segmented_data : numpy.ndarray, shape (n_test, num_samples)
        Test samples with duplicated rows removed. ``np.unique`` also sorts
        the rows, so they are not in acquisition order.
    labels : numpy.ndarray, shape (n_test,)
        ``class_`` repeated once per test sample.
    baselines : numpy.ndarray, shape (n_ref, num_samples)
        Reference samples with duplicated rows removed (sorted likewise).
    baselines_labels : numpy.ndarray, shape (n_ref,)
        ``class_`` repeated once per reference sample.

    Raises
    ------
    ValueError
        If ``num_samples`` is not positive.

    Notes
    -----
    When nothing matches (no file, no key, or only signals shorter than
    ``num_samples``) the arrays are returned empty, with shapes
    ``(0, num_samples)`` and ``(0,)``.
    """
    if num_samples <= 0:
        raise ValueError("num_samples must be a positive integer")

    pattern = re.compile(req_Key)
    baseline_rows = []
    test_rows = []

    # Single pass over the files: windows are collected in lists, so no
    # pre-computed segment count is needed and each file is read only once.
    for file in sorted(glob.glob(data_src)):
        data = loadmat(file)
        for key in list(data):
            if not pattern.search(key):
                continue
            signal = data[key]
            num_segments = len(signal) // num_samples
            for i in range(num_segments):
                # Column 0 holds the samples of the matched signal.
                window = signal[i * num_samples:(i + 1) * num_samples, 0]
                if i == 0:
                    baseline_rows.append(window)
                else:
                    test_rows.append(window)

    def _unique_rows(rows):
        # Stack the windows into a 2-D float array and drop duplicated rows
        # (the same signal may be stored under several keys or files).
        if not rows:
            return np.empty((0, num_samples))
        return np.unique(np.asarray(rows, dtype=float), axis=0)

    segmented_data = _unique_rows(test_rows)
    baselines = _unique_rows(baseline_rows)
    labels = np.ones(len(segmented_data)) * class_
    baselines_labels = np.ones(len(baselines)) * class_
    return segmented_data, labels, baselines, baselines_labels

==========================================================================================================================
# Dataset Creation:
=========================================================================================================================== 

## Grouping vibration signals in required folders to generate datasets
Link to download CWRU dataset: https://engineering.case.edu/bearingdatacenter <br>
After downloading the dataset, the vibration signals must be grouped according to their operational conditions/properties* into 10 folders (one per operational class in the dataset: 1 normal class and 9 faulty classes) with the folder names below:

1.	12K_DE_Normal 
2.	12k_DE_IRFault_0.007
3.	12k_DE_IRFault_0.014
4.	12k_DE_IRFault_0.021
5.	12k_DE_BallFault_0.007
6.	12k_DE_BallFault_0.014
7.	12k_DE_BallFault_0.021
8.	12k_DE_ORFault_0.007
9.	12k_DE_ORFault_0.014
10.	12k_DE_ORFault_0.021

*12k = sampling rate of vibration signals <br>
*DE= Drive End vibration data <br>
*IRFault = Inner Race faults <br>
*BallFault = Ball faults <br>
*ORFault = Outer Race faults <br>
*0.014/0.021/0.007= fault diameter <br>


In [3]:
# Settings shared by the dataset-creation cells.
fs = 12000  # sampling rate of the recordings, in samples per second
num_samples = 2000  # number of data points in each segmented vibration sample
Key = "_DE_time"  # pattern matching the Drive End (DE) variable names inside the .mat files

In [4]:
# Folder that contains the ten class sub-folders. Built with os.path.join so
# the separator matches the operating system: a hard-coded backslash is only
# a path separator on Windows.
data_path = os.path.join(".", "dataset")

# Glob pattern (relative to data_path) matching every file of each class.
cls_1 = '12K_DE_Normal/*'
cls_2 = '12k_DE_IRFault_0.007/*'
cls_3 = '12k_DE_IRFault_0.014/*'
cls_4 = '12k_DE_IRFault_0.021/*'
cls_5 = '12k_DE_BallFault_0.007/*'
cls_6 = '12k_DE_BallFault_0.014/*'
cls_7 = '12k_DE_BallFault_0.021/*'
cls_8 = '12k_DE_ORFault_0.007/*'
cls_9 = '12k_DE_ORFault_0.014/*'
cls_10 = '12k_DE_ORFault_0.021/*'

# Segment each class. The last argument is the numeric class label
# (1 = normal, 2-10 = faulty classes). Each call returns, in order:
# test samples, test labels, reference samples, reference labels.
norm, y_norm, norm_baselines, y_norm_baseline = make_dataset(os.path.join(data_path, cls_1), Key, num_samples, 1)
defc1, y_defc1, defc1_baslines, y_defc1_baseline = make_dataset(os.path.join(data_path, cls_2), Key, num_samples, 2)
defc2, y_defc2, defc2_baslines, y_defc2_baseline = make_dataset(os.path.join(data_path, cls_3), Key, num_samples, 3)
defc3, y_defc3, defc3_baslines, y_defc3_baseline = make_dataset(os.path.join(data_path, cls_4), Key, num_samples, 4)
defc4, y_defc4, defc4_baslines, y_defc4_baseline = make_dataset(os.path.join(data_path, cls_5), Key, num_samples, 5)
defc5, y_defc5, defc5_baslines, y_defc5_baseline = make_dataset(os.path.join(data_path, cls_6), Key, num_samples, 6)
defc6, y_defc6, defc6_baslines, y_defc6_baseline = make_dataset(os.path.join(data_path, cls_7), Key, num_samples, 7)
defc7, y_defc7, defc7_baslines, y_defc7_baseline = make_dataset(os.path.join(data_path, cls_8), Key, num_samples, 8)
defc8, y_defc8, defc8_baslines, y_defc8_baseline = make_dataset(os.path.join(data_path, cls_9), Key, num_samples, 9)
defc9, y_defc9, defc9_baslines, y_defc9_baseline = make_dataset(os.path.join(data_path, cls_10), Key, num_samples, 10)
# Report, for every class, the shape of its test-sample matrix and of its label vector.
_class_arrays = (
    (norm, y_norm),
    (defc1, y_defc1),
    (defc2, y_defc2),
    (defc3, y_defc3),
    (defc4, y_defc4),
    (defc5, y_defc5),
    (defc6, y_defc6),
    (defc7, y_defc7),
    (defc8, y_defc8),
    (defc9, y_defc9),
)
for _class_id, (_features, _targets) in enumerate(_class_arrays, start=1):
    print(f'Class {_class_id} features shape:', _features.shape)
    print(f'Class {_class_id} labels shape:', _targets.shape)

Class 1 features shape: (842, 2000)
Class 1 labels shape: (842,)
Class 2 features shape: (238, 2000)
Class 2 labels shape: (238,)
Class 3 features shape: (236, 2000)
Class 3 labels shape: (236,)
Class 4 features shape: (237, 2000)
Class 4 labels shape: (237,)
Class 5 features shape: (237, 2000)
Class 5 labels shape: (237,)
Class 6 features shape: (238, 2000)
Class 6 labels shape: (238,)
Class 7 features shape: (238, 2000)
Class 7 labels shape: (238,)
Class 8 features shape: (238, 2000)
Class 8 labels shape: (238,)
Class 9 features shape: (237, 2000)
Class 9 labels shape: (237,)
Class 10 features shape: (238, 2000)
Class 10 labels shape: (238,)


In [6]:
## Test samples

# Stack the test samples of the ten classes, and their labels, in class order.
test_data = np.concatenate(
    (norm, defc1, defc2, defc3, defc4, defc5, defc6, defc7, defc8, defc9), axis=0)
y_test = np.concatenate(
    (y_norm, y_defc1, y_defc2, y_defc3, y_defc4, y_defc5,
     y_defc6, y_defc7, y_defc8, y_defc9), axis=0)

# Shuffle samples and labels together by shuffling the rows of one DataFrame.
# A fixed seed makes the shuffle, and therefore the saved CSV, reproducible.
SHUFFLE_SEED = 42
df = pd.DataFrame(data=test_data)
df['labels'] = y_test
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
y_test = df['labels'].to_numpy()
test_data = df.drop(['labels'], axis=1).to_numpy()

# Save the shuffled test samples (feature columns followed by 'labels') to CSV.
# os.path.join keeps the path valid on every operating system.
df.to_csv(os.path.join('.', 'Test_Samples.csv'), index=False)
print("Test samples shape:", test_data.shape)


## Reference samples

# Stack the reference samples ("baselines") of the ten classes, and their labels, in class order.
baselines = np.concatenate(
    (norm_baselines, defc1_baslines, defc2_baslines, defc3_baslines, defc4_baslines,
     defc5_baslines, defc6_baslines, defc7_baslines, defc8_baslines, defc9_baslines), axis=0)
baselines_labels = np.concatenate(
    (y_norm_baseline, y_defc1_baseline, y_defc2_baseline, y_defc3_baseline,
     y_defc4_baseline, y_defc5_baseline, y_defc6_baseline, y_defc7_baseline,
     y_defc8_baseline, y_defc9_baseline), axis=0)

# Save the reference samples (feature columns followed by 'labels') to CSV.
# os.path.join keeps the path valid on every operating system.
df = pd.DataFrame(data=baselines)
df['labels'] = baselines_labels
df.to_csv(os.path.join('.', 'Reference_Samples.csv'), index=False)
print("Baselines samples shape:", baselines.shape)

Test samples shape: (2979, 2000)
Baselines samples shape: (40, 2000)
