In [2]:
import os
import configparser

import numpy as np
import pandas as pd

from source.preprocessing import splitter, converter
from source.datamodels import iterators
from source.utils import get_project_root

# 1 Preprocessing
Load datasets, convert third-party data files to our format, etc.

## 1.1 Data loading

In [3]:
# Locate the project root and load the user-specific settings from userconfig.ini.
root = get_project_root()
config_file = os.path.join(root, "userconfig.ini")
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means the settings were not loaded.
# Failing here gives a clearer message than the KeyError raised by config['Path'] below.
if not config.read(config_file):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {config_file}")

# Every location used by this notebook is read from the [Path] section.
# The values are joined with `root` wherever they are used.
own_data_path = config['Path']['own_data_path']
third_party_data_path = config['Path']['third_party_data_path']
Cesar1_path = config['Path']['Cesar1_path']

images_path = config['Path']['images_path']
public_images_path = config['Path']['public_images_path']

tables_path = config['Path']['tables_path']
public_tables_path = config['Path']['public_tables_path']

bootstrap_jsons_path = config['Path']['bootstrap_jsons_path']
single_jsons_path = config['Path']['single_jsons_path']

### 1.1.1 Load our initial datasets
Datasets obtained from our experiments

In [None]:
# Raw signal recordings (comma-separated, pandas default).
signals_dataset = pd.read_csv(
    os.path.join(root, own_data_path, 'bearing_signals.csv'),
)
# Bearing classes table: semicolon-separated; file line 1 (0-indexed, i.e. the line
# directly after the header) is skipped.
classes_dataset = pd.read_csv(
    os.path.join(root, own_data_path, 'bearing_classes.csv'),
    sep=';',
    skiprows=[1],
)

### 1.1.2 Load our joined datasets
Datasets obtained from our experiments with joined target column

In [11]:
# Signals already joined with the target column (written by section 1.2).
full_dataset = pd.read_csv(
    os.path.join(root, own_data_path, 'bearings.csv'),
    sep=',',
)
full_dataset.head()

Unnamed: 0.1,Unnamed: 0,target,experiment_id,bearing_1_id,bearing_2_id,timestamp,a1_x,a1_y,a1_z,a2_x,a2_y,a2_z,rpm,hz,w
0,0,0,1,0,1,0.0,0.113269,0.149706,-0.110275,-0.18603,0.19445,0.454299,0.0,0.0,6e-06
1,1,0,1,0,1,0.000333,-0.367713,-0.228832,0.177821,0.285992,0.002226,-0.04393,0.0,0.0,0.000243
2,2,0,1,0,1,0.000667,0.113269,0.149706,-0.398371,-0.091625,0.002226,0.454299,0.0,0.0,0.000369
3,3,0,1,0,1,0.001,-0.17532,-0.228832,-0.110275,0.285992,0.002226,0.255007,0.0,0.0,0.00052
4,4,0,1,0,1,0.001333,-0.079124,0.055072,-0.110275,0.191588,0.002226,0.255007,0.0,0.0,0.000175


### 1.1.3 Load third-party datasets
Load third-party datasets, converted to our standard view

In [9]:
# Third-party recordings already converted to the standard layout.
# NOTE: this rebinds `full_dataset`, replacing the frame loaded in section 1.1.2.
full_dataset = pd.read_csv(
    os.path.join(root, third_party_data_path, 'N1 Cesar Ricardo', 'csv', 'bearings.csv'),
    sep=',',
)
full_dataset.head()

Unnamed: 0,target,a1_y,a2_y,rpm,experiment_id,timestamp
0,1,1.937934,1.954861,200,1,0.0
1,1,1.937547,1.954629,200,1,2.5e-05
2,1,1.937166,1.954989,200,1,5e-05
3,1,1.937594,1.95554,200,1,7.5e-05
4,1,1.938502,1.955792,200,1,0.0001


### 1.1.4 Load dataset with statistics
Dataset is ready for experiments

In [3]:
# Pre-computed feature table for our own experiment.
prepared_data = pd.read_csv(
    os.path.join(root, own_data_path, 'processed_full_signal_specter1000_noscale.csv'),
    sep=',',
)
# To work with the third-party dataset instead, load this file:
# prepared_data = pd.read_csv(os.path.join(root, third_party_data_path, 'N1 Cesar Ricardo', 'csv',
#                                          'processed_full_signal_specter1000_noscale.csv'), delimiter=',')  # third-party dataset
prepared_data.head()

Unnamed: 0,target,group,a1_x_signal_complexity,a1_x_signal_shannon_entropy,a1_x_signal_kurtosis,a1_x_signal_variation,a1_x_signal_hurst,a1_x_signal_skew,a1_x_signal_activity,a1_x_signal_iqr,...,a2_z_specter_iqr,a2_z_specter_zero_crossing,a2_z_specter_range,a2_z_specter_mean,a2_z_specter_petrosian_fd,a2_z_specter_higuchi_fd,a2_z_specter_crest_factor,a2_z_specter_energy,a2_z_specter_std,a2_z_specter_sample_entropy
0,0.0,1.0,1.865568,6.38802,-0.337526,-9.989303,0.62617,-0.231906,0.51953,3.006134,...,130.212247,0.0,430.516204,121.609951,1.024136,1.709899,2.865578,22610590.0,88.439848,2.754299
1,0.0,1.0,1.812863,6.507361,-0.329388,-8.479932,0.633813,0.04648,0.51992,3.270674,...,79.932721,0.0,339.669032,116.951076,1.024806,1.75462,2.630547,16832350.0,56.167537,2.881832
2,0.0,1.0,1.775775,6.489806,-0.439591,-9.585973,0.625745,0.014204,0.531458,3.174478,...,87.487539,0.0,359.727237,126.969372,1.022686,1.718792,2.593399,19717600.0,59.969811,2.874073
3,0.0,1.0,1.912905,6.422261,-0.364524,-9.033494,0.606942,-0.14576,0.51209,2.982085,...,78.688634,0.0,353.579137,108.447712,1.025371,1.746161,2.880197,15475130.0,60.944452,2.780298
4,0.0,1.0,1.868826,6.449559,0.017384,-8.48262,0.631058,-0.149621,0.526677,2.982085,...,83.595954,0.0,331.560162,112.269206,1.02424,1.762398,2.66519,15966520.0,57.984006,2.852191


---
## 1.2 Signals and classes datasets join
Use this step to combine our signals and classes datasets into one

In [None]:
# Look-up table: bearing id -> status label, taken from the classes dataset.
targets_map = dict(zip(classes_dataset['bearing_id'], classes_dataset['status']))
# Label every signal row by the status of the bearing referenced in `bearing_2_id`;
# ids missing from the look-up table become NaN (pandas Series.map behaviour).
targets_vector = signals_dataset['bearing_2_id'].map(targets_map)
# Work on a copy so the raw signals frame stays untouched, and put `target` first.
joined_dataset = signals_dataset.copy()
joined_dataset.insert(loc=0, column='target', value=targets_vector)
# Write next to the other own-data files. The path is anchored at `root`, like every
# read in section 1.1; without it the file would land relative to the current
# working directory and section 1.1.2 would not find it.
joined_dataset.to_csv(os.path.join(root, own_data_path, 'bearings.csv'))

---
## 1.3 Convert third-party data files to our standard dataframe view

In [None]:
%%time

cesar_1_path = os.path.join(root, third_party_data_path, 'N1 Cesar Ricardo')
cesar_1 = converter.Converter.cesar_convert(cesar_1_path)
cesar_1.head()

# cesar_2_path = os.path.join(third_party_data_path, 'Bearings_cesar_1')
# cesar_2 = converter.Converter.cesar_convert(cesar_2_path)

# luigi_path = os.path.join(third_party_data_path, 'Bearings_luigi')
# luigi = converter.Converter.luigi_convert(luigi_path)

---
## 1.4 Split datasets
Split datasets into chunks and compute a set of statistical features for each chunk

### 1.4.1 Split our dataset

In [15]:
%%time

stats = ['mean', 'std', 'skew']  # You can directly input statistics names
# stats = iterators.Stats.get_keys()  # Use Stats.get_keys() if you need to calculate all supported statistics
splitter_processor = splitter.Splitter(use_signal=True, use_specter=False, specter_threshold=1000, stats=stats)
prepared_data = splitter_processor.split_dataset(full_dataset, stable_area=[(0, 3)], splits_number=10,
                                                 signal_data_columns=['a1_x', 'a1_y', 'a1_z', 'a2_x', 'a2_y', 'a2_z'])
print(f"features number: {prepared_data.shape[1]-2}")
print(f"examples number: {prepared_data.shape[0]}")
prepared_data.head()

features number: 18
examples number: 1120
Wall time: 11.6 s


Unnamed: 0,target,group,a1_x_signal_mean,a1_x_signal_std,a1_x_signal_skew,a1_y_signal_mean,a1_y_signal_std,a1_y_signal_skew,a1_z_signal_mean,a1_z_signal_std,a1_z_signal_skew,a2_x_signal_mean,a2_x_signal_std,a2_x_signal_skew,a2_y_signal_mean,a2_y_signal_std,a2_y_signal_skew,a2_z_signal_mean,a2_z_signal_std,a2_z_signal_skew
0,0.0,1.0,-0.091416,0.168345,-0.094233,0.014905,0.164634,-0.139448,-0.105153,0.177636,-0.188695,0.14984,0.160024,-0.353922,0.039176,0.165201,-0.329476,0.19522,0.17924,-0.21764
1,0.0,1.0,-0.085323,0.168466,-0.046775,0.019847,0.166212,-0.140761,-0.120945,0.1771,-0.154561,0.146274,0.159796,-0.322403,0.032982,0.164606,-0.325526,0.191012,0.18104,-0.241943
2,0.0,1.0,-0.093981,0.165922,-0.016693,0.020267,0.16395,-0.211511,-0.113903,0.17865,-0.166532,0.151308,0.161691,-0.317274,0.044943,0.163114,-0.338985,0.190459,0.181786,-0.204486
3,0.0,1.0,-0.08853,0.169049,-0.038873,0.017533,0.167378,-0.247919,-0.104727,0.177623,-0.18154,0.150784,0.160421,-0.356063,0.038535,0.16441,-0.36742,0.186584,0.184207,-0.199939
4,0.0,1.0,-0.090133,0.166901,-0.093073,0.019952,0.169524,-0.15785,-0.11465,0.180942,-0.198373,0.152462,0.162736,-0.415544,0.037467,0.161812,-0.335397,0.190902,0.180665,-0.228084


### 1.4.2 Split third-party dataset
The datasets created by César Ricardo Soto-Ocampo et al. contain only the a1_y and a2_y signal columns. In this example we use the stable time window from 0 to 3 seconds

In [None]:
%%time
from sklearn.preprocessing import StandardScaler
joined_dataset = cesar_1
stats = iterators.Stats.get_keys()
stats = [stat for stat in stats if stat not in ['mean', 'std', 'variation']]
splitter_processor = splitter.Splitter(use_signal=True, use_specter=True, specter_threshold=1000, stats=stats, scaler=StandardScaler)
prepared_data = splitter_processor.split_dataset(joined_dataset, stable_area=[(0, 3)], splits_number=10, signal_data_columns=['a1_y', 'a2_y'])
print(f"features number: {prepared_data.shape[1]-2}")
print(f"examples number: {prepared_data.shape[0]}")
prepared_data.head()

# 2 Run ML experiments

## 2.1 Run cross-validation
As an example, cross-validation with grouped overlap resampling is run here for logistic regression, SVC and random forest classifiers

### 2.2.1 Initialize experiment workflow and estimators

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

from source.processes import Shuffler

In [5]:
# Hyperparameters are kept in separate dicts because they are stored with the
# results in section 3.2.1.
LR_params = {'C': 10}
SVC_params = {'C': 10}
RFC_params = {'n_estimators': 200}

# set_params() returns the estimator itself, so creation and configuration chain.
logit = LogisticRegression().set_params(**LR_params)
svc = SVC().set_params(**SVC_params)
rfc = RandomForestClassifier().set_params(**RFC_params)

# Show the last configured estimator as the cell output.
rfc

RandomForestClassifier(n_estimators=200)

In [6]:
# Labels and group ids come straight from the prepared table; the rest are features.
groups = prepared_data['group'].values
y = prepared_data['target'].values
X = prepared_data.drop(['target', 'group'], axis=1).values

# NOTE(review): the scaler is fitted on the full dataset before cross-validation, so
# test-fold statistics influence the scaling - consider fitting it per fold.
X_scaled = StandardScaler().fit_transform(X)

# Dict of scorers in the format expected by the `scoring` argument of cross_validate().
scores = iterators.Metrics.get_scorers_dict()

X.shape

(1120, 204)

### 2.2.2 run cross-validations for each estimator
We run 100 fits for each estimator in this example

In [7]:
%%time
cv = Shuffler.OverlapGroupCV(train_size=0.63, n_repeats=100).split(X_scaled, y, groups)
logit_cv_results = cross_validate(logit, X_scaled, y, cv=cv, scoring=scores, groups=groups)
print(f"Logistic regression mean F1 score: {np.mean(logit_cv_results['test_f1'])}")

Logistic regression mean F1 score: 0.841052024168488
Wall time: 6.29 s


In [8]:
%%time
cv = Shuffler.OverlapGroupCV(train_size=0.63, n_repeats=100).split(X_scaled, y, groups)
svc_cv_results = cross_validate(svc, X_scaled, y, cv=cv, scoring=scores, groups=groups)
print(f"SVC mean F1 score: {np.mean(svc_cv_results['test_f1'])}")

SVC mean F1 score: 0.7445571254312032
Wall time: 5.11 s


In [9]:
%%time
cv = Shuffler.OverlapGroupCV(train_size=0.63, n_repeats=100).split(X_scaled, y, groups)
rfc_cv_results = cross_validate(rfc, X_scaled, y, cv=cv, scoring=scores, groups=groups)
print(f"Random forest mean F1 score: {np.mean(rfc_cv_results['test_f1'])}")

Random forest mean F1 score: 0.7327410602124919
Wall time: 1min 2s


## 2.3 Run GridSearch
GridSearch for Logistic Regression tuning with bootstrapped samples

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from source.processes import Shuffler

In [47]:
# Labels, group ids and the feature matrix from the prepared table.
groups = prepared_data['group'].values
y = prepared_data['target'].values
X = prepared_data.drop(['target', 'group'], axis=1).values
X_scaled = StandardScaler().fit_transform(X)

# Untuned logistic regression; C is searched over 8 log-spaced values from 1e-3 to 1e4.
logit = LogisticRegression()
grid = {'C': np.logspace(-3, 4, 8)}

# Same resampling scheme as in the cross-validation section.
cv = Shuffler.OverlapGroupCV(
    train_size=0.63,
    n_repeats=100,
).split(X_scaled, y, groups)
gscv = GridSearchCV(
    estimator=logit,
    param_grid=grid,
    scoring='f1',
    cv=cv,
)

In [12]:
%%time
gscv.fit(X_scaled, y, groups)
print(gscv.best_params_)
print(gscv.best_score_)



{'C': 10000.0}
0.8749091756161085
Wall time: 26 s


# 3 Results postprocessing

## 3.1 extract bootstrap scores

### 3.1.1 extract CV scores
Here we extract the scores obtained in stage 2.2.2

In [10]:
import re

# cross_validate() stores each metric under the key 'test_<scorer name>'.
cv_scores_names = [f'test_{score}' for score in scores.keys()]


def _collect_cv_scores(cv_results, score_names):
    """Return the per-split scores and their means for one estimator.

    Parameters
    ----------
    cv_results : mapping
        Result of ``cross_validate``; must contain every key in ``score_names``.
    score_names : list of str
        Keys of the form ``'test_<metric>'`` to extract.

    Returns
    -------
    per_split : dict
        ``{metric: list of per-split values}`` with the ``'test_'`` prefix removed.
    mean : dict
        ``{metric: mean over splits}``.
    """
    # Strip only the leading 'test_' prefix; an unanchored pattern would also delete
    # 'test_' from inside a scorer name.
    per_split = {
        re.sub(r"^test_", "", name): list(cv_results[name])
        for name in score_names
    }
    mean = {metric: np.mean(values) for metric, values in per_split.items()}
    return per_split, mean


logit_cv_scores, logit_mean_scores = _collect_cv_scores(logit_cv_results, cv_scores_names)
print(f"LR mean scores: {logit_mean_scores}")

svc_cv_scores, svc_mean_scores = _collect_cv_scores(svc_cv_results, cv_scores_names)
print(f"SVC mean scores: {svc_mean_scores}")

rfc_cv_scores, rfc_mean_scores = _collect_cv_scores(rfc_cv_results, cv_scores_names)
print(f"RFC mean scores: {rfc_mean_scores}")

LR mean scores: {'accuracy': 0.9672857142857141, 'precision': 0.9600499700732599, 'recall': 0.7603999999999997, 'f1': 0.841052024168488, 'TPR': 0.7603999999999997, 'TNR': 0.9952432432432431}
SVC mean scores: {'accuracy': 0.9509047619047619, 'precision': 0.9424801908679588, 'recall': 0.6327999999999999, 'f1': 0.7445571254312032, 'TPR': 0.6327999999999999, 'TNR': 0.9938918918918919}
RFC mean scores: {'accuracy': 0.9518095238095239, 'precision': 0.9979512288786482, 'recall': 0.596, 'f1': 0.7327410602124919, 'TPR': 0.596, 'TNR': 0.9998918918918919}


## 3.2 Save experiment data to data model

### 3.2.1 Save cross-validation results
We save the CV results as BootstrapResults instances for further serialization (stage 3.2.3) and for creating ML experiment tracking tables (stage 3.2.4)

In [11]:
from source.datamodels.datamodels import BootstrapResults
from source.datamodels.iterators import Axes, Stats


def _cv_result(model_name, hyperparameters, mean_scores, per_split_scores):
    """Wrap one estimator's cross-validation outcome in a BootstrapResults record.

    Only the model name, its hyperparameters and its scores differ between the
    estimators; every other field describes the shared experiment setup.
    NOTE(review): the feature-extraction fields (use_signal, use_specter,
    specter_threshold, axes, stats) are hard-coded and are not derived from how
    `prepared_data` was actually built - keep them in sync manually.
    """
    return BootstrapResults(
        run_label="test cv run",
        model_name=model_name,
        hyperparameters=hyperparameters,
        use_signal=True,
        use_specter=True,
        specter_threshold=1000,
        axes=Axes.get_keys(),
        stats=Stats.get_keys(),
        predictions=None,
        scores=mean_scores,
        resampling_number=100,
        bootstrap_scores=per_split_scores,
    )


logit_result_obj = _cv_result("LR", LR_params, logit_mean_scores, logit_cv_scores)
svc_result_obj = _cv_result("SVC", SVC_params, svc_mean_scores, svc_cv_scores)
rfc_result_obj = _cv_result("RFC", RFC_params, rfc_mean_scores, rfc_cv_scores)

### 3.2.2 Plot results

In [None]:
from source.postprocessing.plotter import bar_plot

# One entry per estimator; `results` and `models` are listed in the same order.
# The metric names are taken from the logistic-regression score dict.
bar_plot(
    results=[logit_mean_scores, rfc_mean_scores, svc_mean_scores],
    models=['LR', 'RF', 'SVM'],
    metrics=list(logit_mean_scores.keys()),
    plot_size=(8, 12),
    Title='Test CV mean scores',
    filename='test_cv_mean_scores.png',
    filepath=os.path.join(root, images_path),
)

### 3.2.3 Results serialization
We serialize objects obtained in stage 3.2.1 for full reproducibility of experiments

In [12]:
from source.postprocessing.mljson import serialize_results


# Result objects from stage 3.2.1, in the order LR, SVC, RFC.
results_objects = [
    logit_result_obj,
    svc_result_obj,
    rfc_result_obj,
]
# Output locations: the "CV" sub-folder for the JSON files and the tables folder.
ResultTablesPath = os.path.join(root, tables_path)
CV_results_path = os.path.join(root, bootstrap_jsons_path, "CV")

In [13]:
# One JSON file per result object; file names follow the order of `results_objects`.
serialize_results(
    results=results_objects,
    filenames=['logit_test.json', 'svc_test.json', 'rfc_test.json'],
    filepath=CV_results_path,
)

### 3.2.4 Write results to Excel table

In [14]:
from source.postprocessing.mlcsv import generate_csv_from_results, create_readable_xlsx


# Step 1: write all result objects into a single CSV table.
generate_csv_from_results(
    results_objects,
    csv_name='csv_test.csv',
    results_type=BootstrapResults,
    csv_path=ResultTablesPath,
)
# Step 2: build the XLSX file from that CSV, in the same folder.
create_readable_xlsx(
    xlsx_name='xlsx_test.xlsx',
    csv_name='csv_test.csv',
    xlsx_path=ResultTablesPath,
    csv_path=ResultTablesPath,
)

In [16]:
# Read the generated workbook back to check what was written.
table = pd.read_excel(
    os.path.join(root, tables_path, 'xlsx_test.xlsx'),
)
table.head()

Unnamed: 0,experiment index,run_label,model_name,use_signal,use_specter,specter_threshold,resampling_number,Scores: accuracy,Scores: precision,Scores: recall,...,Statistics: energy,Statistics: hurst,Statistics: petrosian_fd,Statistics: zero_crossing,Statistics: higuchi_fd,Statistics: activity,Statistics: complexity,Statistics: crest_factor,Hyperparameters: C,Hyperparameters: n_estimators
0,0,test cv run,LR,Yes,Yes,1000,100,97,958,788,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,100.0,
1,1,test cv run,SVC,Yes,Yes,1000,100,95,949,622,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,100.0,
2,2,test cv run,RFC,Yes,Yes,1000,100,955,10,624,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,,2000.0
