In [1]:
import os

import numpy as np
import pandas as pd

from source.preprocessing import splitter, converter
from source.datamodels import iterators

# 1 Preprocessing
Load datasets, convert third-party data files to our format, etc.

## 1.1 Data loading

In [2]:
# Root folder of the project. Set the BEARINGS_PROJECT_FOLDER environment variable to
# run the notebook on another machine; the original local path remains the default.
project_folder = os.environ.get("BEARINGS_PROJECT_FOLDER", "F:/PythonNotebooks/Study/Quantum/Bearings/")
# Datasets recorded in our own experiments.
own_data_path = os.path.join(project_folder, "data/own datasets/")
# Datasets received from third parties.
third_party_data_path = os.path.join(project_folder, "data/third party datasets/")

### 1.1.1 Load our initial datasets
Datasets obtained from our experiments

In [None]:
# Raw signals: read with pandas' default (comma) separator.
signals_csv = os.path.join(own_data_path, 'bearing_signals.csv')
signals_dataset = pd.read_csv(signals_csv)

# Bearing classes: semicolon-separated; the second line of the file is skipped.
classes_csv = os.path.join(own_data_path, 'bearing_classes.csv')
classes_dataset = pd.read_csv(classes_csv, sep=';', skiprows=[1])

### 1.1.2 Load our joined datasets
Datasets obtained from our experiments, with signals and class labels already joined into `bearings.csv` (see section 1.2)

In [4]:
# bearings.csv is written in section 1.2 by DataFrame.to_csv() with pandas defaults:
# comma separator, row index stored as the first column, and no extra row under the header.
# Read it back symmetrically; the earlier delimiter=';' / skiprows=[1] options would
# collapse every row into a single column and drop the first sample.
full_dataset = pd.read_csv(os.path.join(own_data_path, 'bearings.csv'), index_col=0)

### 1.1.3 Third-party datasets
Load converted third-party datasets

### 1.1.4 Load dataset with statistics
Dataset is ready for experiments

In [3]:
# Pre-computed feature table (one row per example, with 'target' and 'group' columns).
prepared_data_csv = os.path.join(own_data_path, 'processed_full_signal_specter1000_noscale.csv')
prepared_data = pd.read_csv(prepared_data_csv, sep=',')
prepared_data.head()

Unnamed: 0,target,group,a1_x_signal_complexity,a1_x_signal_shannon_entropy,a1_x_signal_kurtosis,a1_x_signal_variation,a1_x_signal_hurst,a1_x_signal_skew,a1_x_signal_activity,a1_x_signal_iqr,...,a2_z_specter_iqr,a2_z_specter_zero_crossing,a2_z_specter_range,a2_z_specter_mean,a2_z_specter_petrosian_fd,a2_z_specter_higuchi_fd,a2_z_specter_crest_factor,a2_z_specter_energy,a2_z_specter_std,a2_z_specter_sample_entropy
0,0.0,1.0,1.865568,6.38802,-0.337526,-9.989303,0.62617,-0.231906,0.51953,3.006134,...,130.212247,0.0,430.516204,121.609951,1.024136,1.709899,2.865578,22610590.0,88.439848,2.754299
1,0.0,1.0,1.812863,6.507361,-0.329388,-8.479932,0.633813,0.04648,0.51992,3.270674,...,79.932721,0.0,339.669032,116.951076,1.024806,1.75462,2.630547,16832350.0,56.167537,2.881832
2,0.0,1.0,1.775775,6.489806,-0.439591,-9.585973,0.625745,0.014204,0.531458,3.174478,...,87.487539,0.0,359.727237,126.969372,1.022686,1.718792,2.593399,19717600.0,59.969811,2.874073
3,0.0,1.0,1.912905,6.422261,-0.364524,-9.033494,0.606942,-0.14576,0.51209,2.982085,...,78.688634,0.0,353.579137,108.447712,1.025371,1.746161,2.880197,15475130.0,60.944452,2.780298
4,0.0,1.0,1.868826,6.449559,0.017384,-8.48262,0.631058,-0.149621,0.526677,2.982085,...,83.595954,0.0,331.560162,112.269206,1.02424,1.762398,2.66519,15966520.0,57.984006,2.852191


---
## 1.2 Signals and classes datasets join
Use this section to combine our signals and classes datasets into one

In [None]:
# Lookup table: bearing id -> status, built from the classes dataset.
status_by_bearing = {
    bearing_id: status
    for bearing_id, status in zip(classes_dataset['bearing_id'], classes_dataset['status'])
}

# Work on a copy so signals_dataset itself stays untouched, then put the label
# (status of the bearing referenced by 'bearing_2_id') in the first column.
joined_dataset = signals_dataset.copy()
joined_dataset.insert(
    loc=0,
    column='target',
    value=signals_dataset['bearing_2_id'].map(status_by_bearing),
)

# Persist the joined table next to the source datasets.
joined_dataset.to_csv(os.path.join(own_data_path, 'bearings.csv'))

---
## 1.3 Convert third-party data files to our standard dataframe view

In [None]:
# Convert each third-party dataset folder with its matching Converter method.
cesar_1_path = os.path.join(third_party_data_path, 'Bearings_cesar_1')
cesar_1 = converter.Converter.cesar_convert(cesar_1_path)

# Fixed copy-paste slip: this used to point at 'Bearings_cesar_1' again, so cesar_2
# was just a second copy of cesar_1.
cesar_2_path = os.path.join(third_party_data_path, 'Bearings_cesar_2')
cesar_2 = converter.Converter.cesar_convert(cesar_2_path)

luigi_path = os.path.join(third_party_data_path, 'Bearings_luigi')
luigi = converter.Converter.luigi_convert(luigi_path)

---
## 1.4 Split datasets
Split datasets into chunks and evaluate a set of statistical features for each chunk

In [None]:
%% time

# stats = ['mean', 'std']  # You can directly input statistics names
stats = iterators.Stats.get_keys()  # If you need to calculate all supported statistics
splitter = splitter.Splitter(use_signal=True, use_specter=True, specter_threshold=1000, stats=stats)
prepared_data = splitter.split_dataset(joined_dataset, stable_area=(10, 19), splits_number=10,
                                       signal_data_columns=['a1_x', 'a1_y', 'a1_z', 'a2_x', 'a2_y', 'a2_z'])
print(f"features number: {prepared_data.shape[1]-2}")
print(f"examples number: {prepared_data.shape[0]}")
print(prepared_data.head())

# 2 Run ML experiments

## 2.2 Run cross-validation
As an example, cross-validation with grouped overlap resampling is launched here over a logistic regression classifier; the imported SVC and random forest classifiers can be evaluated the same way

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

from source.processes import Shuffler

In [38]:
# Split the prepared table into the feature matrix, the labels and the group ids.
feature_frame = prepared_data.drop(columns=['target', 'group'])
X = feature_frame.values
y = prepared_data['target'].values
groups = prepared_data['group'].values

# Logistic regression with the chosen hyperparameters (set_params returns the estimator).
LR_params = {'C': 10000}
logit = LogisticRegression().set_params(**LR_params)

# NOTE(review): the scaler is fitted on all rows before the data is split, so the
# held-out part of every split contributes to the scaling statistics.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Train/test splits produced by the project's grouped overlap resampling.
overlap_cv = Shuffler.OverlapGroupCV(train_size=0.63, n_repeats=100)
cv = overlap_cv.split(X_scaled, y, groups)

# Scorers in the dict format expected by the `scoring` argument of cross_validate().
scores = iterators.Metrics.get_scorers_dict()

X.shape

(1120, 204)

In [39]:
%%time
cv_results = cross_validate(logit, X_scaled, y, cv=cv, scoring=scores, groups=groups)
print(sorted(cv_results.keys()))

['fit_time', 'score_time', 'test_TNR', 'test_TPR', 'test_accuracy', 'test_f1', 'test_precision', 'test_recall']
Wall time: 4.03 s


## 2.3 Run GridSearch
GridSearch for Logistic Regression tuning with bootstrapped samples

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from source.processes import Shuffler

In [47]:
# Feature matrix, labels and group ids taken from the prepared table.
feature_frame = prepared_data.drop(columns=['target', 'group'])
X = feature_frame.values
y = prepared_data['target'].values
groups = prepared_data['group'].values

logit = LogisticRegression()

# NOTE(review): the scaler is fitted on all rows before the data is split, so the
# held-out part of every split contributes to the scaling statistics.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Train/test splits produced by the project's grouped overlap resampling.
overlap_cv = Shuffler.OverlapGroupCV(train_size=0.63, n_repeats=100)
cv = overlap_cv.split(X_scaled, y, groups)

# Candidate values for C: eight log-spaced points from 1e-3 to 1e4.
grid = {'C': np.logspace(start=-3, stop=4, num=8)}
gscv = GridSearchCV(estimator=logit, param_grid=grid, scoring='f1', cv=cv)

In [12]:
%%time
gscv.fit(X_scaled, y, groups)
print(gscv.best_params_)
print(gscv.best_score_)



{'C': 10000.0}
0.8749091756161085
Wall time: 26 s


# 3 Results postprocessing

## 3.1 Extract bootstrap scores

### 3.1.1 Extract CV scores

In [40]:
import re

# cross_validate() stores the per-split values of each scorer under 'test_<scorer name>'.
cv_scores_names = [f'test_{score}' for score in scores]

# Strip only the leading 'test_' prefix. The '^' anchor (plus count=1) keeps a scorer
# name that itself contains 'test_' intact; the unanchored pattern removed every occurrence.
cv_scores = {re.sub(r"^test_", "", key, count=1): list(cv_results[key]) for key in cv_scores_names}

# Average every metric over all resampling splits.
mean_scores = {name: np.mean(values) for name, values in cv_scores.items()}
print(mean_scores)

{'accuracy': 0.9708809523809525, 'precision': 0.959074065252024, 'recall': 0.7944, 'f1': 0.8617638423433636, 'TPR': 0.7944, 'TNR': 0.9947297297297295}


## 3.2 Save experiment data to data model

### 3.2.1 Save cross-validation results

In [41]:
from source.datamodels.datamodels import BootstrapResults
from source.datamodels.iterators import Axes, Stats

# Description of the cross-validation run. The signal/specter settings repeat the values
# passed to the Splitter in section 1.4, and resampling_number repeats the n_repeats
# value given to OverlapGroupCV.
result_fields = dict(
    run_label="test cv run",
    model_name="LR",
    hyperparameters=LR_params,
    use_signal=True,
    use_specter=True,
    specter_threshold=1000,
    axes=Axes.get_keys(),
    stats=Stats.get_keys(),
    predictions=None,
    scores=mean_scores,            # metric name -> mean over all splits
    resampling_number=100,
    bootstrap_scores=cv_scores,    # metric name -> list of per-split values
)
result = BootstrapResults(**result_fields)