In [None]:
import os

import numpy as np
import pandas as pd

from source.preprocessing import splitter, Converter
from source.datamodels import iterators

# Preprocessing
Load datasets, convert third-party data files to our format, etc.

## Data loading

In [14]:
# Root folder of the project. The default is the original author's local path;
# set the BEARINGS_PROJECT_FOLDER environment variable to run the notebook on
# another machine without editing this cell.
project_folder = os.environ.get("BEARINGS_PROJECT_FOLDER", "F:/PythonNotebooks/Study/Quantum/Bearings/")
# Folder with the datasets obtained from our own experiments.
own_data_path = os.path.join(project_folder, "data/own datasets/")
# Folder with the datasets received from third parties.
third_party_data_path = os.path.join(project_folder, "data/third party datasets/")

### Load our initial datasets
Datasets obtained from our experiments

In [None]:
# Signals file: read with the default (comma) separator.
signals_dataset = pd.read_csv(
    os.path.join(own_data_path, 'bearing_signals.csv'),
)
# Classes file: ';'-separated; file line index 1 (the line right after the
# header row) is skipped.
classes_dataset = pd.read_csv(
    os.path.join(own_data_path, 'bearing_classes.csv'),
    delimiter=';',
    skiprows=[1],
)

### Third-party datasets
Load converted third-party datasets

---
## Signals and classes datasets join
Use this step to combine our signals and classes datasets into one.

In [None]:
# Lookup table bearing_id -> status, built from the classes dataset.
targets_map = {
    bearing_id: status
    for bearing_id, status in zip(classes_dataset['bearing_id'], classes_dataset['status'])
}
# Label every signal row through its 'bearing_2_id'; ids absent from the lookup map to NaN.
targets_vector = signals_dataset['bearing_2_id'].map(targets_map)

# Work on a copy so signals_dataset stays untouched, and put the label first.
joined_dataset = signals_dataset.copy()
joined_dataset.insert(loc=0, column='target', value=targets_vector)

# Persist the combined dataset next to the source files.
joined_dataset.to_csv(os.path.join(own_data_path, 'bearings.csv'))

---
## Convert third-party data files to our standard dataframe view

In [None]:
# Convert each third-party dataset folder with the converter matching its source.
cesar_1_path = os.path.join(third_party_data_path, 'Bearings_cesar_1')
cesar_1 = Converter.cesar_convert(cesar_1_path)

# Fixed copy-paste slip: this path used to point at 'Bearings_cesar_1' again,
# which made cesar_2 a duplicate of cesar_1.
# NOTE(review): folder name assumed to mirror 'Bearings_cesar_1' - confirm it exists.
cesar_2_path = os.path.join(third_party_data_path, 'Bearings_cesar_2')
cesar_2 = Converter.cesar_convert(cesar_2_path)

luigi_path = os.path.join(third_party_data_path, 'Bearings_luigi')
luigi = Converter.luigi_convert(luigi_path)

---
## Split datasets
Split the datasets into chunks and compute a set of statistical features for each chunk.

In [None]:
%% time

# stats = ['mean', 'std']  # You can directly input statistics names
stats = iterators.Stats.get_keys()  # If you need to calculate all supported statistics
splitter = splitter.Splitter(use_signal=True, use_specter=True, specter_threshold=1000, stats=stats)
prepared_data = splitter.split_dataset(joined_dataset, stable_area=(10, 19), splits_number=10,
                                       signal_data_columns=['a1_x', 'a1_y', 'a1_z', 'a2_x', 'a2_y', 'a2_z'])
print(f"features number: {prepared_data.shape[1]-2}")
print(f"examples number: {prepared_data.shape[0]}")
print(prepared_data.head())

# Run ML experiment
As an example, cross-validation with grouped overlap resampling is run here over logistic regression, SVC and random forest classifiers.

## Initialize experiment workflow

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

from source.processes import Shuffler

In [None]:
# Separate the feature matrix from the label and group columns.
X = prepared_data.drop(columns=['target', 'group'])
y = prepared_data['target']
groups = prepared_data['group']

cv = Shuffler.OverlapGroupCV(train_size=0.7, n_repeats=100)
logit = LogisticRegression(C=0.01)
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# statistics of the test folds leak into training; consider scaling inside a
# Pipeline that is passed to cross_validate instead.
X_scaled = StandardScaler().fit_transform(X)

scores = iterators.Metrics.get_scorers_dict()  # Get dict of scores in format required by cross_validate() scoring field

# Call head(): printing the bare attribute shows the bound method, not the first rows.
print(X.head())

## Run cross-validation

In [None]:
# `groups` is passed through to the splitter's split() call.
# NOTE(review): assumes OverlapGroupCV needs the group labels, since the
# original cell computed them but never used them - confirm.
cv_results = cross_validate(logit, X_scaled, y, groups=groups, cv=cv, scoring=scores)
print(sorted(cv_results.keys()))

# With a dict passed as `scoring`, cross_validate returns one 'test_<name>'
# entry per scorer and no plain 'test_score' key, so report each of them.
for result_key in sorted(cv_results):
    if result_key.startswith('test_'):
        print(result_key, cv_results[result_key])