In [33]:
import os

import numpy as np
import pandas as pd

from source.processes import Shuffler
from source.datamodels.datamodels import GridSearchResults
from source.datamodels.datamodels import Axes, Stats

In [10]:
# Root folder of the project; both dataset locations below are resolved against it.
project_folder = "F:/PythonNotebooks/Study/Quantum/Bearings/"

# Folders holding our own recordings and the third-party recordings, respectively.
own_data_path, third_party_data_path = (
    os.path.join(project_folder, dataset_subfolder)
    for dataset_subfolder in ("data/own datasets/", "data/third party datasets/")
)

In [11]:
# Feature table computed from our own experiment.
own_features_csv = os.path.join(own_data_path, 'processed_full_signal_specter1000_noscale.csv')
prepared_data = pd.read_csv(own_features_csv, sep=',')

# Alternative input: the third-party dataset. Uncomment to use it instead of our experiment.
# prepared_data = pd.read_csv(os.path.join(third_party_data_path, 'N1 Cesar Ricardo', 'csv',
#                                          'processed_full_signal_specter1000_noscale.csv'), delimiter=',')

# Quick visual check of the loaded columns.
prepared_data.head()

Unnamed: 0,target,group,a1_x_signal_complexity,a1_x_signal_shannon_entropy,a1_x_signal_kurtosis,a1_x_signal_variation,a1_x_signal_hurst,a1_x_signal_skew,a1_x_signal_activity,a1_x_signal_iqr,...,a2_z_specter_iqr,a2_z_specter_zero_crossing,a2_z_specter_range,a2_z_specter_mean,a2_z_specter_petrosian_fd,a2_z_specter_higuchi_fd,a2_z_specter_crest_factor,a2_z_specter_energy,a2_z_specter_std,a2_z_specter_sample_entropy
0,0.0,1.0,1.865568,6.38802,-0.337526,-9.989303,0.62617,-0.231906,0.51953,3.006134,...,130.212247,0.0,430.516204,121.609951,1.024136,1.709899,2.865578,22610590.0,88.439848,2.754299
1,0.0,1.0,1.812863,6.507361,-0.329388,-8.479932,0.633813,0.04648,0.51992,3.270674,...,79.932721,0.0,339.669032,116.951076,1.024806,1.75462,2.630547,16832350.0,56.167537,2.881832
2,0.0,1.0,1.775775,6.489806,-0.439591,-9.585973,0.625745,0.014204,0.531458,3.174478,...,87.487539,0.0,359.727237,126.969372,1.022686,1.718792,2.593399,19717600.0,59.969811,2.874073
3,0.0,1.0,1.912905,6.422261,-0.364524,-9.033494,0.606942,-0.14576,0.51209,2.982085,...,78.688634,0.0,353.579137,108.447712,1.025371,1.746161,2.880197,15475130.0,60.944452,2.780298
4,0.0,1.0,1.868826,6.449559,0.017384,-8.48262,0.631058,-0.149621,0.526677,2.982085,...,83.595954,0.0,331.560162,112.269206,1.02424,1.762398,2.66519,15966520.0,57.984006,2.852191


In [12]:
# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [13]:
# Base (untuned) estimators; their hyperparameters are filled in by the grid search.
# Seed shared by the estimators that draw random numbers while fitting, so that
# re-running the notebook reproduces the same scores.
RANDOM_STATE = 42

# max_iter is raised far above the scikit-learn default so the solver has room to converge.
LR_estimator = LogisticRegression(max_iter=10000)
SVC_estimator = SVC()
RFC_estimator = RandomForestClassifier(random_state=RANDOM_STATE)
GBM_estimator = GradientBoostingClassifier(random_state=RANDOM_STATE)
KNN_estimator = KNeighborsClassifier()

In [39]:
# Hyperparameter search spaces, one per estimator.
# Regularisation strength C is swept over ten decades-spaced values from 1e-3 to 1e6.
LR_grid = dict(C=np.logspace(start=-3, stop=6, num=10))
SVC_grid = dict(
    C=np.logspace(start=-3, stop=6, num=10),
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)
RFC_grid = dict(
    n_estimators=[20, 50, 100, 200, 500],
    max_depth=[1, 3, 5, 9, 15, 30, None],
)
# Learning rate is swept over eight decade-spaced values from 1e-3 to 1e4.
GBM_grid = dict(
    learning_rate=np.logspace(start=-3, stop=4, num=8),
    n_estimators=[50, 100, 200, 500],
    max_depth=[1, 3, 5, 9, 15, 30, None],
)
KNN_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9, 15, 35],
    p=[1, 1.5, 2, 2.5, 6, 100],
)

In [40]:
# Columns that carry the label and the grouping key rather than measurements.
non_feature_columns = ['target', 'group']

# The CSV was written together with its row index, which pandas reads back as an
# "Unnamed: 0" column (visible in the head() output). Row position is not a
# measurement and may correlate with the label if the file is ordered, so it must
# not be used as a feature.
index_like_columns = [col for col in prepared_data.columns if str(col).startswith('Unnamed:')]

X = prepared_data.drop(columns=non_feature_columns + index_like_columns).values
y = prepared_data['target'].values
groups = prepared_data['group'].values

# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# held-out folds influence the scaling statistics. Consider fitting it inside the
# CV loop (e.g. via a Pipeline) — left unchanged here to keep the grid keys as they are.
X_scaled = StandardScaler().fit_transform(X)

In [None]:
from datetime import datetime

# Number of resampling repetitions requested from the CV splitter; also recorded
# in the stored results so the two can never drift apart.
n_repeats = 100

# Collected GridSearchResults, one entry per tuned model.
GS_results_obj = []
estimators = [LR_estimator]
grids = [LR_grid]
names = ['LR']
score_name = 'f1'

for estimator, grid, name in zip(estimators, grids, names):
    start_time = datetime.now()
    cv = Shuffler.OverlapGroupCV(train_size=0.63, n_repeats=n_repeats).split(X_scaled, y, groups)
    # Tune the estimator/grid of the current iteration (not a fixed model), so the
    # loop stays correct when more models are added to the lists above.
    GSCV = GridSearchCV(estimator, grid, scoring=score_name, cv=cv)
    GSCV.fit(X_scaled, y, groups=groups)

    print(f"Best {score_name} score for {name}: {100*GSCV.best_score_:.3f}% \n best params: {GSCV.best_params_}\n required time: {datetime.now() - start_time}")

    # Row of cv_results_ that belongs to the best parameter set. `rank_test_score[0]`
    # would be the rank of the *first* candidate, which is not an index of the best one.
    best_index = GSCV.best_index_
    # One 'split<k>_test_score' entry exists per CV split actually evaluated.
    split_scores_names = [f'split{idx}_test_score' for idx in range(GSCV.n_splits_)]

    # Per-split test scores of the best candidate, i.e. its score distribution.
    scores_distr = [GSCV.cv_results_[split_scores_name][best_index]
                    for split_scores_name in split_scores_names]

    res = GridSearchResults(
        run_label = "GS",
        model_name = name,
        hyperparameters = GSCV.best_params_,
        hyperparameters_grid = grid,
        use_signal = True,
        use_specter = True,
        specter_threshold = 1000,
        axes = Axes.get_keys(),
        stats = Stats.get_keys(),
        predictions = None,
        scores = {score_name: GSCV.best_score_},
        resampling_number = n_repeats,
        bootstrap_scores =  {score_name: scores_distr}
    )
    GS_results_obj.append(res)