In [2]:
import pandas as pd 
d = pd.read_csv('data/raw_data/csv_data/valid_meta.csv')

In [2]:
#from src.classification import classification_model
import torch 
from src.TCL import tcl, MLP, Maxout


datasetECG_train = torch.load('data/dataset_data/dataset_1/dataset_train.pth')
datasetECG_val = torch.load('data/dataset_data/dataset_1/dataset_val.pth')
datasetECG_test = torch.load('data/dataset_data/dataset_1/dataset_test.pth')



In [3]:
model = torch.load('models/TCL/TCL_1/model.pth',map_location=torch.device('cpu'))

In [4]:
output_dim_model = 3
name_model = 'TCL_fake'
path_meta_data_train = 'data/raw_data/csv_data/train_meta.csv'
path_meta_data_val = 'data/raw_data/csv_data/valid_meta.csv'
path_meta_data_test = 'data/raw_data/csv_data/test_meta.csv'
sliding_window = 10
stride = 2
param_grid_svm =  {'svm__C': [0.1, 1, 10],
             'svm__kernel': ['linear', 'rbf', 'poly'], 
            'svm__gamma': ['scale', 'auto']}

In [22]:
from src.feature_extraction import *

import json
import joblib

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import PredefinedSplit, GridSearchCV


def gridsearch_SVM_pipeline(X_train,y_train, X_val, y_val, param_grid, scoring='accuracy', verbose=4):
    # Create an SVM pipeline
    svm_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC())
    ])
    split_index = [-1]*len(X_train) + [0]*len(X_val)
    X = np.concatenate((X_train, X_val), axis=0)
    y = np.concatenate((y_train, y_val), axis=0)
    pds = PredefinedSplit(test_fold = split_index)
    # Create a GridSearchCV object
    grid_search_model = GridSearchCV(svm_pipeline, param_grid, cv=pds, scoring=scoring, verbose=verbose)
    grid_search_model.fit(X,y)
    results_gs = grid_search_model.cv_results_
    best_params = grid_search_model.best_params_
    best_model = grid_search_model.best_estimator_
    return(results_gs, best_params, best_model)


def get_metrics(y_true, y_pred, metrics_to_compute=['accuracy', 'f1_score', 'recall', 'precision']):
    """
    Calculate specified classification metrics.

    Parameters:
    - y_true: true labels
    - y_pred: predicted labels
    - metrics_to_compute: list of metrics to calculate
    
    Returns:
    - metrics_dict: a dictionary containing calculated metrics
    """
    valid_metrics = ['accuracy', 'f1_score', 'recall', 'precision', 'roc_auc']
    
    assert all(metric in valid_metrics for metric in metrics_to_compute), "Invalid metric name in metrics_to_compute."

    metrics_dict = {}

    if 'accuracy' in metrics_to_compute:
        metrics_dict['accuracy'] = accuracy_score(y_true, y_pred)
    if 'f1_score' in metrics_to_compute:
        metrics_dict['f1_score'] = f1_score(y_true, y_pred)
    if 'recall' in metrics_to_compute:
        metrics_dict['recall'] = recall_score(y_true, y_pred)
    if 'precision' in metrics_to_compute:
        metrics_dict['precision'] = precision_score(y_true, y_pred)
    if 'roc_auc' in metrics_to_compute:
        metrics_dict['roc_auc'] = roc_auc_score(y_true, y_pred)

    return metrics_dict



def get_results(classif_model, X_train, y_train, X_test, y_test, metrics_to_compute=['accuracy', 'f1_score','recall','precision']):
    y_predict_train = classif_model(X_train)
    y_predict_test = classif_model(X_test)
    metrics_train = get_metrics(y_true=y_train, y_pred=y_predict_train, metrics_to_compute=metrics_to_compute)
    metrics_test = get_metrics(y_true=y_test,y_pred= y_predict_test, metrics_to_compute=metrics_to_compute)
    res = {'train' : metrics_train, 'test' : metrics_test}
    return res
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from torch.utils.data import DataLoader

def process_batch(batch, model, output_dim, sliding_window, stride):
    batch_x, batch_y, batch_z = batch
    output_patient_logit, features_patient = model(batch_x.float(), patient_id=batch_z)
    patient_id = batch_z.unique()
    assert patient_id.shape[0] == 1, 'Multiple patients in batch - resize batch size'
    features_patient_avg = compute_average_signal(features_patient.T, w=sliding_window, s=stride)
    return patient_id[0], features_patient_avg.detach().numpy()

def get_features_dataset_parallel(model, output_dim, dataset, sliding_window, stride, T=1000):
    batch_size = T
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    n_patient_dataset = len(loader)

    label_patients = np.zeros(n_patient_dataset)
    features_patients = np.zeros((n_patient_dataset, output_dim, get_number_windows(T, sliding_window, stride)))

    with ThreadPoolExecutor() as executor:
        batches = list(loader)
        results = list(executor.map(lambda batch: process_batch(batch, model, output_dim, sliding_window, stride), batches))

    for i, (patient_id, features_patient_avg) in enumerate(sorted(results, key=lambda x: x[0])):
        label_patients[i] = patient_id
        features_patients[i, :, :] = features_patient_avg

    return label_patients, features_patients

def classification_model(model, output_dim_model, 
                         sliding_window, stride,
                         datasetECG_train, datasetECG_val, datasetECG_test,
                         param_grid_svm, 
                         path_meta_data_train, path_meta_data_val, path_meta_data_test,
                         T = 1000,
                         metrics_to_compute = ['accuracy', 'f1_score','recall','precision'],
                         list_columns_classif = ['NORM'],
                         path_save_models= 'models/classif/'):
    df_train_meta = pd.read_csv(path_meta_data_train)
    df_val_meta = pd.read_csv(path_meta_data_val)
    df_test_meta = pd.read_csv(path_meta_data_test)
    print('everythind is read')
    id_patient_train, X_train = get_features_dataset_parallel(model, output_dim_model, datasetECG_train, sliding_window, stride, T=T)
    print('feature train')
    id_patient_val, X_val = get_features_dataset_parallel(model, output_dim_model, datasetECG_val, sliding_window, stride, T=T)
    print('feature_val')
    id_patient_test, X_test = get_features_dataset_parallel(model, output_dim_model, datasetECG_test, sliding_window, stride, T=T)
    print('feature_test')
    labels_train_dict = get_label_from_id_patient(id_patient_train, df_train_meta, list_columns_classif)
    print('label_train')
    labels_val_dict = get_label_from_id_patient(id_patient_val, df_val_meta, list_columns_classif)
    print('label_val')
    labels_test_dict = get_label_from_id_patient(id_patient_test, df_test_meta, list_columns_classif)
    print('label_test')
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_val = X_val.reshape(X_val.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)
    X_big_train = np.concatenate([X_train,X_val], axis=0)
    all_res = {}
    for col in list_columns_classif :
        print(col)
        path_save_classif = path_save_models + col + '/'
        y_train = labels_train_dict[col]
        y_train = y_train = y_train.ravel()
        y_val = labels_val_dict[col]
        y_val = y_val.ravel()
        y_test = labels_test_dict[col]
        y_test = y_test.ravel()
        y_big_train = np.concatenate([y_train,y_val], axis=0)
        results_gs, best_params, best_model = gridsearch_SVM_pipeline(X_train,y_train, X_val, y_val, param_grid_svm, scoring='accuracy', verbose=2)
        results_gs.to_csv(path_save_classif+'results_gs_SVM.csv')
        with open(path_save_classif+'best_params.json','w') as f:
            json.dump(best_params,f)
        res = get_results(classif_model= best_model,
                          X_train = X_big_train,
                          y_train= y_big_train,
                          X_test= X_test, 
                          y_test= y_test,
                          metrics_to_compute= metrics_to_compute)
        with open(path_save_classif+'res_best_model.json','w') as f:
            json.dump(res,f)
        model_filename = path_save_classif+'best_model.joblib'
        joblib.dump(best_model, model_filename)
        all_res[col] = res
    return(all_res)

In [23]:

classification_model(model, output_dim_model, 
                         sliding_window, stride,
                         datasetECG_train, datasetECG_val, datasetECG_test,
                         param_grid_svm, 
                         path_meta_data_train, path_meta_data_val, path_meta_data_test,
                         T = 1000,
                         metrics_to_compute = ['accuracy', 'f1_score','recall','precision'],
                         list_columns_classif = ['NORM'],
                         path_save_models= 'models/classif/')

everythind is read
feature train
feature_val
feature_test
label_train
label_val
label_test
NORM
(19634, 1488)
(17441, 1488)
Fitting 1 folds for each of 18 candidates, totalling 18 fits


  y = column_or_1d(y, warn=True)


[CV] END ...svm__C=0.1, svm__gamma=scale, svm__kernel=linear; total time=36.9min


  y = column_or_1d(y, warn=True)


[CV] END ......svm__C=0.1, svm__gamma=scale, svm__kernel=rbf; total time= 5.5min


  y = column_or_1d(y, warn=True)


[CV] END .....svm__C=0.1, svm__gamma=scale, svm__kernel=poly; total time= 5.6min


  y = column_or_1d(y, warn=True)


[CV] END ....svm__C=0.1, svm__gamma=auto, svm__kernel=linear; total time=32.6min


  y = column_or_1d(y, warn=True)


[CV] END .......svm__C=0.1, svm__gamma=auto, svm__kernel=rbf; total time= 4.6min


  y = column_or_1d(y, warn=True)


[CV] END ......svm__C=0.1, svm__gamma=auto, svm__kernel=poly; total time= 4.6min


  y = column_or_1d(y, warn=True)


[CV] END ....svm__C=1, svm__gamma=scale, svm__kernel=linear; total time=244.1min


  y = column_or_1d(y, warn=True)


[CV] END ........svm__C=1, svm__gamma=scale, svm__kernel=rbf; total time= 4.6min


  y = column_or_1d(y, warn=True)


[CV] END .......svm__C=1, svm__gamma=scale, svm__kernel=poly; total time= 4.7min


  y = column_or_1d(y, warn=True)


[CV] END .....svm__C=1, svm__gamma=auto, svm__kernel=linear; total time=243.7min


  y = column_or_1d(y, warn=True)


[CV] END .........svm__C=1, svm__gamma=auto, svm__kernel=rbf; total time= 4.6min


  y = column_or_1d(y, warn=True)


[CV] END ........svm__C=1, svm__gamma=auto, svm__kernel=poly; total time= 4.8min


  y = column_or_1d(y, warn=True)


In [5]:
import numpy as np
def get_label_from_id_patient(id_patient, df_meta, list_columns):
    df_ = df_meta.sort_values(by=['ecg_id'], ascending = True)
    id_in_meta = df_['ecg_id'].values
    nb_label_per_patient = df_['ecg_id'].value_counts().values
    unique_values = len(np.unique(nb_label_per_patient))
    assert unique_values == 1 , 'Error: some patients have multi labels'
    assert np.isin(id_patient, id_in_meta).all(), 'Error : some patients are not included in dataframe provided'
    d = {}
    df_subset = df_.loc[df_['ecg_id'].isin(id_patient),:]
    for col in list_columns:
        d[col] = df_subset[col].values
    return(d)

a =get_label_from_id_patient([10,20,8], d, ['NORM'])

In [6]:
a['NORM']

array([0, 1, 0])