In [55]:
import pandas as pd
import numpy as np
import warnings
from utils import utils_gn, utils_sevsn, utils_noah, utils_models
from sklearn.svm import NuSVR
import importlib
importlib.reload(utils_gn)
importlib.reload(utils_sevsn)
importlib.reload(utils_noah)
importlib.reload(utils_models)
warnings.filterwarnings('ignore')

In [56]:
# Load the Severson table, drop its own cycle_life column, then join the Noah
# table (whose columns include cycle_life, as the preview below shows).
df_raw = (
    utils_gn.read_data('severson_all.pkl')
    .drop(columns='cycle_life')
    .join(utils_gn.read_data('noah_all.pkl'))
)
# work on a copy so df_raw stays untouched
df = df_raw.copy()
df.head()

Unnamed: 0,mean_Q_100_10,log_mean_Q_100_10,var_Q_100_10,log_var_Q_100_10,min_Q_100_10,log_min_Q_100_10,skew_Q_100_10,log_skew_Q_100_10,kurt_Q_100_10,log_kurt_Q_100_10,...,dVdQpeak_maxmag_di_y_100m0,dVdQpeak_maxmag_di_y_diff,dVdQpeak_maxarea_di_y_0,dVdQpeak_maxarea_di_y_50,dVdQpeak_maxarea_di_y_100,dVdQpeak_maxarea_di_y_100m0,dVdQpeak_maxarea_di_y_diff,dSOHdCycCyc1,dSOHdCycCyc100,cycle_life
b1c0,-0.002873,-2.541602,1e-05,-5.014861,-0.00846,-2.072648,-0.532058,-0.274041,1.651688,0.217928,...,-0.005282,0.003144,-79.299389,-75.009448,-68.133388,11.166,2.586119,0.000388,-3.1e-05,1851.0
b1c1,-0.0041,-2.387257,1e-05,-5.01396,-0.011004,-1.958457,-0.429375,-0.367163,1.970884,0.294661,...,0.001686,0.000501,-75.511329,-73.411862,-60.405989,15.10534,10.906407,0.000396,-1.9e-05,2159.0
b1c2,-0.004487,-2.34807,1.8e-05,-4.737,-0.017216,-1.764058,-1.080194,0.033502,3.348637,0.524868,...,-0.005399,0.004043,-76.601709,-73.34594,-65.414446,11.187263,4.675725,0.000312,-1.3e-05,2236.0
b1c3,-0.007456,-2.127507,3.6e-05,-4.442613,-0.018961,-1.722149,-0.43905,-0.357486,1.904584,0.2798,...,-0.002219,0.012165,-80.446334,-67.731448,-55.749502,24.696832,-0.73294,0.000353,-1.3e-05,1433.0
b1c4,-0.00575,-2.240332,2.3e-05,-4.647744,-0.013958,-1.855177,-0.362548,-0.440634,1.666169,0.221719,...,0.002736,0.001597,-75.575598,-65.232594,-55.730088,19.84551,-0.840498,0.000347,-3e-05,1708.0


In [57]:
# keep only the cells whose cycle life is at most 1250
df = df.loc[df['cycle_life'].le(1250)]
df.index

Index(['b1c5', 'b1c6', 'b1c7', 'b1c9', 'b1c11', 'b1c14', 'b1c15', 'b1c16',
       'b1c17', 'b1c18',
       ...
       'b3c29', 'b3c30', 'b3c31', 'b3c34', 'b3c35', 'b3c36', 'b3c40', 'b3c41',
       'b3c43', 'b3c44'],
      dtype='object', length=113)

In [60]:
# index label of the row with the smallest cycle_life in df
df['cycle_life'].idxmin()

'b2c1'

In [None]:
# value passed as test_size to the model-fitting helpers in the cells below
test_size = 0.2

In [None]:
# Grid-search configuration: the estimator to tune, the candidate values for
# each NuSVR hyper-parameter, the scoring string and the cv setting.
estimator = NuSVR()
param_grid = dict(
    nu=[0.1, 0.3, 0.5, 0.7, 0.9],
    C=[0.001, 0.01, 0.1, 1.0],
    kernel=["linear", "poly", "rbf", "sigmoid"],
)
scoring = 'neg_mean_absolute_percentage_error'
cv = 3


def model_pipeline(df, algo, estimator, param_grid, fname, test_size=0.2, scoring='neg_mean_absolute_percentage_error', cv=3, plot=True, model_type=None, k_list=None):
    """Tune, fit and score a model for several feature-selection fractions.

    For every fraction ``k`` the hyper-parameters are tuned with
    ``utils_models.hyperparameter_tuning`` (feature selection enabled) and the
    winning parameters are handed to ``algo`` to fit and evaluate a model.

    Parameters
    ----------
    df
        Data passed unchanged to the tuning helper and to ``algo``.
    algo : callable
        Called with the keyword arguments ``df, test_size, feature_selection,
        scaling, params, plot, fname, k`` (plus ``model_type`` when one is
        given). Must return ``(model, metrics)`` where ``metrics[0]`` and
        ``metrics[1]`` are dicts; their values become the ``Train_*`` and
        ``Test_*`` columns respectively. A ``'MAPE'`` key is required because
        the best ``k`` is picked from ``Test_MAPE``.
    estimator, param_grid, scoring, cv
        Forwarded to ``utils_models.hyperparameter_tuning``.
    fname : str
        Prefix forwarded to ``algo``; the fraction as a whole-number
        percentage is appended for each ``k``.
    test_size : float
        Forwarded to ``algo``.
    plot : bool
        Forwarded to ``algo`` (previously ignored and always ``True``).
    model_type
        Forwarded to ``algo`` only when it is not ``None``.
    k_list : iterable of float, optional
        Fractions to evaluate. Defaults to
        ``[1.0, 0.9, 0.8, 0.6, 0.5, 0.4, 0.3, 0.2]``.

    Returns
    -------
    tuple
        ``(best_k, best_params_for_best_k, metric_data)`` where ``best_k`` is
        the fraction with the lowest ``Test_MAPE`` and ``metric_data`` is a
        DataFrame with one row per fraction.

    Raises
    ------
    ValueError
        If ``k_list`` is given but empty.
    """
    if k_list is None:
        k_list = [1.0, 0.9, 0.8, 0.6, 0.5, 0.4, 0.3, 0.2]
    else:
        k_list = list(k_list)
    if not k_list:
        raise ValueError("k_list must contain at least one feature fraction")

    # model_type is only forwarded when the caller supplied one
    extra_kwargs = {} if model_type is None else {'model_type': model_type}

    dict_of_opt_params = {}
    metric_list = []

    for k in k_list:

        # search for the best hyper-parameters
        best_param, _ = utils_models.hyperparameter_tuning(df=df,
                                                            estimator=estimator,
                                                            param_grid=param_grid,
                                                            scoring=scoring,
                                                            cv=cv,
                                                            feature_selection=True,
                                                            k=k)

        # store the best parameters in the dictionary
        dict_of_opt_params[k] = best_param

        # use the best parameters to build the model; round() guards against
        # float noise such as 0.57 * 100 == 56.99999999999999
        _, metrics = algo(df=df,
                          test_size=test_size,
                          feature_selection=True,
                          scaling=False,
                          params=best_param,
                          plot=plot,
                          fname=fname + str(int(round(k * 100))),
                          k=k,
                          **extra_kwargs)

        metric_list.append(list(metrics[0].values()) + list(metrics[1].values()))

    columns = [data + metric for data in ('Train_', 'Test_') for metric in metrics[0].keys()]
    # The index holds the fractions themselves (not percentages) so that the
    # idxmin() result below can be used directly as a key of dict_of_opt_params.
    metric_data = pd.DataFrame(data=np.array(metric_list), columns=columns, index=k_list)
    metric_data.index.name = 'Features used (%)'

    best_k = metric_data['Test_MAPE'].idxmin()

    return best_k, dict_of_opt_params[best_k], metric_data



### NuSVR without scaling and without feature selection

In [None]:
# Baseline fit: every feature, no scaling.
# NOTE(review): `params` is not assigned in any cell visible in this notebook;
# confirm where it is defined before relying on Restart & Run All.
model, metrics = utils_models.fit_nusvr(
    df=df,
    test_size=test_size,
    feature_selection=False,
    scaling=False,
    params=params,
    plot=True,
    fname='nusvr_model',
)

### NuSVR with scaling but without feature selection

In [None]:
# Same fit as the unscaled baseline, but with scaling switched on.
model, metrics = utils_models.fit_nusvr(
    df=df,
    test_size=test_size,
    feature_selection=False,
    scaling=True,
    params=params,
    plot=True,
    fname='nusvr_scaled_model',
)

### NuSVR with feature selection, with and without scaling, for $k = 0.9, 0.8, 0.6, 0.5, 0.4, 0.3, 0.2$

In [None]:
# feature fractions k that k_effect_on_model loops over and passes to fit_nusvr
k_list = [0.9, 0.8, 0.6, 0.5, 0.4, 0.3, 0.2]

In [None]:
def k_effect_on_model(scaling, fname):
    """Fit NuSVR once per fraction in the global ``k_list`` and tabulate the scores.

    Reads the notebook-level names ``k_list``, ``df``, ``test_size`` and
    ``params``; feature selection is always on and plotting is always enabled.

    Parameters
    ----------
    scaling : bool
        Forwarded to ``utils_models.fit_nusvr``.
    fname : str
        Prefix forwarded to ``fit_nusvr``; the percentage ``int(k*100)`` is
        appended for each fraction.

    Returns
    -------
    pandas.DataFrame
        One row per fraction (index = fraction * 100), with ``Train_*`` columns
        taken from ``metrics[0]`` and ``Test_*`` columns from ``metrics[1]``.
    """
    rows = []

    for fraction in k_list:
        print('Fitting model with {} percent of features...'.format(int(fraction*100)))
        _, metrics = utils_models.fit_nusvr(
            df=df,
            test_size=test_size,
            feature_selection=True,
            scaling=scaling,
            params=params,
            plot=True,
            fname=fname+str(int(fraction*100)),
            k=fraction,
        )
        # train scores first, then test scores, matching the column order below
        rows.append([*metrics[0].values(), *metrics[1].values()])

    column_names = []
    for prefix in ('Train_', 'Test_'):
        for metric_name in metrics[0].keys():
            column_names.append(prefix + metric_name)

    metric_data = pd.DataFrame(data=np.array(rows), columns=column_names, index=np.array(k_list)*100)
    metric_data.index.name = 'Features used (%)'

    return metric_data

In [None]:
# feature selection without scaling; displays the metric table for every k in k_list
k_effect_on_model(scaling=False, fname='nusvr_model_ftselection_k_')

In [None]:
# with scaling; displays the metric table for every k in k_list
k_effect_on_model(scaling=True, fname='nusvr_model_ftselection_scaled_k_')

### Perform repeated $k$-fold cross-validation on the feature-selection model with the best MAPE

In [None]:
# Repeated 3x3 k-fold cross-validation of a NuSVR built from `params`,
# with feature selection at k=0.3 and no scaling.
# NOTE(review): `params` is not assigned in any visible cell — confirm its source.
model = NuSVR(**params)
best_model_cross_val_scores = utils_models.repeated_kfold_cross_validation(
    model=model,
    df=df,
    n_splits=3,
    n_repeats=3,
    feature_selection=True,
    scaling=False,
    k=0.3,
)

best_model_cross_val_scores

### Hyper-parameter tuning

In [None]:
# Grid-search configuration (same values as in the cell that defines model_pipeline)
estimator = NuSVR()
param_grid = dict(
    nu=[0.1, 0.3, 0.5, 0.7, 0.9],
    C=[0.001, 0.01, 0.1, 1.0],
    kernel=["linear", "poly", "rbf", "sigmoid"],
)
scoring = 'neg_mean_absolute_percentage_error'
cv = 3

# tune the model that uses every feature (no feature selection)
best_param_all, _ = utils_models.hyperparameter_tuning(
    df, estimator, param_grid, scoring, cv,
    feature_selection=False,
    k=None,
)
print(best_param_all)

In [None]:
# tune the model that uses feature selection with k=0.3
best_param_selected, _ = utils_models.hyperparameter_tuning(
    df, estimator, param_grid, scoring, cv,
    feature_selection=True,
    k=0.3,
)
print(best_param_selected)

### Use the best hyper-parameter to rebuild the models and carry out repeated $k$-fold cross-validation

In [None]:
# refit on all features using the tuned parameters in best_param_all
best_model_all, metrics_all = utils_models.fit_nusvr(
    df=df,
    test_size=test_size,
    feature_selection=False,
    scaling=False,
    params=best_param_all,
    plot=True,
    fname='best_all_features_nusvr_model',
)

In [None]:
# refit with feature selection (k=0.3) using the tuned parameters in best_param_selected
best_model_selected, metrics_selected = utils_models.fit_nusvr(
    df=df,
    test_size=test_size,
    feature_selection=True,
    scaling=False,
    params=best_param_selected,
    plot=True,
    fname='best_selected_features_nusvr_model',
    k=0.3,
)

In [None]:
# Repeated (3 splits x 3 repeats) k-fold cross-validation of the grid-search
# winner that uses all features, without scaling.
model = NuSVR(**best_param_all)
utils_models.repeated_kfold_cross_validation(
    model=model,
    df=df,
    n_splits=3,
    n_repeats=3,
    feature_selection=False,
    scaling=False,
)



In [None]:
# Repeated (3 splits x 3 repeats) k-fold cross-validation of the grid-search
# winner that uses feature selection at k=0.3, without scaling.
model = NuSVR(**best_param_selected)
utils_models.repeated_kfold_cross_validation(
    model=model,
    df=df,
    n_splits=3,
    n_repeats=3,
    feature_selection=True,
    scaling=False,
    k=0.3,
)

