In [1]:
import pandas as pd

# Load the data set from a CSV file; the relative path is resolved against
# the current working directory of the kernel.
hb_data = pd.read_csv(filepath_or_buffer='HB_cl1_data.csv')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
import numpy as np
from sklearn.svm import SVC

# Hold out 20 % of the rows for the final evaluation; the fixed seed makes
# the split reproducible across runs.
train_set, test_set = train_test_split(
    hb_data,
    test_size=0.2,
    random_state=42,
)

# Separate the 'HB-type' target column from the feature columns.
X_train = train_set.drop(columns=['HB-type'])
y_train = train_set['HB-type']

# Standardise every numeric column. Columns that the selector does not pick
# are dropped, because ColumnTransformer's default is remainder='drop'.
num_pipeline = make_pipeline(StandardScaler())
preprocessing = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
    ]
)

# Baseline classifier: preprocessing followed by an RBF-kernel SVM.
model_svc = make_pipeline(
    preprocessing,
    SVC(kernel='rbf', C=1.0),
)
# Replace the pipeline above with the following line when ROC AUC or average
# precision is used as the scoring function (both need predict_proba):
#model_svc = make_pipeline(preprocessing, SVC(kernel='rbf', C=1.0, probability=True))
model_svc.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [3]:
# List every tunable parameter name of the pipeline, including the nested
# "<step>__<parameter>" names that a grid search expects (e.g. 'svc__C').
model_svc.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'columntransformer', 'svc', 'columntransformer__force_int_remainder_cols', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__num', 'columntransformer__num__memory', 'columntransformer__num__steps', 'columntransformer__num__transform_input', 'columntransformer__num__verbose', 'columntransformer__num__standardscaler', 'columntransformer__num__standardscaler__copy', 'columntransformer__num__standardscaler__with_mean', 'columntransformer__num__standardscaler__with_std', 'svc__C', 'svc__break_ties', 'svc__cache_size', 'svc__class_weight', 'svc__coef0', 'svc__decision_function_shape', 'svc__degree', 'svc__gamma', 'svc__kernel', 'svc__max_iter', 'svc__probability', 'svc__random_state', 'svc__shrinking', 'svc__tol

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score, make_scorer

# Candidate values for the SVM regularisation parameter C; the 'svc__' prefix
# addresses the SVC step inside the pipeline.
param_grid = [
    {'svc__C': [1, 10, 100, 1000, 5000, 10000, 50000]},
]

# F1 score computed with 'weak' as the positive class.
score_function = make_scorer(f1_score, pos_label='weak')
# Alternative scoring functions: uncomment one of the next two lines and also
# switch to the probability=True pipeline (the commented-out line in the second cell).
#score_function = make_scorer(roc_auc_score, response_method='predict_proba')
#score_function = make_scorer(average_precision_score, pos_label='weak', response_method='predict_proba')

# 5-fold cross-validated search over the grid on the training data.
grid_search = GridSearchCV(
    estimator=model_svc,
    param_grid=param_grid,
    cv=5,
    scoring=score_function,
)
grid_search.fit(X_train, y_train)

# Tabulate the cross-validation results with the best mean score first.
cv_hyperpara = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score",
    ascending=False,
)
cv_hyperpara.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.045349,0.015227,0.012455,0.003116,5000,{'svc__C': 5000},0.924242,0.939394,0.96124,0.945736,0.923077,0.938738,0.014221,1
3,0.029198,0.002215,0.011203,0.001143,1000,{'svc__C': 1000},0.924242,0.944882,0.936508,0.945736,0.923077,0.934889,0.009726,2
5,0.069331,0.03231,0.013762,0.003873,10000,{'svc__C': 10000},0.931298,0.924242,0.96124,0.932331,0.897638,0.92935,0.020305,3
2,0.019664,0.003385,0.012273,0.00311,100,{'svc__C': 100},0.947368,0.928,0.914729,0.924242,0.916031,0.926074,0.011748,4
1,0.017043,0.004098,0.009703,0.001449,10,{'svc__C': 10},0.940299,0.913386,0.936508,0.909091,0.925373,0.924931,0.012285,5


In [5]:
# Take the final model directly from the grid search instead of hand-copying
# the winning C value: GridSearchCV (default refit=True) has already refitted
# the best pipeline on the complete training data passed to grid_search.fit.
# This keeps the final model in sync with the search whenever the data, the
# parameter grid or the scoring function changes, and it avoids refitting the
# `preprocessing` instance that is shared with `model_svc`.
model_final = grid_search.best_estimator_
model_final

0,1,2
,steps,"[('columntransformer', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,5000
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [6]:
from sklearn.metrics import precision_score, recall_score

# Split the held-out test set into features and the 'HB-type' target.
X_test = test_set.drop(columns=['HB-type'])
y_test = test_set['HB-type']

# Predict the held-out labels with the final model.
predictions_test = model_final.predict(X_test)

# Precision and recall, both with 'weak' treated as the positive class.
precision_test = precision_score(
    y_true=y_test,
    y_pred=predictions_test,
    pos_label='weak',
)
print("The model has a precision of %0.3f on the test data set\n" % precision_test)

recall_test = recall_score(
    y_true=y_test,
    y_pred=predictions_test,
    pos_label='weak',
)
print("The model has a recall of %0.3f on the test data set\n" % recall_test)

The model has a precision of 0.946 on the test data set

The model has a recall of 0.956 on the test data set

