In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split,GridSearchCV, StratifiedKFold
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, make_scorer, fbeta_score,precision_score
from imblearn.under_sampling import RandomUnderSampler
import time
import collections

In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.svm import SVC



In [3]:
X_train= pd.read_pickle('X_train.pkl')
y_train= pd.read_pickle('y_train.pkl')
X_test= pd.read_pickle('X_test.pkl')
y_test= pd.read_pickle('y_test.pkl')

In [4]:
counter = collections.Counter(y_train)
print(counter)
counter = collections.Counter(y_test)
print(counter)

Counter({-1: 1398, 1: 11})
Counter({-1: 350, 1: 3})


### PIPELINE: SVM -> SMOTE -> CV

1- Scoring = accuracy

In [5]:
# Create a pipeline that first applies SMOTE and then scales the data

# Create the pipeline
pipeline = ImbPipeline(steps=[
    ('smote_over', SMOTE(random_state=11, sampling_strategy=0.1)),  # Over-sampling
    ('smote_under', RandomUnderSampler(sampling_strategy=0.5)),     # Under-sampling
    ('scaler', StandardScaler()),                                   # Feature scaling
    ('svm',SVC())                                           # Classifier
])


# Define the parameter grid for the SVM classifier
# Note: Specify 'svm__' before each parameter name to indicate these parameters are for the SVM step in the pipeline
param_grid = {
    'smote_over__k_neighbors': [3,4,5],  # Parameters for SMOTE oversampling
    'svm__C': [0.1, 1, 10, 20],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]  # Relevant for 'poly' kernel
}
# Create a grid search object with cross-validation
grid_search = GridSearchCV(pipeline, param_grid,refit=True, cv=5, scoring='accuracy', verbose=10,n_jobs=-1)


In [13]:
start = time.time()
# Fit the grid search object to the training data (NOT the SMOTE-resampled data)
grid_search.fit(X_train, y_train)  # Use your original training data here
end = time.time()

dt = end - start
print('delta t =', dt)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[CV 2/5; 1/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 1/5; 1/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/5; 1/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 1/5; 2/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 4/5; 1/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/5; 2/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 4/5; 2/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 5/5; 1/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 5/5; 2/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 2/5; 2/108] START smote_over__k_neighbors=3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 1/5; 3/108] START smote_over__k_neighbors=3, svm__C=0.1, svm_

In [14]:
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)
# Access the best model
best_model = grid_search.best_estimator_

Best parameters:  {'smote_over__k_neighbors': 3, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'poly'}
Best score:  0.9921935337321118


In [15]:
best_model

In [16]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

Test Set Accuracy: 99.15%


In [17]:
# Print the classification report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.99      1.00      1.00       350
           1       0.00      0.00      0.00         3

    accuracy                           0.99       353
   macro avg       0.50      0.50      0.50       353
weighted avg       0.98      0.99      0.99       353



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[350   0]
 [  3   0]]


2- Scoring = f1

In [13]:
# Create a pipeline that first applies SMOTE and then scales the data
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=345)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define the parameter grid for the SVM classifier
# Note: Specify 'svm__' before each parameter name to indicate these parameters are for the SVM step in the pipeline
param_grid = {
    'smote__k_neighbors': [2,3,4],  # Example range; adjust based on your dataset characteristics
    'svm__C': [0.1, 1, 10, 20],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]  # Relevant for 'poly' kernel
}
# Create a grid search object with cross-validation
grid_search = GridSearchCV(pipeline, param_grid,refit=True, cv=5, scoring='f1', verbose=10,n_jobs=-1)

In [14]:
start = time.time()
# Fit the grid search object to the training data (NOT the SMOTE-resampled data)
grid_search.fit(X_train, y_train)  # Use your original training data here
end = time.time()

dt = end - start
print('delta t =', dt)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear


[CV 1/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 2/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 2/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 3/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 4/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 4/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.600 total time=   0.3s
[CV 5/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 5/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 1/5; 2/108] START smote__k_neig

In [15]:
# Print results
results = grid_search.cv_results_
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean test score: {mean_score:.3f} (+/-{std_score:.3f}) for {params}")

Mean test score: 0.120 (+/-0.240) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'linear'}
Mean test score: 0.265 (+/-0.221) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'rbf'}
Mean test score: 0.052 (+/-0.027) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'poly'}
Mean test score: 0.120 (+/-0.240) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'linear'}
Mean test score: 0.265 (+/-0.221) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'rbf'}
Mean test score: 0.035 (+/-0.006) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'poly'}
Mean test score: 0.120 (+/-0.240) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 4, 'svm__kernel': 'linear'}
Mean test score: 0.265 (+/-0.221) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 4, 'svm__kernel': 'rbf'}
Mean test score: 0.025 (+/-0.008) for

In [16]:
best_model
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)
# Access the best model
best_model = grid_search.best_estimator_

Best parameters:  {'smote__k_neighbors': 4, 'svm__C': 1, 'svm__degree': 2, 'svm__kernel': 'rbf'}
Best score:  0.37333333333333335


In [17]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

Test Set Accuracy: 98.58%


In [18]:
# Print the classification report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      0.99      0.99       350
           1       0.38      1.00      0.55         3

    accuracy                           0.99       353
   macro avg       0.69      0.99      0.77       353
weighted avg       0.99      0.99      0.99       353

Confusion Matrix:
[[345   5]
 [  0   3]]


3- Scoring = recall

In [19]:
# Create a pipeline that first applies SMOTE and then scales the data
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=345)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define the parameter grid for the SVM classifier
# Note: Specify 'svm__' before each parameter name to indicate these parameters are for the SVM step in the pipeline
param_grid = {
    'smote__k_neighbors': [2,3,4],  # Example range; adjust based on your dataset characteristics
    'svm__C': [0.1, 1, 10, 20],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]  # Relevant for 'poly' kernel
}
# Create a grid search object with cross-validation
grid_search = GridSearchCV(pipeline, param_grid,refit=True, cv=5, scoring='recall', verbose=10,n_jobs=-1)

In [20]:
start = time.time()
# Fit the grid search object to the training data (NOT the SMOTE-resampled data)
grid_search.fit(X_train, y_train)  # Use your original training data here
end = time.time()

dt = end - start
print('delta t =', dt)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear


[CV 1/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 2/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 2/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.3s
[CV 3/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 4/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 4/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=1.000 total time=   0.4s
[CV 5/5; 1/108] START smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 5/5; 1/108] END smote__k_neighbors=2, svm__C=0.1, svm__degree=2, svm__kernel=linear;, score=0.000 total time=   0.4s
[CV 1/5; 2/108] START smote__k_neig

In [21]:
# Print results
results = grid_search.cv_results_
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean test score: {mean_score:.3f} (+/-{std_score:.3f}) for {params}")

Mean test score: 0.200 (+/-0.400) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'linear'}
Mean test score: 0.600 (+/-0.490) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'rbf'}
Mean test score: 0.800 (+/-0.400) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'poly'}
Mean test score: 0.200 (+/-0.400) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'linear'}
Mean test score: 0.600 (+/-0.490) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'rbf'}
Mean test score: 1.000 (+/-0.000) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'poly'}
Mean test score: 0.200 (+/-0.400) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 4, 'svm__kernel': 'linear'}
Mean test score: 0.600 (+/-0.490) for {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 4, 'svm__kernel': 'rbf'}
Mean test score: 0.900 (+/-0.200) for

In [22]:
best_model
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)
# Access the best model
best_model = grid_search.best_estimator_

Best parameters:  {'smote__k_neighbors': 2, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'poly'}
Best score:  1.0


In [23]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

Test Set Accuracy: 57.79%


In [24]:
# Print the classification report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      0.57      0.73       350
           1       0.02      1.00      0.04         3

    accuracy                           0.58       353
   macro avg       0.51      0.79      0.38       353
weighted avg       0.99      0.58      0.72       353

Confusion Matrix:
[[201 149]
 [  0   3]]


4- Scoring = f1

In [45]:
# Create a pipeline that first applies SMOTE and then scales the data

pipeline = ImbPipeline(steps=[
    ('smote_over', SMOTE(random_state=11, sampling_strategy=0.1)),  # Over-sampling
    ('smote_under', RandomUnderSampler(sampling_strategy=0.5)),     # Under-sampling
    ('scaler', StandardScaler()),                                   # Feature scaling
    ('svm',SVC())                                           # Classifier
])


param_grid = {
    'smote_over__k_neighbors': [2,3,4],  # Parameters for SMOTE oversampling
    'smote_over__sampling_strategy': [0.1,0.2, 0.3, 0.4],
    'smote_under__sampling_strategy': [0.3,0.4, 0.5, 0.6],
    'svm__C': [0.1, 1, 10, 20],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]  # Relevant for 'poly' kernel
}

# Define the parameter grid for the SVM classifier
# Note: Specify 'svm__' before each parameter name to indicate these parameters are for the SVM step in the pipeline

# Create a grid search object with cross-validation
grid_search = GridSearchCV(pipeline, param_grid,refit=True, cv=5, scoring='f1', verbose=10,n_jobs=-1)

In [46]:
start = time.time()
# Fit the grid search object to the training data (NOT the SMOTE-resampled data)
grid_search.fit(X_train, y_train)  # Use your original training data here
end = time.time()
dt = end - start
print('delta t =', dt)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[CV 1/5; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 2/5; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 4/5; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 5/5; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/5; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 4/5; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[

540 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/ana

delta t = 1099.7411451339722


In [47]:
# Print results
results = grid_search.cv_results_
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean test score: {mean_score:.3f} (+/-{std_score:.3f}) for {params}")

Mean test score: 0.147 (+/-0.129) for {'smote_over__k_neighbors': 2, 'smote_over__sampling_strategy': 0.1, 'smote_under__sampling_strategy': 0.3, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'linear'}
Mean test score: 0.210 (+/-0.310) for {'smote_over__k_neighbors': 2, 'smote_over__sampling_strategy': 0.1, 'smote_under__sampling_strategy': 0.3, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'rbf'}
Mean test score: 0.000 (+/-0.000) for {'smote_over__k_neighbors': 2, 'smote_over__sampling_strategy': 0.1, 'smote_under__sampling_strategy': 0.3, 'svm__C': 0.1, 'svm__degree': 2, 'svm__kernel': 'poly'}
Mean test score: 0.060 (+/-0.080) for {'smote_over__k_neighbors': 2, 'smote_over__sampling_strategy': 0.1, 'smote_under__sampling_strategy': 0.3, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kernel': 'linear'}
Mean test score: 0.160 (+/-0.320) for {'smote_over__k_neighbors': 2, 'smote_over__sampling_strategy': 0.1, 'smote_under__sampling_strategy': 0.3, 'svm__C': 0.1, 'svm__degree': 3, 'svm__kern

In [48]:
best_model
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)
# Access the best model
best_model = grid_search.best_estimator_

Best parameters:  {'smote_over__k_neighbors': 3, 'smote_over__sampling_strategy': 0.2, 'smote_under__sampling_strategy': 0.3, 'svm__C': 1, 'svm__degree': 3, 'svm__kernel': 'rbf'}
Best score:  0.4133333333333333


In [49]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

Test Set Accuracy: 98.58%


Training results

In [None]:
# Print the classification report
y_pred = grid_search.predict(X_train)
print(classification_report(y_train, y_train))

cm = confusion_matrix(y_train, y_train)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Testing result

In [50]:
# Print the classification report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      0.99      0.99       350
           1       0.38      1.00      0.55         3

    accuracy                           0.99       353
   macro avg       0.69      0.99      0.77       353
weighted avg       0.99      0.99      0.99       353

Confusion Matrix:
[[345   5]
 [  0   3]]


4- Scoring = f2

In [7]:
# Create a pipeline that first applies SMOTE and then scales the data

pipeline = ImbPipeline(steps=[
    ('smote_over', SMOTE(random_state=11, sampling_strategy=0.1)),  # Over-sampling
    ('smote_under', RandomUnderSampler(sampling_strategy=0.5)),     # Under-sampling
    ('scaler', StandardScaler()),                                   # Feature scaling
    ('svm',SVC())                                           # Classifier
])

f2_scorer = make_scorer(fbeta_score, beta=2)
custom_precision_scorer = make_scorer(precision_score, zero_division=0)

params = {
    'smote_over__k_neighbors': [2,3,4],  # Parameters for SMOTE oversampling
    'smote_over__sampling_strategy': [0.1,0.2, 0.3, 0.4],
    'smote_under__sampling_strategy': [0.3,0.4, 0.5, 0.6],
    'svm__C': [0.1, 1, 10, 20],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]  # Relevant for 'poly' kernel
}

# Define the parameter grid for the SVM classifier
# Note: Specify 'svm__' before each parameter name to indicate these parameters are for the SVM step in the pipeline

# Create a grid search object with cross-validation

#grid_search = GridSearchCV(pipeline, param_grid,refit=True, cv=5, scoring='f1', ,n_jobs=-1)

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=params,
                           scoring={    'precision': custom_precision_scorer,
                                        'recall': 'recall',
                                        'f2_score': f2_scorer},
                           refit = 'f2_score',
                           cv=StratifiedKFold(n_splits=3,shuffle=True, random_state=11),
                           verbose=10,
                           n_jobs=-1)

In [8]:
start = time.time()
# Fit the grid search object to the training data (NOT the SMOTE-resampled data)
grid_search.fit(X_train, y_train)  # Use your original training data here
end = time.time()
dt = end - start
print('delta t =', dt)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits
[CV 2/3; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 1/3; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/3; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 1/3; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 2/3; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 3/3; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling

324 fits failed out of a total of 5184.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/ana

delta t = 562.2807829380035


In [11]:
# Print results
results = grid_search.cv_results_


In [12]:
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean test score: {mean_score:.3f} (+/-{std_score:.3f}) for {params}")

KeyError: 'mean_test_score'

In [13]:
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)
# Access the best model
best_model = grid_search.best_estimator_

Best parameters:  {'smote_over__k_neighbors': 3, 'smote_over__sampling_strategy': 0.2, 'smote_under__sampling_strategy': 0.5, 'svm__C': 1, 'svm__degree': 3, 'svm__kernel': 'rbf'}
Best score:  0.4716805200341006


In [14]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

Test Set Accuracy: 98.02%


In [15]:
# Print the classification report
y_pred = grid_search.predict(X_train)
print(classification_report(y_train, y_train))

cm = confusion_matrix(y_train, y_train)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1398
           1       1.00      1.00      1.00        11

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409

Confusion Matrix:
[[1398    0]
 [   0   11]]


In [16]:
# Print the classification report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       350
           1       0.30      1.00      0.46         3

    accuracy                           0.98       353
   macro avg       0.65      0.99      0.73       353
weighted avg       0.99      0.98      0.99       353

Confusion Matrix:
[[343   7]
 [  0   3]]


# Precision

In [18]:
# Create a pipeline that first applies SMOTE and then scales the data

pipeline = ImbPipeline(steps=[
    ('smote_over', SMOTE(random_state=11, sampling_strategy=0.1)),  # Over-sampling
    ('smote_under', RandomUnderSampler(sampling_strategy=0.5)),     # Under-sampling
    ('scaler', StandardScaler()),                                   # Feature scaling
    ('svm',SVC())                                           # Classifier
])

f2_scorer = make_scorer(fbeta_score, beta=2)
custom_precision_scorer = make_scorer(precision_score, zero_division=0)

params = {
    'smote_over__k_neighbors': [2,3,4],  # Parameters for SMOTE oversampling
    'smote_over__sampling_strategy': [0.1,0.2, 0.3, 0.4],
    'smote_under__sampling_strategy': [0.3,0.4, 0.5, 0.6],
    'svm__C': [0.1, 1, 10, 20],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]  # Relevant for 'poly' kernel
}

# Define the parameter grid for the SVM classifier
# Note: Specify 'svm__' before each parameter name to indicate these parameters are for the SVM step in the pipeline

# Create a grid search object with cross-validation

#grid_search = GridSearchCV(pipeline, param_grid,refit=True, cv=5, scoring='f1', ,n_jobs=-1)

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=params,
                           scoring={    'precision': custom_precision_scorer},
                           refit = 'precision',
                           cv=StratifiedKFold(n_splits=3,shuffle=True, random_state=11),
                           verbose=10,
                           n_jobs=-1)

In [19]:
start = time.time()
# Fit the grid search object to the training data (NOT the SMOTE-resampled data)
grid_search.fit(X_train, y_train)  # Use your original training data here
end = time.time()
dt = end - start
print('delta t =', dt)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits
[CV 1/3; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 2/3; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 3/3; 1/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=linear
[CV 1/3; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 2/3; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling_strategy=0.3, svm__C=0.1, svm__degree=2, svm__kernel=rbf
[CV 3/3; 2/1728] START smote_over__k_neighbors=2, smote_over__sampling_strategy=0.1, smote_under__sampling

324 fits failed out of a total of 5184.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/ana

delta t = 564.0956172943115


In [20]:
# Print results
results = grid_search.cv_results_

In [21]:
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean test score: {mean_score:.3f} (+/-{std_score:.3f}) for {params}")

KeyError: 'mean_test_score'

In [22]:
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)
# Access the best model
best_model = grid_search.best_estimator_

Best parameters:  {'smote_over__k_neighbors': 4, 'smote_over__sampling_strategy': 0.2, 'smote_under__sampling_strategy': 0.3, 'svm__C': 1, 'svm__degree': 4, 'svm__kernel': 'rbf'}
Best score:  0.4166666666666667


In [23]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

Test Set Accuracy: 98.58%


In [24]:
# Print the classification report
y_pred = grid_search.predict(X_train)
print(classification_report(y_train, y_train))

cm = confusion_matrix(y_train, y_train)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1398
           1       1.00      1.00      1.00        11

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409

Confusion Matrix:
[[1398    0]
 [   0   11]]


In [25]:
# Print the classification report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          -1       1.00      0.99      0.99       350
           1       0.38      1.00      0.55         3

    accuracy                           0.99       353
   macro avg       0.69      0.99      0.77       353
weighted avg       0.99      0.99      0.99       353

Confusion Matrix:
[[345   5]
 [  0   3]]
