The **preprocessing steps** (removing outliers and one-hot encoding categorical variables) have already been accomplished in the Random Forest Model Training so we'll just load in the cleaned dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [2]:
data_filepath = '../data/processed/cleaned_diabetes_one_hot_encoding.csv'
df = pd.read_csv(data_filepath)

# Separate the features and the target variable
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the SVM classifier
svm_model = SVC(kernel='linear', probability=True, random_state=42)

# Train the SVM model
svm_model.fit(X_train_scaled, y_train)

# Making predictions on the test set
y_pred = svm_model.predict(X_test_scaled)
y_pred_prob = svm_model.predict_proba(X_test_scaled)[:, 1]

# Compute and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.9646458987001345
ROC AUC: 0.9557248271750498
Confusion Matrix:
[[16450    55]
 [  576   767]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     16505
           1       0.93      0.57      0.71      1343

    accuracy                           0.96     17848
   macro avg       0.95      0.78      0.84     17848
weighted avg       0.96      0.96      0.96     17848



### Hyperparameter Tuning using GridSearchCV

In [3]:
'''
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(estimator=SVC(probability=True, random_state=42), param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)


Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best ROC AUC Score: 0.959424164012679


'''
hardcoded_best_params = {
    'C': 0.1,
    'gamma': 'scale',
    'kernel': 'linear'
}


In [4]:
svm_optimized = SVC(**hardcoded_best_params, probability=True, random_state=42)

# Train the SVM model using the scaled training data
svm_optimized.fit(X_train_scaled, y_train)

# Making predictions on the scaled test set
y_pred_opt = svm_optimized.predict(X_test_scaled)
y_pred_prob_opt = svm_optimized.predict_proba(X_test_scaled)[:, 1]

# Evaluate the optimized model
accuracy_opt = accuracy_score(y_test, y_pred_opt)
roc_auc_opt = roc_auc_score(y_test, y_pred_prob_opt)
conf_matrix_opt = confusion_matrix(y_test, y_pred_opt)
class_report_opt = classification_report(y_test, y_pred_opt)

# Print evaluation metrics
print(f"Optimized Model Accuracy: {accuracy_opt}")
print(f"Optimized Model ROC AUC: {roc_auc_opt}")
print("Optimized Model Confusion Matrix:\n", conf_matrix_opt)
print("Optimized Model Classification Report:\n", class_report_opt)

Optimized Model Accuracy: 0.9647579560735097
Optimized Model ROC AUC: 0.9557740913367483
Optimized Model Confusion Matrix:
 [[16452    53]
 [  576   767]]
Optimized Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     16505
           1       0.94      0.57      0.71      1343

    accuracy                           0.96     17848
   macro avg       0.95      0.78      0.85     17848
weighted avg       0.96      0.96      0.96     17848



#### Model Complexity and Kernel Choice

In [5]:
'''
# Create a pipeline that scales data then applies SVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42))
])

param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__degree': [2, 3, 4],  # Only used for 'poly' kernel
    'svc__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)

# Evaluate the best model found by GridSearchCV
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))


Best Parameters: {'svc__C': 0.1, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Best ROC AUC Score: 0.9594258954649781
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     16505
           1       0.94      0.57      0.71      1343

    accuracy                           0.96     17848
   macro avg       0.95      0.78      0.85     17848
weighted avg       0.96      0.96      0.96     17848
'''

'\n# Create a pipeline that scales data then applies SVC\npipeline = Pipeline([\n    (\'scaler\', StandardScaler()),\n    (\'svc\', SVC(probability=True, random_state=42))\n])\n\nparam_grid = {\n    \'svc__C\': [0.1, 1, 10, 100],\n    \'svc__kernel\': [\'linear\', \'rbf\', \'poly\'],\n    \'svc__degree\': [2, 3, 4],  # Only used for \'poly\' kernel\n    \'svc__gamma\': [\'scale\', \'auto\']\n}\n\ngrid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=\'roc_auc\', n_jobs=-1, verbose=2)\ngrid_search.fit(X_train, y_train)\n\nprint("Best Parameters:", grid_search.best_params_)\nprint("Best ROC AUC Score:", grid_search.best_score_)\n\n# Evaluate the best model found by GridSearchCV\nbest_model = grid_search.best_estimator_\ny_pred = best_model.predict(X_test)\ny_pred_prob = best_model.predict_proba(X_test)[:, 1]\nprint(classification_report(y_test, y_pred))\n\n\nBest Parameters: {\'svc__C\': 0.1, \'svc__degree\': 2, \'svc__gamma\': \'scale\', \'svc__kernel\': \'linear\'}\nBest ROC A

#### Handling Class Imbalance with SMOTE

In [6]:
'''from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Creating an imbalanced-learn pipeline
imb_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42))
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto']
}

# Using GridSearchCV for hyperparameter tuning with the imbalanced pipeline
grid_search_smote = GridSearchCV(imb_pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search_smote.fit(X_train, y_train)

print("Best Parameters with SMOTE:", grid_search_smote.best_params_)
print("Best ROC AUC Score with SMOTE:", grid_search_smote.best_score_)

'''
'''
Results:
Best Parameters with SMOTE: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Best ROC AUC Score with SMOTE: 0.9575152547834718

'''

"\nResults:\nBest Parameters with SMOTE: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}\nBest ROC AUC Score with SMOTE: 0.9575152547834718\n\n"

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Hardcoded best parameters obtained from the GridSearchCV with SMOTE
hardcoded_best_params_smote = {
    'C': 10,  # Adjusted from 'svc__C': 10 for direct use in SVC
    'gamma': 'scale',
    'kernel': 'linear'
}

# Applying StandardScaler and SVC with the best parameters in a pipeline
pipeline_smote_optimized = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(**hardcoded_best_params_smote, probability=True, random_state=42))
])

# Train the model using the training set
pipeline_smote_optimized.fit(X_train, y_train)

# Making predictions on the test set
y_pred_smote_optimized = pipeline_smote_optimized.predict(X_test)
y_pred_prob_smote_optimized = pipeline_smote_optimized.predict_proba(X_test)[:, 1]

# Compute and print the evaluation metrics
accuracy_smote_optimized = accuracy_score(y_test, y_pred_smote_optimized)
roc_auc_smote_optimized = roc_auc_score(y_test, y_pred_prob_smote_optimized)
conf_matrix_smote_optimized = confusion_matrix(y_test, y_pred_smote_optimized)
class_report_smote_optimized = classification_report(y_test, y_pred_smote_optimized)

print(f"Optimized Model with SMOTE Accuracy: {accuracy_smote_optimized}")
print(f"Optimized Model with SMOTE ROC AUC: {roc_auc_smote_optimized}")
print("Optimized Model with SMOTE Confusion Matrix:\n", conf_matrix_smote_optimized)
print("Optimized Model with SMOTE Classification Report:\n", class_report_smote_optimized)


Optimized Model with SMOTE Accuracy: 0.9646458987001345
Optimized Model with SMOTE ROC AUC: 0.9557203609186323
Optimized Model with SMOTE Confusion Matrix:
 [[16450    55]
 [  576   767]]
Optimized Model with SMOTE Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     16505
           1       0.93      0.57      0.71      1343

    accuracy                           0.96     17848
   macro avg       0.95      0.78      0.84     17848
weighted avg       0.96      0.96      0.96     17848



#### Advanced oversampling techniques

In [9]:
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable


In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Creating a combined sampling strategy pipeline
resampling_pipeline = ImbPipeline([
    ('smote_tomek', SMOTETomek(smote=SMOTE(random_state=42), tomek=TomekLinks(sampling_strategy='majority'), random_state=42)),
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42))
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto']
}

# Using GridSearchCV for hyperparameter tuning with the combined sampling strategy pipeline
grid_search_comb = GridSearchCV(resampling_pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search_comb.fit(X_train, y_train)

print("Best Parameters with SMOTE-Tomek:", grid_search_comb.best_params_)
print("Best ROC AUC Score with SMOTE-Tomek:", grid_search_comb.best_score_)

# Evaluating the model with the best parameters on the test set
best_pipeline_comb = grid_search_comb.best_estimator_
y_pred_comb = best_pipeline_comb.predict(X_test)
y_pred_prob_comb = best_pipeline_comb.predict_proba(X_test)[:, 1]

# Compute and print the evaluation metrics
accuracy_comb = accuracy_score(y_test, y_pred_comb)
roc_auc_comb = roc_auc_score(y_test, y_pred_prob_comb)
conf_matrix_comb = confusion_matrix(y_test, y_pred_comb)
class_report_comb = classification_report(y_test, y_pred_comb)

print(f"Model with SMOTE-Tomek Accuracy: {accuracy_comb}")
print(f"Model with SMOTE-Tomek ROC AUC: {roc_auc_comb}")
print("Model with SMOTE-Tomek Confusion Matrix:\n", conf_matrix_comb)
print("Model with SMOTE-Tomek Classification Report:\n", class_report_comb)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ......svc__C=1, svc__gamma=auto, svc__kernel=linear; total time=28.1min
[CV] END .....svc__C=1, svc__gamma=scale, svc__kernel=linear; total time=28.1min
[CV] END ......svc__C=1, svc__gamma=auto, svc__kernel=linear; total time=28.1min
[CV] END .....svc__C=1, svc__gamma=scale, svc__kernel=linear; total time=28.6min
[CV] END .....svc__C=1, svc__gamma=scale, svc__kernel=linear; total time=28.7min
[CV] END .....svc__C=1, svc__gamma=scale, svc__kernel=linear; total time=28.8min
[CV] END ......svc__C=1, svc__gamma=auto, svc__kernel=linear; total time=29.1min
[CV] END ......svc__C=1, svc__gamma=auto, svc__kernel=linear; total time=29.4min
[CV] END .......svc__C=10, svc__gamma=scale, svc__kernel=rbf; total time=29.9min
[CV] END .........svc__C=1, svc__gamma=auto, svc__kernel=rbf; total time=31.3min
[CV] END .........svc__C=1, svc__gamma=auto, svc__kernel=rbf; total time=31.4min
[CV] END .......svc__C=10, svc__gamma=scale, svc

#### Stratified K-Fold Cross-Validation

In [11]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

# Define a Stratified K-Fold Cross-Validator
stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the SVM model within a pipeline, including scaling
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(**hardcoded_best_params_smote, probability=True, random_state=42))
])

# Compute the cross-validated ROC AUC score
roc_auc_scores = cross_val_score(svm_pipeline, X, y, cv=stratified_k_fold, scoring='roc_auc', n_jobs=-1)

print(f"Cross-Validated ROC AUC Scores: {roc_auc_scores}")
print(f"Mean ROC AUC Score: {np.mean(roc_auc_scores)}")

Cross-Validated ROC AUC Scores: [0.96146557 0.95723615 0.95943222 0.95928449 0.95651649]
Mean ROC AUC Score: 0.9587869849348654
