# Step 1: Import necessary libraries

In [20]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from scipy.stats import uniform, randint
import matplotlib.pyplot as plt
import seaborn as sns


# Step 2: Load the dataset

In [2]:
data = pd.read_csv("dataset/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data['Outcome'].value_counts(normalize=True)

Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64

In [6]:
X = data.drop(columns=['Outcome'])
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [7]:
# Check the shape of the data
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

Feature matrix shape: (768, 8)
Target vector shape: (768,)


In [11]:
# Standardizing the features
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Train the SVM model

In [12]:
# Create an SVM model (Support Vector Classifier)
model = SVC()

In [23]:
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)


# check accuracy
f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)

print(f"Train f1 Score: {f1_train:.4f}")
print(f"Test f1 Score: {f1_test:.4f}")

Train f1 Score: 0.6169
Test f1 Score: 0.5496


# Step 4: Evaluation of the Model

In [24]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))

# Accuracy
print(f"F1 Score: {f1_score(y_test, y_pred_test):.4f}")

Confusion Matrix:
[[136  14]
 [ 45  36]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.91      0.82       150
           1       0.72      0.44      0.55        81

    accuracy                           0.74       231
   macro avg       0.74      0.68      0.69       231
weighted avg       0.74      0.74      0.73       231

F1 Score: 0.5496


# Step 5: HyperPramatere Tunning

## Random Search

In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'C': uniform(0.001, 100),           # Regularization parameter (C)
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel type
    'gamma': ['scale', 'auto', uniform(0.001, 1)],  # Kernel coefficient
    'degree': randint(2, 5),            # Degree of the polynomial kernel
    'coef0': uniform(0, 10),            # Independent term in kernel function (for poly/sigmoid kernels)
    'class_weight': [None, 'balanced'],  # Class weight (useful for imbalanced datasets)
    'shrinking': [True, False],         # Whether to use the shrinking heuristic
}

In [26]:
# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_dist,
                                   n_iter=50,       # Number of random combinations to try
                                   cv=5,            # 5-fold cross-validation
                                   random_state=42,
                                   n_jobs=-1)       # Use all available cores for parallel computation

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Use the best parameters from RandomizedSearchCV as the starting point for GridSearchCV
best_params = random_search.best_params_
best_params

In [None]:
# Create svc model
model = SVC(**best_params)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)


# check accuracy
f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)

print(f"Train f1 Score: {f1_train:.4f}")
print(f"Test f1 Score: {f1_test:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV



# Define a finer grid around the best hyperparameters found by RandomizedSearchCV
param_grid = {
    'C': np.logspace(np.log10(best_params['C'] / 10), np.log10(best_params['C'] * 10), 10),
    'gamma': ['scale', 'auto', uniform(0.001, 1)],
    'kernel': [best_params['kernel']],  # Keep kernel fixed for fine-tuning
    'degree': [best_params['degree']],  # Keep degree fixed
    'coef0': [best_params['coef0'], 0, 5, 10],  # Test different coef0 values
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,             # 5-fold cross-validation
                           n_jobs=-1)        # Use all available cores for parallel computation

# Fit GridSearchCV
grid_search.fit(X_train, y_train)


In [None]:
# Use the best parameters from GridSearchCV
best_params = grid_search.best_params_
best_params

In [None]:
# Create svc model
model = SVC(**best_params)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)


# check accuracy
f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)

print(f"Train f1 Score: {f1_train:.4f}")
print(f"Test f1 Score: {f1_test:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")