In [None]:
#dataset from kagglehub
import kagglehub

# Fetch the latest published copy of the dataset through kagglehub; `path` holds
# whatever location kagglehub returns for the downloaded files.
dataset_handle = "uciml/pima-indians-diabetes-database"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
#imports
import pandas as pd
import numpy as np

#dataset loading
# Try the kagglehub download first; if that file cannot be opened, fall back to a
# local copy. Only file-access errors (OSError, which includes FileNotFoundError)
# trigger the fallback, so parsing errors or a keyboard interrupt are no longer
# silently swallowed by a bare `except:`.
buffer = path + '/diabetes.csv' #Honestly might as well just attach a local copy of the dataset
try:
   initial_diabetes = pd.read_csv(buffer)
except OSError as load_error:
   print(f"Could not read {buffer} ({load_error}); falling back to the local copy.")
   initial_diabetes = pd.read_csv('/diabetes.csv') #Which is what exactly this is for(local copy)

#What are the attributes
print("Dataset Shape:", initial_diabetes.shape)
initial_diabetes.head()

In [None]:
#Cleaning the data/Preprocessing
# 1. Drop rows with empty cells, then drop exact duplicate rows (each call returns a new DataFrame).
# 2. Drop rows whose SkinThickness or BloodPressure is 0.
#    NOTE(review): 0 is presumably a placeholder for a missing measurement - confirm.
# The filter is a single boolean mask instead of row-by-row `drop(..., inplace=True)`
# loops: same surviving rows, same order and index labels, no mutation during iteration.
deduplicated = initial_diabetes.dropna().drop_duplicates()
has_skin_thickness = deduplicated["SkinThickness"] != 0
has_blood_pressure = deduplicated["BloodPressure"] != 0
# .copy() makes `diabetes` an independent frame rather than a view-like slice.
diabetes = deduplicated[has_skin_thickness & has_blood_pressure].copy()

diabetes.tail(70)
# diabetes.count()
# # Count the number of positive and negative cases
# positive_cases = diabetes[diabetes['Outcome'] == 1].shape[0]
# negative_cases = diabetes[diabetes['Outcome'] == 0].shape[0]

# # Display the counts
# print(f"Number of Positive Cases (Outcome=1): {positive_cases}")
# print(f"Number of Negative Cases (Outcome=0): {negative_cases}")


In [None]:
#Visualize pairplots to see the distribution of attributes for disease outcome
import seaborn as sns
import matplotlib.pyplot as plt
# One scatter/density panel per pair of columns, coloured by the Outcome column.
pair_grid = sns.pairplot(data=diabetes, hue='Outcome')
plt.show()

In [None]:
#Box and whiskers
# One panel per column except the last one, laid out on a 4x2 grid; each panel
# shows that column's distribution split by Outcome.
print("\nBoxplots of features grouped by target:")
plt.figure(figsize=(15, 30))
feature_columns = diabetes.columns[:-1]
for panel_number, feature in enumerate(feature_columns, start=1):
    plt.subplot(4, 2, panel_number)
    sns.boxplot(data=diabetes, x='Outcome', y=feature)
    plt.title(f'Boxplot of {feature} by Outcome')
plt.tight_layout()
plt.show()


In [None]:
#Preparing x and y
target_column = 'Outcome'
y = diabetes[target_column] #Target
x = diabetes.drop(columns=[target_column]) #Attributes

#Train test split
from sklearn.model_selection import train_test_split
# 80/20 split; random_state=0 fixes the shuffle and stratify=y keeps the class
# proportions of y in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# Training split bundled as {'attributes': feature table, 'target': label column}
diabetes_train = dict(attributes=x_train, target=y_train)

# Test split bundled the same way
diabetes_test = dict(attributes=x_test, target=y_test)


In [None]:
#Grid Search and Bayesian Optimization(Implement both to determine which is better to detect sets of hyperparameters for this model)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
!pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical

def perform_grid_search(X_train, y_train, X_test, y_test):
    """
    Tune a DecisionTreeClassifier with an exhaustive grid search.

    Every combination in the grid is scored by 5-fold cross-validated accuracy on
    (X_train, y_train); the refit best estimator is then scored on (X_test, y_test).

    Returns a dict with keys:
        'best_params'   - the winning parameter combination
        'best_score'    - its mean cross-validation accuracy
        'test_accuracy' - accuracy of the best estimator on the test data
        'best_model'    - the refit best estimator
    """
    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=0),
        # Search space: every combination of these values is tried
        param_grid={
            'max_depth': [1, 5, 8, 13, 21, 34],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2', None]
        },
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    search.fit(X_train, y_train)

    tuned_tree = search.best_estimator_
    held_out_accuracy = accuracy_score(y_test, tuned_tree.predict(X_test))

    return {
        'best_params': search.best_params_,
        'best_score': search.best_score_,
        'test_accuracy': held_out_accuracy,
        'best_model': tuned_tree
    }

def perform_bayesian_optimization(X_train, y_train, X_test, y_test):
    """
    Tune a DecisionTreeClassifier with Bayesian optimization (skopt's BayesSearchCV).

    50 parameter settings are sampled from the search space and scored by 5-fold
    cross-validated accuracy on (X_train, y_train); the refit best estimator is then
    scored on (X_test, y_test).

    Returns a dict with keys:
        'best_params'   - the best parameter setting found
        'best_score'    - its mean cross-validation accuracy
        'test_accuracy' - accuracy of the best estimator on the test data
        'best_model'    - the refit best estimator
    """
    # Define search space (same ranges as the grid search, but continuous over the integers)
    search_space = {
        'max_depth': Integer(1, 34),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 4),
        'criterion': Categorical(['gini', 'entropy']),
        'max_features': Categorical(['sqrt', 'log2', None])
    }

    # Create Decision Tree classifier
    dtc = DecisionTreeClassifier(random_state=0)

    # Create BayesSearchCV object
    bayes_search = BayesSearchCV(
        estimator=dtc,
        search_spaces=search_space,
        n_iter=50,  # Number of parameter settings sampled
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
        # Seed the optimizer's sampling: without it each run can pick different
        # hyperparameters even though the tree and the data split are seeded.
        random_state=0
    )

    # Fit the Bayesian optimization
    bayes_search.fit(X_train, y_train)

    # Get best model and make predictions
    best_bayes_model = bayes_search.best_estimator_
    y_pred_bayes = best_bayes_model.predict(X_test)

    # Return results
    return {
        'best_params': bayes_search.best_params_,
        'best_score': bayes_search.best_score_,
        'test_accuracy': accuracy_score(y_test, y_pred_bayes),
        'best_model': best_bayes_model
    }

# Run both tuners on the same train/test split
grid_results = perform_grid_search(x_train, y_train, x_test, y_test)
bayes_results = perform_bayesian_optimization(x_train, y_train, x_test, y_test)

# Print the same three figures for each method, grid search first
for heading, results in (
    ("\nGrid Search Results:", grid_results),
    ("\nBayesian Optimization Results:", bayes_results),
):
    print(heading)
    print("Best parameters:", results['best_params'])
    print("Best cross-validation score:", results['best_score'])
    print("Test accuracy:", results['test_accuracy'])

In [None]:
# Side-by-side bar chart of the grid search and Bayesian optimization results.
def compare_tuning_methods(grid_results, bayes_results):
    """
    Draw a grouped bar chart comparing the two tuning methods.

    Each argument is a result dict with 'best_score' (cross-validation score)
    and 'test_accuracy'; one pair of bars is drawn per method.
    """
    methods = ['Grid Search', 'Bayesian Optimization']
    per_method = (grid_results, bayes_results)
    cv_scores = [result['best_score'] for result in per_method]
    test_scores = [result['test_accuracy'] for result in per_method]

    bar_width = 0.35
    positions = np.arange(len(methods))

    plt.figure(figsize=(12, 6))

    # Left bar of each pair: CV score; right bar: test accuracy
    plt.bar(positions - bar_width/2, cv_scores, bar_width, label='Cross-validation Score')
    plt.bar(positions + bar_width/2, test_scores, bar_width, label='Test Accuracy')

    plt.xlabel('Tuning Method')
    plt.ylabel('Accuracy')
    plt.title('Comparison of Hyperparameter Tuning Methods')
    plt.xticks(positions, methods)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize = 10)
    plt.tight_layout()
    plt.show()

compare_tuning_methods(grid_results, bayes_results)

# Decision Tree

In [None]:
#Training portion
# Fit a decision tree on the training split using the hyperparameters chosen by the
# Bayesian search; random_state=0 pins the tree's internal randomness.
tuned_params = bayes_results['best_params']
dtc = DecisionTreeClassifier(random_state=0, **tuned_params)
dtc.fit(x_train, y_train)

In [None]:
#Testing portion
# Predicted class for every row of the test attributes (x_test is the table stored
# under diabetes_test['attributes']).
predict = dtc.predict(x_test)
predict

In [None]:
# Pair each true test label with the corresponding prediction, one row per test
# sample; the second argument names the two resulting columns.
target_vs_predicted = list(zip(y_test, predict))
pd.DataFrame(target_vs_predicted, columns=['Target', 'Predicted'])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the tuned decision tree on the held-out test split.
predictions = dtc.predict(x_test)

# Evaluate the four metrics in a fixed order and unpack them into their names.
accuracy, precision, recall, f1 = (
    score_fn(y_test, predictions)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy: {accuracy:.3f}')
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

### Visualization of Results

In [None]:
#How does the decision tree look like
from sklearn.tree import plot_tree
# plot_tree scales the drawing to the axes, so the figure size (or dpi) controls how
# large the rendered tree is.
plt.figure(figsize=(100, 100))
# filled=True paints each node to indicate its majority class; class names are left
# out. The return value is kept in `tree` so the cell does not echo it.
tree = plot_tree(
    dtc,
    feature_names=list(x.columns),
    filled=True,
    rounded=True,
)

In [None]:
#Define Colormap for Visualization
from matplotlib import cm
from matplotlib.colors import ListedColormap
# Use pyplot.get_cmap instead of matplotlib.cm.get_cmap: the latter was deprecated in
# matplotlib 3.7 and removed in 3.9. It also keeps this cell working after later
# cells rebind the name `cm` to a confusion matrix.
colormap = plt.get_cmap('tab20')
# Split tab20's colour list by position: even entries for data points, odd entries
# for the decision-region background.
cm_dark = ListedColormap(colormap.colors[::2])
cm_light = ListedColormap(colormap.colors[1::2])

#Initialize Accuracy Storage Variables
all_acc = []
all_acc_cols = []

#Generate Feature Combinations
from itertools import combinations

att_cols = diabetes_train['attributes'].columns
# Every unordered pair of distinct attribute columns, as [first, second] in column
# order. combinations() yields the same pairs in the same order as the previous nested
# loops, without the `is` identity test on strings or the repeated list-membership scans.
all_comb = [[horiz, vert] for horiz, vert in combinations(att_cols, 2)]

In [None]:
import numpy as np
# For every pair of attributes: fit a tree on just those two columns, draw the tree
# (left panel) and its decision regions with train/test points (right panel), and
# record the train/test accuracy for the summary table.
max_depth = None
dtc = DecisionTreeClassifier(random_state=100, max_depth = max_depth)
all_acc = []  # start from an empty table so re-running this cell does not append duplicate rows
#Iterate through all feature pairs
for i, [h,v] in enumerate(all_comb):
  fig, ax = plt.subplots(1, 2, figsize = [40,20])

  dtc.fit(diabetes_train['attributes'][[h,v]], diabetes_train['target'])
  plot_tree(dtc, feature_names = [h, v],
            ax=ax[0], filled=True, rounded=True)

  # Plot window: training range of each attribute padded by 10% on both sides
  x_min = diabetes_train['attributes'][h].min()
  x_max = diabetes_train['attributes'][h].max()
  x_range = x_max - x_min
  x_min = x_min - 0.1 * x_range
  x_max = x_max + 0.1 * x_range
  y_min = diabetes_train['attributes'][v].min()
  y_max = diabetes_train['attributes'][v].max()
  y_range = y_max - y_min
  y_min = y_min - 0.1 * y_range
  y_max = y_max + 0.1 * y_range
  # Each axis is stepped by 1% of its OWN range (the x axis previously used y_range,
  # which made the grid far too fine or far too coarse when the two scales differ).
  xx, yy = np.meshgrid(np.arange(x_min, x_max, .01*x_range), np.arange(y_min,y_max, .01*y_range))
  # Predict on a frame carrying the fitted column names so sklearn does not warn
  # about missing feature names.
  grid_points = pd.DataFrame({h: xx.ravel(), v: yy.ravel()})
  z = dtc.predict(grid_points)
  z = z.reshape(xx.shape)

  plt.sca(ax[1])
  plt.pcolormesh(xx,yy,z,cmap=cm_light)

  # Note: this changes the global font size and stays in effect for later figures.
  plt.rcParams.update({'font.size': 30})
  ax[1].scatter(diabetes_train['attributes'][h], diabetes_train['attributes'][v],
                c=diabetes_train['target'], cmap=cm_dark, s=200,
                label='Training data', edgecolor='black', linewidth=1)
  ax[1].scatter(diabetes_test['attributes'][h], diabetes_test['attributes'][v],
                c=diabetes_test['target'], cmap=cm_dark, s=200,
                label = 'Testing data', edgecolor='black', linewidth=1, marker='*')
  train_acc = dtc.score(diabetes_train['attributes'][[h,v]], diabetes_train['target'])
  test_acc = dtc.score(diabetes_test['attributes'][[h,v]], diabetes_test['target'])
  ax[1].set_title(f'training:{train_acc:.3f}, testing:{test_acc:.3f}')
  ax[1].set_xlabel(h)
  ax[1].set_ylabel(v)
  ax[1].legend()

  # Record the pair index i (previously the literal 1 was stored for every row)
  all_acc.append([i, h, v, max_depth, train_acc, test_acc])

  # Show and release each large figure immediately instead of keeping all of them open
  plt.show()
  plt.close(fig)

all_acc_cols = ['i', 'attribute 1', 'attribute 2', 'max depth 1', 'training accuracy 1', 'testing accuracy 1']





In [None]:
import numpy as np
# Same per-pair visualisation as the unlimited-depth run, but with the tree capped at
# depth 3; the accuracies are added as three extra columns on each existing row of all_acc.
max_depth = 3
dtc = DecisionTreeClassifier(random_state=100, max_depth = max_depth)
for i, [h,v] in enumerate(all_comb):
  fig, ax = plt.subplots(1, 2, figsize = [40,20])

  dtc.fit(diabetes_train['attributes'][[h,v]], diabetes_train['target'])
  plot_tree(dtc, feature_names = [h, v],
            ax=ax[0], filled=True, rounded=True)

  # Plot window: training range of each attribute padded by 10% on both sides
  x_min = diabetes_train['attributes'][h].min()
  x_max = diabetes_train['attributes'][h].max()
  x_range = x_max - x_min
  x_min = x_min - 0.1 * x_range
  x_max = x_max + 0.1 * x_range
  y_min = diabetes_train['attributes'][v].min()
  y_max = diabetes_train['attributes'][v].max()
  y_range = y_max - y_min
  y_min = y_min - 0.1 * y_range
  y_max = y_max + 0.1 * y_range
  # Each axis is stepped by 1% of its OWN range (the x axis previously used y_range,
  # which made the grid far too fine or far too coarse when the two scales differ).
  xx, yy = np.meshgrid(np.arange(x_min, x_max, .01*x_range), np.arange(y_min,y_max, .01*y_range))
  # Predict on a frame carrying the fitted column names so sklearn does not warn
  # about missing feature names.
  grid_points = pd.DataFrame({h: xx.ravel(), v: yy.ravel()})
  z = dtc.predict(grid_points)
  z = z.reshape(xx.shape)

  plt.sca(ax[1])
  plt.pcolormesh(xx,yy,z,cmap=cm_light)

  # Note: this changes the global font size and stays in effect for later figures.
  plt.rcParams.update({'font.size': 30})
  ax[1].scatter(diabetes_train['attributes'][h], diabetes_train['attributes'][v],
                c=diabetes_train['target'], cmap=cm_dark, s=200,
                label='Training data', edgecolor='black', linewidth=1)
  ax[1].scatter(diabetes_test['attributes'][h], diabetes_test['attributes'][v],
                c=diabetes_test['target'], cmap=cm_dark, s=200,
                label = 'Testing data', edgecolor='black', linewidth=1, marker='*')
  train_acc = dtc.score(diabetes_train['attributes'][[h,v]], diabetes_train['target'])
  test_acc = dtc.score(diabetes_test['attributes'][[h,v]], diabetes_test['target'])
  ax[1].set_title(f'training:{train_acc:.3f}, testing:{test_acc:.3f}')
  ax[1].set_xlabel(h)
  ax[1].set_ylabel(v)
  ax[1].legend()

  # Keep only the six first-run columns before adding this run's three, so re-running
  # the cell replaces the depth-3 figures instead of making the row ever longer.
  all_acc[i] = all_acc[i][:6] + [max_depth, train_acc, test_acc]

  # Show and release each large figure immediately instead of keeping all of them open
  plt.show()
  plt.close(fig)

# Same idea for the column labels: first-run labels plus this run's labels, re-run safe
all_acc_cols = all_acc_cols[:6] + ['max depth 2', 'training accuracy 2', 'testing accuracy 2']





In [None]:
import matplotlib.pyplot as plt
# Train one decision tree per max_depth on all attributes and compare its accuracy on
# the training and test splits.
max_depths = [1,5,8,13,21,34]
training_accuracy =[]
testing_accuracy = []
for md in max_depths:
  # random_state pins the tree's internal randomness so the curve is reproducible
  dtc = DecisionTreeClassifier(max_depth=md, random_state=0)
  dtc.fit(x_train, y_train)
  train = dtc.score(x_train, y_train)
  test = dtc.score(x_test, y_test)
  training_accuracy.append(train)
  testing_accuracy.append(test)

plt.plot(max_depths, training_accuracy, label='training accuracy')
plt.plot(max_depths, testing_accuracy, label='testing accuracy')
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.title('Training and Testing Accuracy for Decision Tree')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize = 10)
# tight_layout has to run after the artists exist (it used to run before anything was
# drawn), otherwise it cannot make room for the legend outside the axes.
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve and AUC on the test split for one decision tree per max_depth.
max_depths = [1,5,8,13,21,34]
plt.figure(figsize=(10, 8))  # Set figure size

# Loop through each max_depth
for md in max_depths:
    # random_state pins the tree's internal randomness so the curves are reproducible
    dtc = DecisionTreeClassifier(max_depth=md, random_state=0)
    dtc.fit(x_train, y_train)

    # Predict probabilities for the test set
    y_prob = dtc.predict_proba(x_test)[:, 1]  # second column = probability of the positive class

    # Compute the ROC curve and AUC
    # pos_label=None: for labels in {0, 1} (or {-1, 1}) the positive class defaults to 1
    fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=None)
    roc_auc = auc(fpr, tpr) #Area under curve

    # Plot the ROC curve (x: false positive rate, y: true positive rate)
    plt.plot(fpr, tpr, label=f'max_depth={md}, AUC={roc_auc:.2f}')

    print(f"max_depth={md}, AUC={roc_auc:.4f}")

# Plot diagonal line for reference
plt.plot([0, 1], [0, 1], 'k--', label='Reference')

# Add labels, legend, and title
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Different max_depths')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize = 10)
plt.grid()
# tight_layout must run before show(); calling it afterwards (as before) no longer
# affects the figure that was displayed.
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the tuned decision tree's test predictions (rows: actual class,
# columns: predicted class). Stored as `conf_matrix` rather than `cm`, because `cm` is
# also the name of the matplotlib colormap module imported earlier in the notebook and
# rebinding it breaks re-running the colormap cell.
class_labels = ['No Diabetes', 'Diabetes']
conf_matrix = confusion_matrix(y_test, predictions)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Summary table of the per-pair trees: one row per attribute pair, with the first
# run's depth/accuracies followed by the second run's. Column labels are written
# without the stray leading spaces some of them used to carry, so every column can
# be looked up by its plain name.
acc_table = pd.DataFrame(all_acc, columns=['i', 'attribute 1', 'attribute 2', 'max depth 1', 'train accuracy 1', 'test accuracy 1',
                                           'max depth 2', 'train accuracy 2', 'test accuracy 2'])
acc_table

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest: 100 trees, fixed seed. fit() returns the estimator itself,
# so construction and training are chained; test predictions are kept in rf_pred.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
rf_pred = rf_classifier.predict(x_test)

In [None]:
import pandas as pd
import seaborn as sns

# Pair each training column with the forest's importance score for it, highest first
feature_importances = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': rf_classifier.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importances, x='Importance', y='Feature', palette='Blues_d')
plt.title('Feature Importance in Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

## Results

In [None]:
import matplotlib.pyplot as plt

# Train one random forest per max_depth and compare training vs. testing accuracy.

# Define max_depth values to test
max_depths = [1, 5, 8, 13, 21, 34]

# Lists to store accuracies
training_accuracy = []
testing_accuracy = []

# Loop through each max_depth
for md in max_depths:
    rf = RandomForestClassifier(max_depth=md, n_estimators=100, random_state=0)  # Initialize Random Forest
    rf.fit(x_train, y_train)  # Fit the model

    # Calculate training and testing accuracies
    train = rf.score(x_train, y_train)
    test = rf.score(x_test, y_test)
    training_accuracy.append(train)
    testing_accuracy.append(test)

# Plot the accuracies
plt.plot(max_depths, training_accuracy, label='Training Accuracy')
plt.plot(max_depths, testing_accuracy, label='Testing Accuracy')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
plt.title('Training and Testing Accuracy for Random Forest')
plt.grid()
# tight_layout has to run after the artists exist (it used to run before anything was
# drawn), otherwise it cannot make room for the legend outside the axes.
plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve and AUC on the test split for one random forest per max_depth.
max_depths = [1, 5, 8, 13, 21, 34]
plt.figure(figsize=(10, 8))

for md in max_depths:
    # 100-tree forest capped at this depth, trained on the full attribute set
    rf = RandomForestClassifier(max_depth=md, n_estimators=100, random_state=0).fit(x_train, y_train)

    # Second probability column = score for the positive class
    y_prob = rf.predict_proba(x_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f'max_depth={md}, AUC={roc_auc:.2f}')
    print(f"max_depth={md}, AUC={roc_auc:.4f}")  # Log AUC for each depth

# Dashed diagonal as the reference line
plt.plot([0, 1], [0, 1], 'k--', label='Reference')

# Axis labels, title, legend placed to the right of the axes
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Random Forest with Different max_depths')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the baseline random forest's test predictions: evaluate the four metrics in a
# fixed order and unpack them into their names.
accuracy, precision, recall, f1 = (
    score_fn(y_test, rf_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy: {accuracy:.3f}')
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the random forest's test predictions (rows: actual class,
# columns: predicted class). Stored as `conf_matrix` rather than `cm`, because `cm` is
# also the name of the matplotlib colormap module imported earlier in the notebook and
# rebinding it breaks re-running the colormap cell.
class_labels = ['No Diabetes', 'Diabetes']
conf_matrix = confusion_matrix(y_test, rf_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline SVM: RBF kernel with probability estimates enabled and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained;
# test predictions are kept in svm_pred.
svm = SVC(kernel='rbf', probability=True, random_state=0).fit(x_train, y_train)
svm_pred = svm.predict(x_test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC

# Train SVM on two features (for 2D visualization)
x_train_2d = x_train.iloc[:, :2]  # Using only the first two features for simplicity
svm = SVC(kernel='linear', C=1)
svm.fit(x_train_2d, y_train)

# Create a mesh grid
# Use a fixed number of points per axis. A fixed 0.01 step over the raw feature ranges
# makes the grid size depend on the feature scale, so wide-ranged columns produce an
# enormous number of points to predict on.
GRID_POINTS_PER_AXIS = 300
x_min, x_max = x_train_2d.iloc[:, 0].min() - 1, x_train_2d.iloc[:, 0].max() + 1
y_min, y_max = x_train_2d.iloc[:, 1].min() - 1, x_train_2d.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, GRID_POINTS_PER_AXIS),
                     np.linspace(y_min, y_max, GRID_POINTS_PER_AXIS))

# Predict on grid points (as a frame with the fitted column names, so sklearn does not
# warn about missing feature names)
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=x_train_2d.columns)
Z = svm.predict(grid_points)
Z = Z.reshape(xx.shape)

# Plot the decision boundary
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.Paired)
plt.scatter(x_train_2d.iloc[:, 0], x_train_2d.iloc[:, 1], c=y_train, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel(x_train_2d.columns[0])
plt.ylabel(x_train_2d.columns[1])
plt.title("SVM Decision Boundary")
plt.show()

## Results

In [None]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC

# Train one linear-kernel SVM per regularization strength C and compare training vs.
# testing accuracy.
C_values = [0.01, 0.1, 1, 10, 100]

training_accuracy = []
testing_accuracy = []

for c in C_values:
    # Linear kernel, fixed seed; fit() returns the estimator itself
    svm = SVC(C=c, kernel='linear', random_state=0).fit(x_train, y_train)

    training_accuracy.append(svm.score(x_train, y_train))
    testing_accuracy.append(svm.score(x_test, y_test))

# Accuracy against C, with C on a logarithmic axis
plt.figure(figsize=(10, 6))
for accuracies, marker_style, series_label in (
    (training_accuracy, 'o', 'Training Accuracy'),
    (testing_accuracy, 's', 'Testing Accuracy'),
):
    plt.plot(C_values, accuracies, marker=marker_style, label=series_label)
plt.xscale('log')
plt.xlabel('C (Regularization Parameter)')
plt.ylabel('Accuracy')
plt.title('Training and Testing Accuracy for SVM with Different C Values')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve and AUC on the test split for one SVM per kernel type (C fixed at 1).
kernels = ['linear', 'rbf', 'poly']
plt.figure(figsize=(10, 8))

for kernel in kernels:
    # probability=True is required for predict_proba; fit() returns the estimator itself
    svm = SVC(kernel=kernel, C=1, probability=True, random_state=0).fit(x_train, y_train)

    # Second probability column = score for the positive class
    y_prob = svm.predict_proba(x_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f'kernel={kernel}, AUC={roc_auc:.2f}')
    print(f"kernel={kernel}, AUC={roc_auc:.4f}")  # Log AUC for each kernel

# Dashed diagonal as the reference line
plt.plot([0, 1], [0, 1], 'k--', label='Reference')

# Axis labels, title, legend placed to the right of the axes
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for SVM with Different Kernels')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the stored SVM test predictions (svm_pred): evaluate the four metrics in a
# fixed order and unpack them into their names.
accuracy, precision, recall, f1 = (
    score_fn(y_test, svm_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy: {accuracy:.3f}')
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the stored SVM test predictions (rows: actual class, columns:
# predicted class). Stored as `conf_matrix` rather than `cm`, because `cm` is also the
# name of the matplotlib colormap module imported earlier in the notebook and
# rebinding it breaks re-running the colormap cell.
class_labels = ['No Diabetes', 'Diabetes']
conf_matrix = confusion_matrix(y_test, svm_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()