In [None]:
#Preprocessing and Exploration
# NumPy for numerical operations used during preprocessing
import numpy as np

# Pandas for loading the diabetes dataset and preparing it for modeling
import pandas as pd

# Decision Tree classifier, one of the models used to predict diabetes classes
from sklearn.tree import DecisionTreeClassifier

# Matplotlib Pyplot for plotting distributions and model visualizations
import matplotlib.pyplot as plt

# Metrics used to evaluate how well each model predicts diabetes categories
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# plot_tree for displaying the structure of the trained decision tree model
from sklearn.tree import plot_tree

# Tools for splitting the dataset into training/testing sets and optimizing model performance
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# KNN classifier, another model used to classify patient diabetes status based on similarity
from sklearn.neighbors import KNeighborsClassifier

# Pipeline to connect preprocessing steps (scaling) with the model in one workflow
from sklearn.pipeline import Pipeline

# Scalers used to normalize or standardize medical measurements before training the models
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# SVM classifier, one of the main models selected for multi-class diabetes prediction
from sklearn.svm import SVC

# Seaborn for creating advanced visualizations during the exploratory data analysis phase
import seaborn as sns


In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Read the diabetes CSV from the mounted Drive folder and preview its first five rows
DATASET_PATH = '/content/drive/MyDrive/Dataset of Diabetes .csv'
dataset = pd.read_csv(DATASET_PATH)
dataset.head(n=5)

# Exploratory Data Analysis

In [None]:
# Print a structural summary of the DataFrame: each column's name, dtype and non-null count
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
numeric_summary = dataset.describe()
numeric_summary

In [None]:
# Number of distinct values found in every column
unique_counts = dataset.nunique()
unique_counts

In [None]:
# Per-column count of missing (NaN) entries
missing_per_column = dataset.isna().sum()
missing_per_column

In [None]:
# Tally the rows under each raw CLASS label to check the target variable for imbalance
print('\nClass distribution:')
class_counts = dataset['CLASS'].value_counts()
print(class_counts)

# Observation: the same class label shows up more than once in the counts above,
# which points to inconsistent formatting of the raw CLASS values

In [None]:
# Normalize the target labels: cast to string, trim surrounding whitespace, upper-case
class_as_text = dataset['CLASS'].astype(str)
dataset['CLASS'] = class_as_text.str.strip().str.upper()

In [None]:
# Show the CLASS distribution again, now that the labels have been normalized
print('\nClass distribution:')
cleaned_class_counts = dataset['CLASS'].value_counts()
print(cleaned_class_counts)

In [None]:
# Display the distribution of the raw Gender values to spot inconsistent labels
# (the printed header previously read "Class distribution", which mislabeled this output)
print('\nGender distribution:')
print(dataset['Gender'].value_counts())

In [None]:
# Normalize the Gender values (a feature column, not the target):
# cast to string, trim surrounding whitespace, upper-case
gender_as_text = dataset['Gender'].astype(str)
dataset['Gender'] = gender_as_text.str.strip().str.upper()

In [None]:
# Gender was already normalized in the preceding cell, so rather than repeating the
# same cleanup, confirm its effect by displaying the Gender distribution again
print('\nGender distribution:')
print(dataset['Gender'].value_counts())

In [None]:
# Histogram of the AGE column (20 bins) with a kernel-density curve overlaid
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(dataset['AGE'], bins=20, kde=True, color='orange', ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Number of records per diabetes class, split by gender.
# Title and axis labels are set so the figure can be read on its own.
ax = sns.countplot(x='CLASS', hue='Gender', data=dataset)
ax.set_title('Class Counts by Gender')
ax.set_xlabel('Class')
ax.set_ylabel('Count')

plt.show()

In [None]:
# Pairwise scatter plots / histograms of the numeric columns.
# ID and No_Pation are record identifiers (they are dropped from the features later),
# so they are excluded here to keep the grid smaller and faster to render.
sns.pairplot(dataset.drop(columns=['ID', 'No_Pation']))
plt.show()  # render the figure instead of echoing the PairGrid repr

In [None]:
# Peek at the first two rows
first_two_rows = dataset.head(n=2)
first_two_rows

In [None]:
# Identify all categorical (object-type) columns in the dataset for encoding
cat_columns = dataset.select_dtypes(['object']).columns

# Convert each categorical column into integer codes with pd.factorize.
# factorize numbers the labels in order of first appearance, so the code -> label
# lookup is kept (and printed) to let encoded values be mapped back to their names.
# Note: once encoded, re-running this cell finds no object columns and leaves the
# lookup empty; re-run from the data-loading cell to rebuild it.
category_labels = {}
for col in cat_columns:
    codes, uniques = pd.factorize(dataset[col])
    dataset[col] = codes
    category_labels[col] = list(uniques)
    print(f"{col}: {dict(enumerate(uniques))}")

# Display the first two rows to confirm that categorical features were successfully encoded
dataset.head(2)

In [None]:
# Columns kept out of the feature matrix: the target (CLASS) and the
# identifier columns (ID, No_Pation)
non_feature_columns = ['CLASS', 'ID', 'No_Pation']

# Feature matrix: every remaining column
X = dataset.drop(columns=non_feature_columns)

# Target vector: the encoded diabetes class
y = dataset['CLASS']

# Preview the first rows of the feature matrix
X.head()

In [None]:
# Standardize every feature column and wrap the result back into a DataFrame
# with the original column names.
# NOTE(review): the scaler is fit on all rows of X, before any train/test split, and
# X_scaled is not referenced by the other cells visible in this notebook (the SVM
# pipeline applies its own StandardScaler). Treat X_scaled as a preview only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

# Preview the first two standardized rows
X_scaled.head(n=2)

In [None]:
# First split: hold out 20% of the rows as the test set and keep 80% for
# training+validation, stratified on y so class proportions are preserved
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: take 25% of the remaining 80% as validation (20% of all rows),
# leaving 60% of all rows for training; stratified again
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# Sanity check: the three subsets must partition the full dataset
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Preview the combined training+validation features (head only, not the whole frame)
X_train_val.head()

In [None]:
# Using the combined train+val set since GridSearchCV handles its own validation internally
X_grid = X_train_val
y_grid = y_train_val

# Pipeline: standardize the features, then fit an SVM. Because scaling lives inside
# the pipeline, it is re-fit on the training part of every CV fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Apply feature scaling
    ('svm', SVC())                   # SVM model
])

# Parameter options that will be tested during the grid search.
# gamma is ignored by the linear kernel, so it is only searched together with the
# RBF kernel; a single crossed grid would fit every linear candidate twice.
param_grid = [
    {
        'svm__kernel': ['linear'],
        'svm__C': [0.1, 1, 10],           # Regularization values to try
    },
    {
        'svm__kernel': ['rbf'],
        'svm__C': [0.1, 1, 10],           # Regularization values to try
        'svm__gamma': ['scale', 'auto'],  # Gamma settings for RBF kernel
    },
]

# Perform the grid search with 5-fold CV to find the best setup
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit the search on the training+validation portion
grid_search.fit(X_grid, y_grid)

# Display the best settings found and their CV accuracy
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Test the optimized model (already refit on all of X_grid) on the separate test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Show test accuracy plus detailed class results
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (rows: true classes, columns: predicted classes)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
# Draw the confusion matrix held in cm as an annotated heatmap
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    linewidths=0.5,
    linecolor='black',
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')

# Render the figure
plt.show()


In [None]:
# Baseline decision tree: entropy criterion, with a shallow depth and a minimum
# leaf size chosen to limit overfitting
tree_settings = dict(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=42,
)
clf = DecisionTreeClassifier(**tree_settings)

# Train on the training subset only
clf.fit(X_train, y_train)

In [None]:
# Score the baseline decision tree on the held-out test split
y_pred = clf.predict(X_test)

# Accuracy, confusion matrix and per-class precision/recall/F1
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

In [None]:
# Draw the confusion matrix held in cm as an annotated heatmap
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Reds',
    linewidths=0.5,
    linecolor='black',
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')

# Render the figure
plt.show()

In [None]:
# Report the Train / Validation / Test split sizes

# The split was done in an earlier cell (test_size=0.2, then test_size=0.25 of the
# remainder), i.e. roughly 60% train, 20% validation, 20% test.
# Here we only display the resulting sizes.

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")
print(f"Testing set size: {len(X_test)} samples")


# Tune Hyperparameters to Reduce Overfitting

# Overfitting occurs when the model learns the training data too well, including its noise.
# Use GridSearchCV to try different parameters and pick the best one

# Parameters searched for the Decision Tree:
#   - max_depth: Limits how deep the tree can grow. A smaller value reduces complexity.
#   - min_samples_leaf: Minimum number of samples required to be at a leaf node.
#   - criterion: Impurity measure used to choose splits (gini or entropy).

param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_leaf': [1, 5, 10, 20],
    'criterion': ['gini', 'entropy']
}

# Base Decision Tree estimator
dt_base = DecisionTreeClassifier(random_state=42)

# Perform GridSearchCV using 5-fold cross validation on the training+validation rows
grid_search = GridSearchCV(estimator=dt_base, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_val, y_train_val)

# Print the best parameters found
best_params = grid_search.best_params_
print(f"\nBest hyperparameters found via GridSearchCV: {best_params}")


# Final Decision Tree with Best Parameters
# GridSearchCV refits the best configuration on all of X_train_val by default
# (refit=True), so the fitted estimator is reused instead of training it a second time.
best_dt_model = grid_search.best_estimator_


# Evaluate the Optimized Model
# Test the tuned model on unseen data.
y_pred = best_dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of the best model on the test set: {accuracy:.2f}")

# Visualize the Optimized Decision Tree
# Display the tuned tree structure with colors and labels.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
# NOTE(review): plot_tree expects class_names in ascending order of the encoded class
# values. The CLASS codes were assigned by pd.factorize in order of first appearance,
# so confirm that this hard-coded order matches the actual code -> label mapping.
plt.figure(figsize=(20, 10))
plot_tree(best_dt_model,
          feature_names=list(X.columns),
          class_names=['Diabetic', 'Non-Diabetic', 'Predict-Diabetic'],
          filled=True,
          rounded=True,
          fontsize=8,
          impurity=False)
plt.title("Optimized Decision Tree for Diabetes Classification")
plt.show()


In [None]:
# Train K-Nearest Neighbors Model
# KNN classifies by distance between feature vectors, so features on larger numeric
# scales would dominate the neighbor search if left unscaled. The features are
# therefore standardized inside a pipeline (as is done for the SVM); the scaler is
# fit on the training data only and re-applied automatically at prediction time.
# Use KNN with k = 9 neighbors
knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=9)),
])
knn.fit(X_train_val, y_train_val)

In [None]:
# Score the KNN model on the held-out test split
y_pred_knn = knn.predict(X_test)

# Accuracy, confusion matrix and per-class precision/recall/F1
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
report = classification_report(y_true=y_test, y_pred=y_pred_knn)

print("Confusion Matrix:\n", cm)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

In [None]:
# KNN confusion-matrix heatmap

# Recompute the test-set predictions so this cell does not depend on the previous one
y_pred_knn = knn.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print("Accuracy:", accuracy)

# Confusion matrix (rows: true classes, columns: predicted classes)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)

# Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Reds", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()