In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This code was written by Salim.
# 1. Import libraries

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Load Dataset

In [7]:
# Column labels applied to the CSV, in file order; the last one is the target.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
data_path = "diabetes.csv"

# header=0 treats the first row of the file as a header and replaces it with
# `columns`; every column is then coerced to a numeric dtype, with values that
# cannot be parsed becoming NaN.
data = (
    pd.read_csv(data_path, header=0, names=columns)
    .apply(pd.to_numeric, errors='coerce')
)

# Handle Missing Values (replace 0 with NaN for specific columns) :

In [8]:
# Columns in which a literal 0 is treated as "not recorded" rather than a value.
missing_columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Swap every 0 in those columns for NaN so the imputer can fill them later.
zero_masked = data[missing_columns].replace(to_replace=0, value=np.nan)
data[missing_columns] = zero_masked

In [1]:
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib.backends.registry'

# Impute Missing Values with Median :


In [9]:
# Learn the per-column medians, then fill the NaNs in those columns with them.
# NOTE(review): the medians are computed over every row of `data`, including
# rows that later end up in the test split — consider fitting on training rows only.
imputer = SimpleImputer(strategy='median')
imputer.fit(data[missing_columns])
data[missing_columns] = imputer.transform(data[missing_columns])

# Split Features and Target :


In [10]:
# Target vector: the 'Outcome' column; feature matrix: every other column.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Handle Class Imbalance :

In [11]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-Test Split :

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Feature Scaling :

In [13]:
# Learn mean/variance from the training features only, then apply the same
# transformation to both splits so the test set is scaled with training statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Random Forest Classifier with Hyperparameter Tuning :

In [14]:
# Hyperparameter grid: 3 x 3 x 3 = 27 candidate forests.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation.
# NOTE(review): X_train contains SMOTE-generated rows, so the validation folds
# include synthetic samples — treat the cross-validated score as optimistic.
rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)

# Best Model :

In [15]:
# Keep a handle on the winning forest and report what the search selected.
best_rf = grid.best_estimator_
print(
    f"Best Parameters: {grid.best_params_}",
    f"Best Cross-Validated Accuracy: {grid.best_score_:.4f}",
    sep="\n",
)

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validated Accuracy: 0.8075


# Evaluate on Test Set :

In [16]:
# grid.best_estimator_ was already refitted on the full training set by
# GridSearchCV (refit=True is its default), so fitting it again here would only
# repeat the same work. Use the fitted model directly.
y_pred = best_rf.predict(X_test)
# Column 1 of predict_proba is the score for the second class; it feeds ROC-AUC.
y_proba = best_rf.predict_proba(X_test)[:, 1]

# Metrics :

In [17]:
# Tabular summaries: each heading is printed, followed by its table.
summary_sections = (
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
)
for heading, table in summary_sections:
    print(heading)
    print(table)

# Scalar scores: accuracy from hard predictions, ROC-AUC from class-1 probabilities.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")


Confusion Matrix:
[[79 21]
 [15 85]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.81       100
           1       0.80      0.85      0.83       100

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200

Accuracy: 0.8200
ROC-AUC: 0.9053


# Feature Importance :

In [18]:
importance = best_rf.feature_importances_

# Label the scores with the columns of the feature frame X instead of indexing
# into `columns`, which also contains the target 'Outcome' and is only aligned
# with the model inputs by position.
feature_names = list(X.columns)
assert len(feature_names) == len(importance), (
    "feature names and importances are misaligned"
)

print("\nFeature Importances:")
for name, score in zip(feature_names, importance):
    print(f"Feature: {name}, Importance: {score:.4f}")


Feature Importances:
Feature: Pregnancies, Importance: 0.0529
Feature: Glucose, Importance: 0.2746
Feature: BloodPressure, Importance: 0.0870
Feature: SkinThickness, Importance: 0.0818
Feature: Insulin, Importance: 0.0926
Feature: BMI, Importance: 0.1774
Feature: DiabetesPedigreeFunction, Importance: 0.1151
Feature: Age, Importance: 0.1187


Feel free to contact me, and please support this notebook with an upvote.