In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
import numpy as np

In [None]:
# Load the dataset
# The path is relative, so the CSV is resolved against the kernel's current
# working directory; `df` is the raw frame every later cell builds on.
file_path = "Healthcare-Diabetes.csv"
df = pd.read_csv(file_path)

In [None]:
# Show basic dataset info: pandas prints a summary of the frame's columns,
# non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Summary statistics of numerical features (count, mean, std, min, quartiles, max).
# Displayed as the cell's output because it is the last expression.
df.describe()

In [None]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

In [None]:
# Bar chart of how many rows fall into each 'Outcome' class
ax = sns.countplot(x=df["Outcome"])
ax.set_title("Class Distribution")
plt.show()


In [None]:
# Drop the 'Id' column as it is not needed as a model feature.
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' has
# already been removed no longer raises a KeyError.
df = df.drop(columns=['Id'], errors='ignore')

# Split features and target
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Hold out 20% of the rows for testing. stratify=y preserves the 'Outcome'
# class proportions in both partitions, so the test metrics are not skewed by
# an unlucky split; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split; fit() returns the estimator itself, so the result is bound directly.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted class labels for the held-out test rows
y_pred = model.predict(X_test)

In [None]:
# Per-class precision, recall, F1 and support on the held-out test set.
# The heading is printed on its own: passing both strings to a single print()
# inserts a separator space right after the "\n", which pushes the report's
# header row one column out of alignment with the rows beneath it.
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Raw-count confusion matrix of test-set predictions, with axes labelled by
# the classes the fitted model knows about.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot(cmap="Blues")
disp.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
# Row-normalised confusion matrix: every row (true class) sums to 1, so the
# diagonal reads as per-class recall. This complements the raw-count matrix
# instead of repeating it, and uses separate names so `cm` / `disp` keep
# referring to the raw-count version.
cm_normalized = confusion_matrix(y_test, y_pred, normalize="true")
disp_normalized = ConfusionMatrixDisplay(
    confusion_matrix=cm_normalized, display_labels=model.classes_
)
disp_normalized.plot(cmap="Blues", values_format=".2f")
plt.title("Normalized Confusion Matrix")
plt.show()

In [None]:
# Probability of the positive class: second column of predict_proba's output
y_probs = model.predict_proba(X_test)[:, 1]

# ROC points across all score thresholds, and the area under that curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="blue", label=f"ROC Curve (AUC = {roc_auc:.2f})")
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

In [None]:
# Importance scores exposed by the fitted forest, one per input column
feature_importances = model.feature_importances_

# Pair each score with its column name and rank from most to least important
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Last expression of the cell, so the ranked table is rendered as output
feature_importance_df

In [None]:
# Sweep the decision threshold over 50 evenly spaced values in [0, 1] and
# record how the four confusion-matrix cells change on the test set.
thresholds = np.linspace(0, 1, 50)

# NOTE: despite the "_rates" suffix these lists hold raw counts (the plot's
# y-axis is labelled "Count"); the names are kept so later cells that may
# reference them keep working.
tp_rates, fp_rates, tn_rates, fn_rates = [], [], [], []

for threshold in thresholds:
    # Predict the positive class wherever its probability reaches the threshold
    y_pred_thresh = (y_probs >= threshold).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even when only one class is present
    # among the true and predicted labels; without it the 4-way unpacking
    # below would fail on a smaller matrix.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1]).ravel()

    tp_rates.append(tp)
    fp_rates.append(fp)
    tn_rates.append(tn)
    fn_rates.append(fn)

plt.figure(figsize=(10, 6))
plt.plot(thresholds, tp_rates, label="True Positives", color="green")
plt.plot(thresholds, fp_rates, label="False Positives", color="red")
plt.plot(thresholds, tn_rates, label="True Negatives", color="blue")
plt.plot(thresholds, fn_rates, label="False Negatives", color="orange")

plt.xlabel("Threshold")
plt.ylabel("Count")
plt.title("Effect of Changing Threshold on TP, FP, TN, FN")
plt.legend()
plt.grid(True)
plt.show()
