In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
# Point `file_path` at your own copy of the CSV before running.
file_path = 'diabetes.csv'
data = pd.read_csv(file_path)

# Q1: Import and examine the dataset
# Each entry pairs a heading with the pandas summary printed beneath it:
# the first rows, per-column descriptive statistics, and NaN counts per column.
exploration_reports = [
    ("Dataset Overview:", data.head()),
    ("\nDescriptive Statistics:", data.describe()),
    ("\nChecking for missing values:", data.isnull().sum()),
]
for heading, summary in exploration_reports:
    print(heading)
    print(summary)

# Visualize distributions and relationships
# Pairwise plots of the columns, coloured by the 'Outcome' column.
sns.pairplot(data=data, hue='Outcome')
plt.show()

# Q2: Preprocess the data
# Drop every row that contains a NaN in any column (simple strategy, no imputation).
# NOTE(review): if missing measurements are encoded as 0 rather than NaN in
# this CSV, they are not caught here - confirm against the data source.
data = data.dropna()

# Remove outliers with a per-column z-score filter
from scipy import stats

# Score the feature columns only. Including the target would treat a rare
# class as an "outlier": for a binary target with prevalence below ~10%,
# every positive row has |z| > 3 and would be silently deleted.
numeric_features = data.drop(columns='Outcome').select_dtypes(include=[np.number])

# A constant column has zero variance, so its z-scores are NaN and would fail
# the `< 3` test for every row; map NaN to 0 so such a column is ignored.
z_scores = np.nan_to_num(np.abs(stats.zscore(numeric_features)), nan=0.0)

# Keep only rows whose absolute z-score is below 3 in every feature column.
data_clean = data[(z_scores < 3).all(axis=1)]

# Split the dataset into features and target
X = data_clean.drop('Outcome', axis=1)
y = data_clean['Outcome']

# Split the dataset into training and test sets (70% / 30%).
# Stratifying on the target keeps the class ratio the same in both partitions,
# so precision/recall on the test set are not distorted by an unlucky draw.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Q3: Train a Decision Tree model
# Choose the tree depth by 5-fold cross-validation on the training set only,
# so the test set stays untouched for the final evaluation.
candidate_depths = [2, 3, 4, 5, 6, 8, 10, None]  # None = no depth limit (the previous setting)
best_depth = None
cv_scores = None
for depth in candidate_depths:
    candidate = DecisionTreeClassifier(max_depth=depth, random_state=42)
    depth_scores = cross_val_score(candidate, X_train, y_train, cv=5)
    # Strict '>' keeps the earliest (shallowest) candidate when means tie.
    if cv_scores is None or depth_scores.mean() > cv_scores.mean():
        best_depth, cv_scores = depth, depth_scores

# Report the per-fold scores of the selected configuration.
print("\nCross-Validation Scores:")
print(cv_scores)
print("Mean CV Score:", cv_scores.mean())
print("Selected max_depth:", best_depth)

# Fit the final model on the full training set with the selected depth.
model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
model.fit(X_train, y_train)

# Q4: Evaluate the model
# Predict labels for the held-out test set.
y_pred = model.predict(X_test)

# Calculate performance metrics
# Each row pairs the printed label with the scikit-learn scorer behind it.
metric_table = [
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
]
accuracy, precision, recall, f1 = (
    metric_fn(y_test, y_pred) for _, metric_fn in metric_table
)

print("\nModel Performance:")
for (label, _), value in zip(metric_table, (accuracy, precision, recall, f1)):
    print(label, value)

# Confusion Matrix
# Pin the label order to the classes the model was trained on. Without it the
# labels are inferred from y_test/y_pred, so a test split containing a single
# class would yield a 1x1 matrix and the 2x2 labelling below would raise.
# NOTE(review): assumes model.classes_ lists the non-diabetic class first - confirm.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
cm_df = pd.DataFrame(
    cm,
    index=['Non-Diabetic', 'Diabetic'],
    columns=['Predicted Non-Diabetic', 'Predicted Diabetic'],
)

# Plot the confusion matrix as an annotated heatmap of integer counts.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
# Score the test set once and reuse the result for both the curve and the AUC.
# Column 1 of predict_proba corresponds to model.classes_[1], used here as the
# positive class.
positive_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)
roc_auc = roc_auc_score(y_test, positive_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Q5: Interpret the Decision Tree
# Draw the fitted tree with nodes coloured by class.
plt.figure(figsize=(15, 10))
# Pass feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
plot_tree(
    model,
    filled=True,
    feature_names=list(X.columns),
    class_names=['Non-Diabetic', 'Diabetic'],
)
plt.title('Decision Tree Visualization')
plt.show()

# Q6: Validate the Model
# Sensitivity analysis and scenario testing are not implemented here; they can
# be carried out by varying the dataset or by using different train/test splits.

# Label each importance value of the fitted tree with its feature column name
# and list them from largest to smallest.
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
ranked_importances = importance_by_feature.sort_values(ascending=False)
print("\nFeature Importances:")
print(ranked_importances)

# Further validation would include testing on new data and robustness checks.
