In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline = Pipeline([
    ('selector', SelectKBest(f_classif)),
    ('classifier', DecisionTreeClassifier())
])

parameters = {
    'selector__k': [1, 2, 3, 4, 5, 6, 7, 8],
    'classifier__max_depth': [None, 5, 10, 15, 20, 25, 30]
}



grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
selected_features = grid_search.best_estimator_.named_steps['feature_selection'].get_support(indices=True)
selected_feature_names = X.columns[selected_features]

# Train the model with the best parameters
feature_importances = grid_search.best_estimator_.named_steps['classifier'].feature_importances_
optimized_pipeline = grid_search.best_estimator_
optimized_pipeline.fit(X_train, y_train)

feature_importance_df = pd.DataFrame({'Feature': selected_feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Predict on the test set
y_pred_optimized = optimized_pipeline.predict(X_test)

# Evaluate the optimized classifier
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
class_report_optimized = classification_report(y_test, y_pred_optimized)
conf_matrix_optimized = confusion_matrix(y_test, y_pred_optimized)

# Print results
print("Best Parameters:", best_params)
print("Best Cross-validation Score:", best_score)
print("Test Accuracy:", accuracy_optimized)
print("Classification Report:\n", class_report_optimized)
print("Confusion Matrix:\n", conf_matrix_optimized)
print(feature_importance_df)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


KeyError: 'feature_selection'

In [None]:
feature_importances = optimized_pipeline

feature_names = X_train.columns

# Create a bar plot for feature importances
plt.figure(figsize=(10, 6))
plt.barh(np.arange(len(feature_names)), feature_importances, align='center')
plt.yticks(np.arange(len(feature_names)), feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances in Decision Tree')
plt.show()

# Print feature importances
for feature, importance in zip(feature_names, feature_importances):
    print(f"Feature: {feature}, Importance: {importance:.3f}")
