In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
# Read the heart dataset and work on a copy so the raw frame stays untouched
df = pd.read_csv("./datasets/heart.csv")
df1 = df.copy()

# Column groups: cat_cols are one-hot encoded below; con_cols lists the
# continuous columns (kept for reference, not used further in this cell)
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# One-hot encode, dropping the first level of each categorical column
df1 = pd.get_dummies(df1, columns=cat_cols, drop_first=True)

# 'output' is the target; every other column is a feature
y = df1['output']
X = df1.drop(columns=['output'])

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3)

# Min-max scaling is currently switched off: the scaler is created but never
# fitted, and the "*_normalized" names are plain aliases of the unscaled splits.
# To switch it back on, fit on the training split only:
#   X_train_normalized = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
#   X_test_normalized = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
scaler = MinMaxScaler()
X_train_normalized, X_test_normalized = X_train, X_test
# Linear-kernel SVM; probability=True so the fitted model exposes predict_proba
svm = SVC(kernel='linear', C=1, probability=True, random_state=42)
svm.fit(X_train_normalized, y_train)

# Class predictions for the held-out split
y_pred_svm = svm.predict(X_test_normalized)

# Report test accuracy followed by test F1
print(
    "The test accuracy score of SVM is ",
    accuracy_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm),
)

# Unregularised logistic regression.
# `penalty=None` replaces the string 'none', which scikit-learn deprecated in
# 1.2 and later removed; the partial-dependence cell in this notebook already
# relies on a 1.2+ API (`categorical_features`), so None is the right spelling.
logreg = LogisticRegression(penalty=None, max_iter=2000).fit(
    X_train_normalized, y_train)

# Class predictions for the held-out split
y_pred_logreg = logreg.predict(X_test_normalized)

# Report test accuracy followed by test F1
print("The test accuracy score of Logistic Regression is ", accuracy_score(
    y_test, y_pred_logreg), f1_score(y_test, y_pred_logreg))

# Decision tree with a fixed seed, trained on the (unscaled) training split
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_dt = dt.predict(X_test)

# Report test accuracy and test F1
print(
    "The test accuracy score of Decision Tree is ",
    accuracy_score(y_test, y_pred_dt),
    "f1 :",
    f1_score(y_test, y_pred_dt),
)

# Random forest on the training split.
# random_state is pinned (same seed as the other models in this cell) so the
# reported scores and the feature importances read later are reproducible
# across Restart & Run All.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_rf = rf.predict(X_test)

# Report test accuracy and test F1
print("The test accuracy score of Random Forest is ", accuracy_score(
    y_test, y_pred_rf), "f1 :",  f1_score(y_test, y_pred_rf))

# XGBoost classifier with 50 boosting rounds and a fixed seed, fitted in one step
xgb_classifier = xgb.XGBClassifier(
    n_estimators=50, random_state=42).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_xgb = xgb_classifier.predict(X_test)

# Report test accuracy and test F1 (the printed label says
# "Gradient Boosting Classifier"; the model behind it is the XGBoost one above)
print(
    "The test accuracy score of Gradient Boosting Classifier is ",
    accuracy_score(y_test, y_pred_xgb),
    "f1 :",
    f1_score(y_test, y_pred_xgb),
)

# Multi-layer perceptron with two hidden layers of 64 units, fixed seed,
# instantiated and fitted in one step
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 64),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_mlp = mlp.predict(X_test)

# Report test accuracy and test F1
print(
    "The test accuracy score of MLP Classifier is ",
    accuracy_score(y_test, y_pred_mlp),
    "f1 :",
    f1_score(y_test, y_pred_mlp),
)

# k-nearest-neighbours classifier using the 3 closest training points
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the KNN model
knn.fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_knn = knn.predict(X_test)

# Report test accuracy and test F1 for KNN.
# Both metrics are computed from y_pred_knn; the accuracy was previously
# computed from the MLP predictions by mistake.
print("The test accuracy score of knn Classifier is ", accuracy_score(
    y_test, y_pred_knn), "f1 :",  f1_score(y_test, y_pred_knn))
# Permutation importance of each feature for the SVM, measured on the test
# split with 30 shuffles per feature. n_jobs=-1 uses the cores that are
# actually available instead of a hard-coded 50 workers; with random_state
# fixed the importances do not depend on the worker count.
perm_importance_svm = permutation_importance(
    svm, X_test_normalized, y_test, n_repeats=30, random_state=42, n_jobs=-1)

# Model-specific importances: impurity-based for the forest, the booster's
# own measure for XGBoost, and absolute coefficients for logistic regression
rf_importances = rf.feature_importances_
xgb_importances = xgb_classifier.feature_importances_
logreg_importances = abs(logreg.coef_[0])

# Raw per-feature scores from the underlying booster (get_score with its
# default arguments); despite the variable name this is a dict of scores
current_importance_type = xgb_classifier.get_booster().get_score()
print(current_importance_type)

# One row per feature, one column per model. The columns come from different
# importance definitions, so compare rankings rather than magnitudes.
# The 'DecissionTree' spelling is left unchanged because it is the column name.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'RandomForest': rf_importances,
    'DecissionTree': dt.feature_importances_,
    'XGBoost': xgb_importances,
    'LogisticRegression': logreg_importances,
    'SVM': perm_importance_svm.importances_mean,
})

# Display feature importances
display(feature_importances)

In [None]:
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
# Positional indices of the features to plot (the first six columns of the
# training frame); the name is historical, these are indices, not names
feature_names = [0, 1, 2, 3, 4, 5]

# (title prefix, fitted estimator) pairs in plotting order.
# Partial dependence is model-agnostic, so the SVM is plotted like the rest.
pdp_models = [
    ("Decision Tree", dt),
    ("Logistic Regression", logreg),
    ("XGBoost", xgb_classifier),
    ("Random Forest", rf),
    ("MLP", mlp),
    ("KNN", knn),
    ("SUPPORT VECTOR MACHINE", svm),
]

# One figure per model, all built the same way: computed on the training
# split with "sex_1" declared categorical. The Decision Tree plot previously
# used the full X (test rows included) and no categorical handling, which
# made it inconsistent with the other six plots.
for pdp_title, pdp_model in pdp_models:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_title(f"{pdp_title} - Partial Dependence")
    PartialDependenceDisplay.from_estimator(
        pdp_model, X_train, features=feature_names,
        categorical_features=["sex_1"], ax=ax)
    plt.show()

This data was extracted from the 1994 Census Bureau database by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics). A set of reasonably clean records was extracted using the following conditions: `((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0))`. The prediction task is to determine whether a person makes over $50K a year.

In [None]:
import shap
import lime
import pdpbox
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import warnings

from sklearn.discriminant_analysis import StandardScaler
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

df = pd.read_csv("./datasets/adult.csv")

# Columns used as model inputs; the target is the 'income' column
features = [
    'age', 'workclass', 'fnlwgt', 'education', 'education.num',
    'marital.status', 'occupation', 'relationship', 'race', 'sex',
    'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'
]

# Feature matrix as a NumPy array (columns in the order of `features`)
X = df[features].values

# Integer-encode the string-valued columns in place. Positions refer to
# `features`: workclass, education, marital.status, occupation, relationship,
# race, sex, native.country. The encoder is refitted per column, so after the
# loop `lbp` only remembers the classes of the last column.
categorical_positions = [1, 3, 5, 6, 7, 8, 9, 13]
lbp = LabelEncoder()
for position in categorical_positions:
    X[:, position] = lbp.fit_transform(X[:, position])

# Encode the target labels as integers
LE = LabelEncoder()
y = LE.fit_transform(df["income"])

# Split first, then scale: the scaler is fitted on the training rows only so
# no statistics from the test rows leak into training. (Previously it was
# fitted on all of X before the split.) X itself is left encoded but unscaled.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# SVM with default kernel; probability=True so predict_proba is available
svm = SVC(random_state=42, probability=True).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_svm = svm.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with the hard 0/1 predictions
roc_svm = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1
print("The test accuracy score of SVM is ", accuracy_svm, "f1 :",  f1_svm)

# Logistic regression with default regularisation and a raised iteration cap
logreg = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_logreg = logreg.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_lr = accuracy_score(y_test, y_pred_logreg)
f1_lr = f1_score(y_test, y_pred_logreg)
precision_lr = precision_score(y_test, y_pred_logreg)
recall_lr = recall_score(y_test, y_pred_logreg)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with the hard 0/1 predictions
roc_lr = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1
print("The test accuracy score of Logistic Regression is ",
      accuracy_lr, "f1 :", f1_lr)

# Decision tree with a fixed seed
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_dt = dt.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_dt = accuracy_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with the hard 0/1 predictions
roc_dt = roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1
print("The test accuracy score of Decision Tree is ", accuracy_dt, "f1 :", f1_dt)

# Random forest; random_state is pinned (same seed as the other models in
# this cell) so the reported metrics are reproducible between runs
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_rf = rf.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with the hard 0/1 predictions
roc_rf = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1
print("The test accuracy score of Random Forest is ",
      accuracy_rf, "f1 :",  f1_rf)

# XGBoost classifier with 50 boosting rounds and a fixed seed
xgb_classifier = xgb.XGBClassifier(n_estimators=50, random_state=42)

# Fit the XGBoost model
xgb_classifier.fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_xgb = xgb_classifier.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with the hard 0/1 predictions
roc_xgb = roc_auc_score(y_test, xgb_classifier.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1 (the printed label says
# "Gradient Boosting Classifier"; the model behind it is the XGBoost one above)
print("The test accuracy score of Gradient Boosting Classifier is ",
      accuracy_xgb, "f1 :",  f1_xgb)

# Multi-layer perceptron with two hidden layers of 64 units and a fixed seed
mlp = MLPClassifier(hidden_layer_sizes=(64, 64),
                    max_iter=2000, random_state=42)

# Fit the MLP model
mlp.fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_mlp = mlp.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_rn = accuracy_score(y_test, y_pred_mlp)
recall_rn = recall_score(y_test, y_pred_mlp)
precision_rn = precision_score(y_test, y_pred_mlp)
f1_rn = f1_score(y_test, y_pred_mlp)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with the hard 0/1 predictions
roc_rn = roc_auc_score(y_test, mlp.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1
print("The test accuracy score of MLP Classifier is ",
      accuracy_rn, "f1 :",  f1_rn)

# k-nearest-neighbours classifier using the 5 closest training points
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_knn = knn.predict(X_test)

# Threshold-based metrics use the predicted classes
accuracy_knn = accuracy_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
# ROC AUC is a ranking metric: score it with the positive-class probability
# (fraction of positive neighbours), not with the hard 0/1 predictions
roc_knn = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1])

# Report test accuracy and test F1 for KNN
print("The test accuracy score of knn Classifier is ",
      accuracy_knn, "f1 :", f1_knn)
# Collect the test metrics of every model: one tuple per model, in the order
# (name, accuracy, recall, precision, F1, AUC). Values here are 0-1 fractions.
models = [
    ('Decision Tree', accuracy_dt, recall_dt, precision_dt, f1_dt, roc_dt),
    ('Random Forest', accuracy_rf, recall_rf, precision_rf, f1_rf, roc_rf),
    ('XGBoost', accuracy_xgb, recall_xgb, precision_xgb, f1_xgb, roc_xgb),
    ('kNN', accuracy_knn, recall_knn, precision_knn, f1_knn, roc_knn),
    ('Logistic Regression', accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr),
    ('SVM', accuracy_svm, recall_svm, precision_svm, f1_svm, roc_svm),
    ('Neural Networks', accuracy_rn, recall_rn, precision_rn, f1_rn, roc_rn)]

df_all_models = pd.DataFrame(models, columns=[
                             'Model', 'Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)', 'AUC'])

# The four "(%)" columns are labelled as percentages, so convert the 0-1
# fractions to match their headers; AUC stays on its native 0-1 scale.
percent_columns = ['Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)']
df_all_models[percent_columns] = df_all_models[percent_columns] * 100

display(df_all_models)

In [32]:
import openml
# Fetch OpenML dataset 1590 and unpack it as pandas objects.
# NOTE(review): this rebinds the notebook-level X and y used by earlier cells.
# NOTE(review): no `target=` is passed to get_data, so the label column
# presumably stays inside X — confirm that y is not needed downstream.
OPENML_DATASET_ID = 1590
dataset = openml.datasets.get_dataset(OPENML_DATASET_ID)
dataset_parts = dataset.get_data(dataset_format="dataframe")
X, y, _, _ = dataset_parts

# Rich display of the feature frame
display(X)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802.0,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40,United-States,<=50K
1,38,Private,89814.0,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50,United-States,<=50K
2,28,Local-gov,336951.0,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40,United-States,>50K
3,44,Private,160323.0,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40,United-States,>50K
4,18,,103497.0,Some-college,10,Never-married,,Own-child,White,Female,0.0,0.0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302.0,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38,United-States,<=50K
48838,40,Private,154374.0,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40,United-States,>50K
48839,58,Private,151910.0,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40,United-States,<=50K
48840,22,Private,201490.0,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20,United-States,<=50K
