In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
# --- Heart dataset: load, one-hot encode, split ------------------------------
# Column groups. `cat_cols` drives the one-hot encoding below; `con_cols` is
# only recorded here and is not read anywhere in this section.
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

df = pd.read_csv("./datasets/heart.csv")

# Encode on a copy so that `df` keeps the raw values; drop_first=True removes
# one dummy level per categorical column.
df1 = pd.get_dummies(df.copy(), columns=cat_cols, drop_first=True)

# Target is the 'output' column; every other column is a feature.
y = df1['output']
X = df1.drop(columns=['output'])

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling is switched off: a MinMaxScaler is created but never fitted, and the
# "*_normalized" names are plain aliases of the unscaled splits. To enable
# scaling, fit the scaler on X_train and apply transform() to both splits.
scaler = MinMaxScaler()
X_train_normalized, X_test_normalized = X_train, X_test
# --- Fit every classifier and report test accuracy and F1 --------------------
# The SVM and logistic regression are fitted on X_train_normalized and
# evaluated on X_test_normalized; the remaining models use X_train / X_test.

# Linear-kernel SVM
svm = SVC(kernel='linear', C=1, random_state=42, probability=True).fit(X_train_normalized, y_train)
y_pred_svm = svm.predict(X_test_normalized)
print("The test accuracy score of SVM is ", accuracy_score(y_test, y_pred_svm), "f1 :", f1_score(y_test, y_pred_svm))

# Unregularised logistic regression. `penalty=None` replaces the string
# 'none', which scikit-learn deprecated in 1.2 and removed in 1.4.
logreg = LogisticRegression(penalty=None, max_iter=2000).fit(X_train_normalized, y_train)
y_pred_logreg = logreg.predict(X_test_normalized)
print("The test accuracy score of Logistic Regression is ", accuracy_score(y_test, y_pred_logreg), "f1 :", f1_score(y_test, y_pred_logreg))

# Decision tree
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("The test accuracy score of Decision Tree is ", accuracy_score(y_test, y_pred_dt), "f1 :", f1_score(y_test, y_pred_dt))

# Random forest, seeded like the other stochastic models so that the scores
# and feature importances are the same on every run.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("The test accuracy score of Random Forest is ", accuracy_score(y_test, y_pred_rf), "f1 :", f1_score(y_test, y_pred_rf))

# XGBoost (reported under the label "Gradient Boosting Classifier")
xgb_classifier = xgb.XGBClassifier(n_estimators=50, random_state=42)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
print("The test accuracy score of Gradient Boosting Classifier is ", accuracy_score(y_test, y_pred_xgb), "f1 :", f1_score(y_test, y_pred_xgb))

# Multi-layer perceptron with two hidden layers of 64 units
mlp = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print("The test accuracy score of MLP Classifier is ", accuracy_score(y_test, y_pred_mlp), "f1 :", f1_score(y_test, y_pred_mlp))

# k-nearest neighbours (k=3)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
# Both metrics are computed from the KNN predictions; the accuracy was
# previously computed from the MLP predictions by mistake.
print("The test accuracy score of knn Classifier is ", accuracy_score(y_test, y_pred_knn), "f1 :", f1_score(y_test, y_pred_knn))
# --- Per-model feature importances, gathered into one table ------------------

# Importances exposed directly by the fitted estimators.
xgb_importances = xgb_classifier.feature_importances_
rf_importances = rf.feature_importances_

# Logistic regression: absolute value of the first row of coef_.
logreg_importances = np.abs(logreg.coef_[0])

# SVM: permutation importance measured on the test split.
perm_importance_svm = permutation_importance(
    svm,
    X_test_normalized,
    y_test,
    n_repeats=30,
    random_state=42,
    n_jobs=50,
)

# Despite its name, this variable holds whatever the booster's get_score()
# returns; it is printed here for inspection only.
current_importance_type = xgb_classifier.get_booster().get_score()
print(current_importance_type)

# One row per feature, one column per model.
importance_columns = {
    'Feature': X.columns,
    'RandomForest': rf_importances,
    'DecissionTree': dt.feature_importances_,
    'XGBoost': xgb_importances,
    'LogisticRegression': logreg_importances,
    'SVM': perm_importance_svm.importances_mean,
}
feature_importances = pd.DataFrame(importance_columns)

display(feature_importances)



In [None]:
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
# --- Partial dependence plots, one figure per fitted model -------------------
# Positional indices of the features to plot (not names, despite the variable
# name). Presumably these are the five continuous columns followed by sex_1;
# verify against X_train.columns.
feature_names = [0, 1, 2, 3, 4, 5]

# Columns to be drawn as categorical rather than continuous.
pdp_categorical = ["sex_1"]

# (figure title prefix, fitted estimator), in plotting order.
pdp_models = [
    ("Decision Tree", dt),
    ("Logistic Regression", logreg),
    ("XGBoost", xgb_classifier),
    ("Random Forest", rf),
    ("MLP", mlp),
    ("KNN", knn),
    ("SUPPORT VECTOR MACHINE", svm),
]

# Every model gets the same data (X_train) and the same categorical handling.
# The Decision Tree plot previously used the full X and omitted
# categorical_features, so it was not comparable with the others.
for pdp_title, pdp_model in pdp_models:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_title(f"{pdp_title} - Partial Dependence")
    PartialDependenceDisplay.from_estimator(
        pdp_model,
        X_train,
        features=feature_names,
        categorical_features=pdp_categorical,
        ax=ax,
    )
    plt.show()


This data was extracted from the 1994 Census Bureau database by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics). A set of reasonably clean records was extracted using the following conditions: `((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0))`. The prediction task is to determine whether a person makes over $50K a year.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import pdpbox, lime, shap
from matplotlib import pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score

# --- Adult census dataset: load, encode, split, scale ------------------------
data = pd.read_csv("./datasets/adult.csv")
# Shown explicitly: a bare expression in the middle of a cell is discarded.
display(data.shape)

# Binary target: 0 for '<=50K', 1 for '>50K'. The class counts are displayed
# explicitly for the same reason as the shape above.
data['target'] = data['income'].map({'<=50K': 0, '>50K': 1})
display(data['target'].value_counts())

# Remove the raw label and the two columns excluded from the feature set, in
# one non-mutating call.
data = data.drop(columns=["income", "education.num", "native.country"])

# One-hot encode the remaining categorical columns.
data = pd.get_dummies(data, drop_first=True)

y = data['target'].values
features = [col for col in data.columns if col != 'target']
X = data[features]

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.3, stratify=y
)

# Min-max scaling: fit on the training split only, then apply the SAME
# transformation to the test split. Re-fitting on the test split (as was done
# before) scales the two splits differently and leaks test statistics.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
# --- Fit every classifier and report test accuracy and F1 --------------------
# The SVM and logistic regression are fitted on X_train_normalized and
# evaluated on X_test_normalized; the remaining models use X_train / X_test.

# Linear-kernel SVM
svm = SVC(kernel='linear', C=1, random_state=42, probability=True).fit(X_train_normalized, y_train)
y_pred_svm = svm.predict(X_test_normalized)
print("The test accuracy score of SVM is ", accuracy_score(y_test, y_pred_svm), "f1 :", f1_score(y_test, y_pred_svm))

# Unregularised logistic regression. `penalty=None` replaces the string
# 'none', which scikit-learn deprecated in 1.2 and removed in 1.4.
logreg = LogisticRegression(penalty=None, max_iter=2000).fit(X_train_normalized, y_train)
y_pred_logreg = logreg.predict(X_test_normalized)
print("The test accuracy score of Logistic Regression is ", accuracy_score(y_test, y_pred_logreg), "f1 :", f1_score(y_test, y_pred_logreg))

# Decision tree
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("The test accuracy score of Decision Tree is ", accuracy_score(y_test, y_pred_dt), "f1 :", f1_score(y_test, y_pred_dt))

# Random forest, seeded like the other stochastic models so that the scores
# and feature importances are the same on every run.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("The test accuracy score of Random Forest is ", accuracy_score(y_test, y_pred_rf), "f1 :", f1_score(y_test, y_pred_rf))

# XGBoost (reported under the label "Gradient Boosting Classifier")
xgb_classifier = xgb.XGBClassifier(n_estimators=50, random_state=42)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
print("The test accuracy score of Gradient Boosting Classifier is ", accuracy_score(y_test, y_pred_xgb), "f1 :", f1_score(y_test, y_pred_xgb))

# Multi-layer perceptron with two hidden layers of 64 units
mlp = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print("The test accuracy score of MLP Classifier is ", accuracy_score(y_test, y_pred_mlp), "f1 :", f1_score(y_test, y_pred_mlp))

# k-nearest neighbours (k=3)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
# Both metrics are computed from the KNN predictions; the accuracy was
# previously computed from the MLP predictions by mistake.
print("The test accuracy score of knn Classifier is ", accuracy_score(y_test, y_pred_knn), "f1 :", f1_score(y_test, y_pred_knn))
# --- Per-model feature importances, gathered into one table ------------------

# Permutation importance must be measured on data in the same representation
# the model was fitted on: the SVM was fitted on the scaled matrix, while KNN
# was fitted on the unscaled X_train and is therefore evaluated on X_test.
perm_importance_svm = permutation_importance(
    svm, X_test_normalized, y_test, n_repeats=20, random_state=42, n_jobs=20
)
perm_importance_knn = permutation_importance(
    knn, X_test, y_test, n_repeats=20, random_state=42, n_jobs=20
)

# Importances exposed directly by the fitted estimators.
rf_importances = rf.feature_importances_
xgb_importances = xgb_classifier.feature_importances_
# Logistic regression: absolute value of the first row of coef_.
logreg_importances = abs(logreg.coef_[0])

# Despite its name, this variable holds whatever the booster's get_score()
# returns; it is printed here for inspection only.
current_importance_type = xgb_classifier.get_booster().get_score()
print(current_importance_type)

# One row per feature, one column per model. Both permutation results
# contribute their per-feature mean (`importances_mean`), not the whole
# result object.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'RandomForest': rf_importances,
    'DecissionTree': dt.feature_importances_,
    'XGBoost': xgb_importances,
    'LogisticRegression': logreg_importances,
    'KNN': perm_importance_knn.importances_mean,
    'SVM': perm_importance_svm.importances_mean,
})

# Display feature importances
display(feature_importances)
