In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

In [2]:
# Load data
df = pd.read_csv('dataset.csv')
df.shape

(303, 14)

In [3]:
# Check for missing values
if df.isnull().sum().sum() > 0:
    # Impute missing values with mean
    imputer = SimpleImputer(strategy='mean')
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)


In [4]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

In [5]:
# Scale numerical data (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [7]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(kernel='linear'),  # Linear kernel to allow feature importance
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'ANN': MLPClassifier(max_iter=1000)  # Simple ANN
}

In [8]:
# Evaluate each model with Sequential Forward Selection
results = []

In [9]:
# Apply Forward Selection for each model
for model_name, model in models.items():
    sfs = SequentialFeatureSelector(model, direction='forward', scoring='accuracy', cv=5, n_features_to_select='auto')
    sfs.fit(X_train, y_train)
    
    # Get the selected features
    selected_features = X.columns[sfs.get_support()]
    
    print(f"\nModel: {model_name}")
    print(f"Optimal number of features: {len(selected_features)}")
    print("Selected Features:", selected_features)
    
    X_train_selected = sfs.transform(X_train)
    X_test_selected = sfs.transform(X_test)
    
    # Train the model on selected features
    model.fit(X_train_selected, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_selected)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    
    # Store results
    results.append({
        'Model': model_name,
        'Selected Features': list(selected_features),
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })
    
    # Print evaluation metrics
    # print(f"Selected Features: {selected_features}")
    # print(f"Accuracy: {accuracy:.4f}")
    # print(f"Precision: {precision:.4f}")
    # print(f"Recall: {recall:.4f}")
    # print(f"F1 Score: {f1:.4f} \n\n\n")


Model: Logistic Regression
Optimal number of features: 6
Selected Features: Index(['cp', 'chol', 'exang', 'oldpeak', 'ca', 'thal'], dtype='object')

Model: SVM
Optimal number of features: 6
Selected Features: Index(['sex', 'cp', 'trestbps', 'thalach', 'ca', 'thal'], dtype='object')

Model: KNN
Optimal number of features: 6
Selected Features: Index(['sex', 'cp', 'chol', 'exang', 'oldpeak', 'slope'], dtype='object')

Model: Naive Bayes
Optimal number of features: 6
Selected Features: Index(['cp', 'exang', 'oldpeak', 'slope', 'ca', 'thal'], dtype='object')





Model: ANN
Optimal number of features: 6
Selected Features: Index(['sex', 'cp', 'chol', 'oldpeak', 'ca', 'thal'], dtype='object')




In [10]:
# Show the results
print("\nSummary of Model Performance:")
for result in results:
    print(f"\nModel: {result['Model']}")
    print(f"Selected Features: {result['Selected Features']}")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print(f"F1 Score: {result['F1 Score']:.4f}")


Summary of Model Performance:

Model: Logistic Regression
Selected Features: ['cp', 'chol', 'exang', 'oldpeak', 'ca', 'thal']
Accuracy: 0.8852
Precision: 0.9032
Recall: 0.8750
F1 Score: 0.8889

Model: SVM
Selected Features: ['sex', 'cp', 'trestbps', 'thalach', 'ca', 'thal']
Accuracy: 0.9180
Precision: 0.8857
Recall: 0.9688
F1 Score: 0.9254

Model: KNN
Selected Features: ['sex', 'cp', 'chol', 'exang', 'oldpeak', 'slope']
Accuracy: 0.7541
Precision: 0.7742
Recall: 0.7500
F1 Score: 0.7619

Model: Naive Bayes
Selected Features: ['cp', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
Accuracy: 0.8525
Precision: 0.8710
Recall: 0.8438
F1 Score: 0.8571

Model: ANN
Selected Features: ['sex', 'cp', 'chol', 'oldpeak', 'ca', 'thal']
Accuracy: 0.8689
Precision: 0.9000
Recall: 0.8438
F1 Score: 0.8710


In [11]:
from sklearn.ensemble import BaggingClassifier
# Apply Bagging for each of the models evaluated earlier
for model_name, model in models.items():
    print(f"\nApplying Bagging on Model: {model_name}")
    
    # Perform Forward Selection (already done above)
    sfs = SequentialFeatureSelector(model, direction='forward', scoring='accuracy', cv=5, n_features_to_select='auto')
    sfs.fit(X_train, y_train)
    
    # Get the selected features
    selected_features = X.columns[sfs.get_support()]
    X_train_selected = sfs.transform(X_train)
    X_test_selected = sfs.transform(X_test)
    
    # Apply Bagging on the model
    bagging_model = BaggingClassifier(estimator=model, n_estimators=50, random_state=42)
    bagging_model.fit(X_train_selected, y_train)
    
    # Make predictions with Bagging
    y_pred_bagging = bagging_model.predict(X_test_selected)
    
    # Evaluate the Bagging model
    accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
    precision_bagging = precision_score(y_test, y_pred_bagging, average='binary')
    recall_bagging = recall_score(y_test, y_pred_bagging, average='binary')
    f1_bagging = f1_score(y_test, y_pred_bagging, average='binary')
    
    # Print Bagging evaluation metrics
    print(f"Selected Features: {selected_features}")
    print(f"Bagging Accuracy: {accuracy_bagging:.4f}")
    print(f"Bagging Precision: {precision_bagging:.4f}")
    print(f"Bagging Recall: {recall_bagging:.4f}")
    print(f"Bagging F1 Score: {f1_bagging:.4f}")



Applying Bagging on Model: Logistic Regression
Selected Features: Index(['cp', 'chol', 'exang', 'oldpeak', 'ca', 'thal'], dtype='object')
Bagging Accuracy: 0.8852
Bagging Precision: 0.9032
Bagging Recall: 0.8750
Bagging F1 Score: 0.8889

Applying Bagging on Model: SVM
Selected Features: Index(['sex', 'cp', 'trestbps', 'thalach', 'ca', 'thal'], dtype='object')
Bagging Accuracy: 0.8852
Bagging Precision: 0.8788
Bagging Recall: 0.9062
Bagging F1 Score: 0.8923

Applying Bagging on Model: KNN
Selected Features: Index(['sex', 'cp', 'chol', 'exang', 'oldpeak', 'slope'], dtype='object')
Bagging Accuracy: 0.7705
Bagging Precision: 0.8000
Bagging Recall: 0.7500
Bagging F1 Score: 0.7742

Applying Bagging on Model: Naive Bayes
Selected Features: Index(['cp', 'exang', 'oldpeak', 'slope', 'ca', 'thal'], dtype='object')
Bagging Accuracy: 0.8852
Bagging Precision: 0.8788
Bagging Recall: 0.9062
Bagging F1 Score: 0.8923

Applying Bagging on Model: ANN




Selected Features: Index(['sex', 'cp', 'fbs', 'oldpeak', 'ca', 'thal'], dtype='object')
Bagging Accuracy: 0.8361
Bagging Precision: 0.8667
Bagging Recall: 0.8125
Bagging F1 Score: 0.8387


In [12]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC

# Define the base models and the final model for stacking
base_models = [
    ('svm', SVC(probability=True)),
    ('naive_bayes', GaussianNB()),
    ('logistic_regression', LogisticRegression()),
    ('ann', MLPClassifier(max_iter=1000))
]

# Final model to combine the base models' predictions
final_model = LogisticRegression()

# Create the StackingClassifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_model, cv=5)

# Fit the stacking model with the selected features
stacking_model.fit(X_train_selected, y_train)

# Make predictions with the stacking model
y_pred_stacking = stacking_model.predict(X_test_selected)

# Evaluate the stacking model
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking, average='binary')
recall_stacking = recall_score(y_test, y_pred_stacking, average='binary')
f1_stacking = f1_score(y_test, y_pred_stacking, average='binary')

# Print Stacking model evaluation metrics
print("\nStacking Model Evaluation Metrics:")
print(f"Stacking Accuracy: {accuracy_stacking:.4f}")
print(f"Stacking Precision: {precision_stacking:.4f}")
print(f"Stacking Recall: {recall_stacking:.4f}")
print(f"Stacking F1 Score: {f1_stacking:.4f}")





Stacking Model Evaluation Metrics:
Stacking Accuracy: 0.8852
Stacking Precision: 0.8571
Stacking Recall: 0.9375
Stacking F1 Score: 0.8955




In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

# Define the base models for voting
base_models_voting = [
    ('svm', SVC(probability=True)),
    ('naive_bayes', GaussianNB()),
    ('logistic_regression', LogisticRegression()),
    ('ann', MLPClassifier(max_iter=1000))
]

# Create the VotingClassifier
voting_model = VotingClassifier(estimators=base_models_voting, voting='soft')

# Fit the voting model with the selected features
voting_model.fit(X_train_selected, y_train)

# Make predictions with the voting model
y_pred_voting = voting_model.predict(X_test_selected)

# Evaluate the voting model
accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting, average='binary')
recall_voting = recall_score(y_test, y_pred_voting, average='binary')
f1_voting = f1_score(y_test, y_pred_voting, average='binary')

# Print Voting model evaluation metrics
print("\nVoting Model Evaluation Metrics:")
print(f"Voting Accuracy: {accuracy_voting:.4f}")
print(f"Voting Precision: {precision_voting:.4f}")
print(f"Voting Recall: {recall_voting:.4f}")
print(f"Voting F1 Score: {f1_voting:.4f}")



Voting Model Evaluation Metrics:
Voting Accuracy: 0.8525
Voting Precision: 0.8710
Voting Recall: 0.8438
Voting F1 Score: 0.8571


In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a function to evaluate each model with AdaBoost
def evaluate_boosting_model(base_model, model_name):
    # Create an AdaBoost classifier with a base model
    boosting_model = AdaBoostClassifier(estimator=base_model, n_estimators=50, random_state=42)

    # Fit the boosting model with the selected features
    boosting_model.fit(X_train_selected, y_train)

    # Make predictions with the boosting model
    y_pred_boosting = boosting_model.predict(X_test_selected)

    # Evaluate the boosting model
    accuracy_boosting = accuracy_score(y_test, y_pred_boosting)
    precision_boosting = precision_score(y_test, y_pred_boosting, average='binary')
    recall_boosting = recall_score(y_test, y_pred_boosting, average='binary')
    f1_boosting = f1_score(y_test, y_pred_boosting, average='binary')

    # Print Boosting model evaluation metrics
    print(f"\n{model_name} Boosting Model Evaluation Metrics:")
    print(f"Boosting Accuracy: {accuracy_boosting:.4f}")
    print(f"Boosting Precision: {precision_boosting:.4f}")
    print(f"Boosting Recall: {recall_boosting:.4f}")
    print(f"Boosting F1 Score: {f1_boosting:.4f}")



In [15]:
# List of base models to evaluate with AdaBoost
base_models_to_boost = [
    (SVC(probability=True), 'SVM'),
    (GaussianNB(), 'Naive Bayes'),
    (LogisticRegression(), 'Logistic Regression'),
    # (MLPClassifier(max_iter=1000), 'ANN'),
]

In [16]:
# Evaluate each model with AdaBoost
for model_class, model_name in base_models_to_boost:
    # Create an instance of the model
    # model_instance = model_class()  # Instantiate the model here
    evaluate_boosting_model(model_class, model_name)




SVM Boosting Model Evaluation Metrics:
Boosting Accuracy: 0.7705
Boosting Precision: 0.8750
Boosting Recall: 0.6562
Boosting F1 Score: 0.7500

Naive Bayes Boosting Model Evaluation Metrics:
Boosting Accuracy: 0.5902
Boosting Precision: 0.5814
Boosting Recall: 0.7812
Boosting F1 Score: 0.6667





Logistic Regression Boosting Model Evaluation Metrics:
Boosting Accuracy: 0.9016
Boosting Precision: 0.8824
Boosting Recall: 0.9375
Boosting F1 Score: 0.9091
