In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('statlog.csv')
df.shape

(270, 14)

In [3]:

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,presence
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1


In [4]:
# Check for missing values
if df.isnull().sum().sum() > 0:
    # Impute missing values with mean
    imputer = SimpleImputer(strategy='mean')
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)


In [6]:
# Separate features and target
X = df.drop('presence', axis=1)
y = df['presence']

In [7]:
# Scale numerical data (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [9]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(kernel='linear'),  # Linear kernel to allow feature importance
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'ANN': MLPClassifier(max_iter=1000)  # Simple ANN
}

In [10]:
# Evaluate each model with Sequential Forward Selection
results = []

In [11]:
# Apply Forward Selection for each model
for model_name, model in models.items():
    sfs = SequentialFeatureSelector(model, direction='forward', scoring='accuracy', cv=5, n_features_to_select='auto')
    sfs.fit(X_train, y_train)
    
    # Get the selected features
    selected_features = X.columns[sfs.get_support()]
    
    print(f"\nModel: {model_name}")
    print(f"Optimal number of features: {len(selected_features)}")
    print("Selected Features:", selected_features)
    
    X_train_selected = sfs.transform(X_train)
    X_test_selected = sfs.transform(X_test)
    
    # Train the model on selected features
    model.fit(X_train_selected, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_selected)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    
    # Store results
    results.append({
        'Model': model_name,
        'Selected Features': list(selected_features),
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })
    
    # Print evaluation metrics
    print(f"Selected Features: {selected_features}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f} \n\n\n")


Model: Logistic Regression
Optimal number of features: 6
Selected Features: Index(['cp', 'trestbps', 'fbs', 'oldpeak', 'ca', 'thal'], dtype='object')
Selected Features: Index(['cp', 'trestbps', 'fbs', 'oldpeak', 'ca', 'thal'], dtype='object')
Accuracy: 0.8519
Precision: 0.8378
Recall: 0.9394
F1 Score: 0.8857 




Model: SVM
Optimal number of features: 6
Selected Features: Index(['age', 'cp', 'chol', 'oldpeak', 'ca', 'thal'], dtype='object')
Selected Features: Index(['age', 'cp', 'chol', 'oldpeak', 'ca', 'thal'], dtype='object')
Accuracy: 0.8519
Precision: 0.8571
Recall: 0.9091
F1 Score: 0.8824 




Model: KNN
Optimal number of features: 6
Selected Features: Index(['cp', 'trestbps', 'fbs', 'oldpeak', 'slope', 'ca'], dtype='object')
Selected Features: Index(['cp', 'trestbps', 'fbs', 'oldpeak', 'slope', 'ca'], dtype='object')
Accuracy: 0.7222
Precision: 0.7500
Recall: 0.8182
F1 Score: 0.7826 




Model: Naive Bayes
Optimal number of features: 6
Selected Features: Index(['cp', 'restecg', 




Model: ANN
Optimal number of features: 6
Selected Features: Index(['cp', 'fbs', 'restecg', 'oldpeak', 'slope', 'ca'], dtype='object')
Selected Features: Index(['cp', 'fbs', 'restecg', 'oldpeak', 'slope', 'ca'], dtype='object')
Accuracy: 0.7593
Precision: 0.7941
Recall: 0.8182
F1 Score: 0.8060 





In [12]:
# Show the results
print("\nSummary of Model Performance:")
for result in results:
    print(f"\nModel: {result['Model']}")
    print(f"Selected Features: {result['Selected Features']}")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print(f"F1 Score: {result['F1 Score']:.4f}")


Summary of Model Performance:

Model: Logistic Regression
Selected Features: ['cp', 'trestbps', 'fbs', 'oldpeak', 'ca', 'thal']
Accuracy: 0.8519
Precision: 0.8378
Recall: 0.9394
F1 Score: 0.8857

Model: SVM
Selected Features: ['age', 'cp', 'chol', 'oldpeak', 'ca', 'thal']
Accuracy: 0.8519
Precision: 0.8571
Recall: 0.9091
F1 Score: 0.8824

Model: KNN
Selected Features: ['cp', 'trestbps', 'fbs', 'oldpeak', 'slope', 'ca']
Accuracy: 0.7222
Precision: 0.7500
Recall: 0.8182
F1 Score: 0.7826

Model: Naive Bayes
Selected Features: ['cp', 'restecg', 'thalach', 'oldpeak', 'ca', 'thal']
Accuracy: 0.8889
Precision: 0.8649
Recall: 0.9697
F1 Score: 0.9143

Model: ANN
Selected Features: ['cp', 'fbs', 'restecg', 'oldpeak', 'slope', 'ca']
Accuracy: 0.7593
Precision: 0.7941
Recall: 0.8182
F1 Score: 0.8060


In [13]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC

# Define the base models and the final model for stacking
base_models = [
    ('svm', SVC(probability=True)),
    ('naive_bayes', GaussianNB()),
    ('logistic_regression', LogisticRegression()),
    ('ann', MLPClassifier(max_iter=1000))
]

# Final model to combine the base models' predictions
final_model = LogisticRegression()

# Create the StackingClassifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_model, cv=5)

# Fit the stacking model with the selected features
stacking_model.fit(X_train_selected, y_train)

# Make predictions with the stacking model
y_pred_stacking = stacking_model.predict(X_test_selected)

# Evaluate the stacking model
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking, average='binary')
recall_stacking = recall_score(y_test, y_pred_stacking, average='binary')
f1_stacking = f1_score(y_test, y_pred_stacking, average='binary')

# Print Stacking model evaluation metrics
print("\nStacking Model Evaluation Metrics:")
print(f"Stacking Accuracy: {accuracy_stacking:.4f}")
print(f"Stacking Precision: {precision_stacking:.4f}")
print(f"Stacking Recall: {recall_stacking:.4f}")
print(f"Stacking F1 Score: {f1_stacking:.4f}")





Stacking Model Evaluation Metrics:
Stacking Accuracy: 0.7593
Stacking Precision: 0.7778
Stacking Recall: 0.8485
Stacking F1 Score: 0.8116




In [14]:
from sklearn.ensemble import VotingClassifier

# Define the base models for voting
base_models_voting = [
    ('svm', SVC(probability=True)),
    ('naive_bayes', GaussianNB()),
    ('logistic_regression', LogisticRegression()),
    ('ann', MLPClassifier(max_iter=1000))
]

# Create the VotingClassifier
voting_model = VotingClassifier(estimators=base_models_voting, voting='soft')

# Fit the voting model with the selected features
voting_model.fit(X_train_selected, y_train)

# Make predictions with the voting model
y_pred_voting = voting_model.predict(X_test_selected)

# Evaluate the voting model
accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting, average='binary')
recall_voting = recall_score(y_test, y_pred_voting, average='binary')
f1_voting = f1_score(y_test, y_pred_voting, average='binary')

# Print Voting model evaluation metrics
print("\nVoting Model Evaluation Metrics:")
print(f"Voting Accuracy: {accuracy_voting:.4f}")
print(f"Voting Precision: {precision_voting:.4f}")
print(f"Voting Recall: {recall_voting:.4f}")
print(f"Voting F1 Score: {f1_voting:.4f}")



Voting Model Evaluation Metrics:
Voting Accuracy: 0.7593
Voting Precision: 0.7778
Voting Recall: 0.8485
Voting F1 Score: 0.8116


In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a function to evaluate each model with AdaBoost
def evaluate_boosting_model(base_model, model_name):
    # Create an AdaBoost classifier with a base model
    boosting_model = AdaBoostClassifier(estimator=base_model, n_estimators=50, random_state=42)

    # Fit the boosting model with the selected features
    boosting_model.fit(X_train_selected, y_train)

    # Make predictions with the boosting model
    y_pred_boosting = boosting_model.predict(X_test_selected)

    # Evaluate the boosting model
    accuracy_boosting = accuracy_score(y_test, y_pred_boosting)
    precision_boosting = precision_score(y_test, y_pred_boosting, average='binary')
    recall_boosting = recall_score(y_test, y_pred_boosting, average='binary')
    f1_boosting = f1_score(y_test, y_pred_boosting, average='binary')

    # Print Boosting model evaluation metrics
    print(f"\n{model_name} Boosting Model Evaluation Metrics:")
    print(f"Boosting Accuracy: {accuracy_boosting:.4f}")
    print(f"Boosting Precision: {precision_boosting:.4f}")
    print(f"Boosting Recall: {recall_boosting:.4f}")
    print(f"Boosting F1 Score: {f1_boosting:.4f}")



In [16]:
# List of base models to evaluate with AdaBoost
base_models_to_boost = [
    (SVC(probability=True), 'SVM'),
    (GaussianNB(), 'Naive Bayes'),
    (LogisticRegression(), 'Logistic Regression'),
    # (MLPClassifier(max_iter=1000), 'ANN'),
]

In [17]:
# Evaluate each model with AdaBoost
for model_class, model_name in base_models_to_boost:
    # Create an instance of the model
    # model_instance = model_class()  # Instantiate the model here
    evaluate_boosting_model(model_class, model_name)




SVM Boosting Model Evaluation Metrics:
Boosting Accuracy: 0.6667
Boosting Precision: 0.7419
Boosting Recall: 0.6970
Boosting F1 Score: 0.7188

Naive Bayes Boosting Model Evaluation Metrics:
Boosting Accuracy: 0.4074
Boosting Precision: 0.5455
Boosting Recall: 0.1818
Boosting F1 Score: 0.2727

Logistic Regression Boosting Model Evaluation Metrics:
Boosting Accuracy: 0.7407
Boosting Precision: 0.7879
Boosting Recall: 0.7879
Boosting F1 Score: 0.7879


