In [None]:
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the monkeypox CSV (path is relative to the current working directory) into
# a DataFrame; the feature and target columns used below are selected from it.
data = pd.read_csv('monkeypox_cleaned.csv')

In [None]:
# Predictor columns fed to every classifier in this notebook.
feature_columns = [
    'Systemic Illness', 'Rectal Pain', 'Sore Throat', 'Penile Oedema', 'Oral Lesions',
    'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection', 'Red blood cells', 'White blood cells',
    'Age', 'Sexually Transmitted Infection',
]

# Take an explicit copy: columns of X are overwritten during label encoding, and
# assigning into a frame that was sliced out of `data` raises pandas'
# SettingWithCopyWarning. The copy also leaves `data` itself unmodified.
X = data[feature_columns].copy()

# Target label column.
y = data['MPOX']

In [None]:
# Single LabelEncoder instance; the encoding loop re-fits it for each categorical column.
le = LabelEncoder()

In [None]:
# Columns that get label-encoded. 'Red blood cells', 'White blood cells' and 'Age'
# are not listed here and are therefore left as they are.
categorical_features = [
    'Systemic Illness',
    'Rectal Pain',
    'Sore Throat',
    'Penile Oedema',
    'Oral Lesions',
    'Solitary Lesion',
    'Swollen Tonsils',
    'HIV Infection',
    'Sexually Transmitted Infection',
]

# Replace each listed column with its integer codes. `le` is re-fitted on every
# pass, so after the loop it only holds the mapping of the last column.
for column in categorical_features:
    encoded_values = le.fit_transform(X[column])
    X[column] = encoded_values

In [None]:
# Split configuration: 20% of the rows are held out for testing, and the seed is
# reused by the seeded estimators further down.
test_size = 0.2
random_state = 42

# Stratify on y so both splits keep the class proportions of the full target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=random_state,
    test_size=test_size,
)

In [None]:
# Show the predictor columns, one header line followed by the list itself.
feature_names = list(X.columns)
print("Feature names used for building the classification models:", feature_names, sep="\n")

In [None]:
# Report the dimensions of each piece of the split (label text paired with its array).
split_shapes = (
    ("\nShape of the training set (X_train):", X_train.shape),
    ("Shape of the training set (y_train):", y_train.shape),
    ("Shape of the test set (X_test):", X_test.shape),
    ("Shape of the test set (y_test):", y_test.shape),
)
for caption, shape in split_shapes:
    print(caption, shape)

In [None]:
# Build and train the five candidate classifiers on the training split.
# `fit` returns the estimator itself, so each name is bound to a fitted model.
# NOTE(review): features are passed unscaled; KNN, the RBF SVM and logistic
# regression are usually sensitive to feature scale — confirm this is intended.
lr_model = LogisticRegression(random_state=random_state).fit(X_train, y_train)
dt_model = DecisionTreeClassifier(random_state=random_state).fit(X_train, y_train)
knn_model = KNeighborsClassifier().fit(X_train, y_train)
svm_model = SVC(kernel='rbf', random_state=random_state).fit(X_train, y_train)
nb_model = GaussianNB().fit(X_train, y_train)

# Collected in a fixed order so the evaluation cells can iterate over them.
models = [lr_model, dt_model, knn_model, svm_model, nb_model]

In [None]:
# Display name -> fitted estimator, in the order used for reporting.
named_models = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
    'K-Nearest Neighbors': knn_model,
    'Support Vector Machine (RBF)': svm_model,
    'Naive Bayes': nb_model,
}

# Parallel lists consumed by the evaluation loops (dicts preserve insertion order).
models = list(named_models.values())
model_names = list(named_models.keys())

In [None]:
# Print the test-set confusion matrix of every candidate model, each followed by
# a blank line.
for name, model in zip(model_names, models):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {name}:", cm, "", sep="\n")

In [None]:
# Report recall, precision and F1 on the test split for each candidate model,
# treating the 'Positive' label as the positive class.
for name, model in zip(model_names, models):
    y_pred = model.predict(X_test)

    recall = recall_score(y_test, y_pred, pos_label='Positive')
    precision = precision_score(y_test, y_pred, pos_label='Positive')
    f1 = f1_score(y_test, y_pred, pos_label='Positive')

    # One line per item, with an empty last item for the trailing blank line.
    print(
        f"{name}:",
        f"Recall: {recall:.2f}",
        f"Precision: {precision:.2f}",
        f"F1-score: {f1:.2f}",
        "",
        sep="\n",
    )

Decision Tree selected as the model

Define the hyperparameter grid for Decision Tree

In [None]:
# Search space for the decision tree: 4 * 3 * 3 * 2 = 72 parameter combinations.
param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

Create the GridSearchCV object

In [None]:
# The target uses string labels (the metric cells pass pos_label='Positive'), but
# the built-in 'f1' scorer assumes the positive class is labelled 1. With string
# labels it cannot score any fold, so the search would not rank candidates
# meaningfully. Build an F1 scorer that names the positive class explicitly.
positive_f1_scorer = make_scorer(f1_score, pos_label='Positive')

# 5-fold cross-validated search over param_grid, ranked by F1 of the 'Positive' class.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=5,
    scoring=positive_f1_scorer,
)

Perform the grid search

In [None]:
# Run the cross-validated search on the training split only; the held-out test
# split is not passed in here.
grid_search.fit(X_train, y_train)

Get the best hyperparameters

In [None]:
# Parameter combination selected by the grid search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Train the Decision Tree model with the best hyperparameters

In [None]:
# Re-create the tree with the tuned hyperparameters. best_params only carries the
# searched keys (max_depth, min_samples_split, min_samples_leaf, criterion), so the
# seed is passed again; without it the final tree would be unseeded, unlike the
# estimator that was tuned, and could differ between runs.
best_dt_model = DecisionTreeClassifier(random_state=random_state, **best_params)
best_dt_model.fit(X_train, y_train)

Make predictions on the test set using the best model

In [None]:
# Predict labels for the held-out test split with the tuned decision tree.
y_pred_best = best_dt_model.predict(X_test)

Print the test confusion matrix

In [None]:
# Confusion matrix of the tuned tree on the test split, printed under its caption.
cm_best = confusion_matrix(y_test, y_pred_best)
print("Test Confusion Matrix:", cm_best, sep="\n")

Calculate the new scores for the selected metrics

In [None]:
# Test-set metrics of the tuned tree, with 'Positive' as the positive class.
recall_best = recall_score(y_true=y_test, y_pred=y_pred_best, pos_label='Positive')
precision_best = precision_score(y_true=y_test, y_pred=y_pred_best, pos_label='Positive')
f1_best = f1_score(y_true=y_test, y_pred=y_pred_best, pos_label='Positive')

# Header line, then one "label value" line per metric (unrounded).
print("New Scores for the Best Decision Tree Model:")
for caption, score in (
    ("Recall:", recall_best),
    ("Precision:", precision_best),
    ("F1-score:", f1_best),
):
    print(caption, score)

Create the base learners

In [None]:
# Fresh, unfitted base learners for the voting ensemble. They reuse the notebook's
# `random_state` setting instead of repeating the literal seed, so changing the
# seed in one place keeps every model consistent.
# Note: this rebinds dt_model / svm_model; the earlier `models` list still holds
# the previously fitted estimators.
dt_model = DecisionTreeClassifier(random_state=random_state)
svm_model = SVC(kernel='rbf', random_state=random_state)

Create the voting ensemble classifier

In [None]:
# Hard-voting ensemble over the decision tree and the RBF SVM.
# NOTE(review): with only two hard voters every disagreement is a tie; how
# scikit-learn resolves ties (reportedly by class-label sort order) decides those
# predictions — confirm this is the intended behaviour.
base_learners = [
    ('dt', dt_model),
    ('svm', svm_model),
]
voting_model = VotingClassifier(estimators=base_learners, voting='hard')

Train the voting ensemble classifier

In [None]:
# Fit the voting ensemble on the training split.
voting_model.fit(X_train, y_train)

Make predictions on the test set using the voting ensemble classifier

In [None]:
# Hard-vote predictions of the ensemble for the held-out test split.
y_pred_voting = voting_model.predict(X_test)

Print the test confusion matrix for the voting ensemble classifier

In [None]:
# Confusion matrix of the ensemble on the test split, printed under its caption.
cm_voting = confusion_matrix(y_test, y_pred_voting)
print("Test Confusion Matrix for the Voting Ensemble Classifier:", cm_voting, sep="\n")

Print the test confusion matrices for the individual base learners

In [None]:
# Fit each base learner on the training split and print its test confusion
# matrix, for comparison with the ensemble.
base_learner_reports = (
    ("Test Confusion Matrix for the Decision Tree (DT) Model:", dt_model),
    ("Test Confusion Matrix for the Support Vector Machine with RBF Kernel (SVM-RBF) Model:", svm_model),
)
for caption, learner in base_learner_reports:
    print(caption)
    learner_pred = learner.fit(X_train, y_train).predict(X_test)
    print(confusion_matrix(y_test, learner_pred))