In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


In [3]:
# Example: Load a classification dataset
# `load_breast_cancer` is imported in the notebook's import cell together with
# the other dependencies, so every requirement is visible in one place.
data = load_breast_cancer()
X, y = data.data, data.target

# Train-test split: hold out 20% of the rows; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [4]:
# Baseline candidates, keyed by the display name used in the printed reports.
# The random forest draws random bootstrap samples and feature subsets, so its
# seed is pinned (same value as the train/test split) to make a fresh
# Restart & Run All reproduce the same scores.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(eval_metric='logloss')  # eval_metric needed for newer versions
}

# Printed label -> scoring function; every function is called as fn(y_true, y_pred).
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

# Fit each candidate on the scaled training data and report its metrics on the
# held-out test set. The fitted estimators stay in `models` for later cells.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n{name} Performance:")
    for metric_label, metric_fn in metric_fns.items():
        print(f"{metric_label}:", metric_fn(y_test, y_pred))



Logistic Regression Performance:
Accuracy: 0.9736842105263158
Precision: 0.9722222222222222
Recall: 0.9859154929577465
F1-Score: 0.9790209790209791

Random Forest Performance:
Accuracy: 0.9649122807017544
Precision: 0.958904109589041
Recall: 0.9859154929577465
F1-Score: 0.9722222222222222

SVM Performance:
Accuracy: 0.9824561403508771
Precision: 0.9726027397260274
Recall: 1.0
F1-Score: 0.9861111111111112

XGBoost Performance:
Accuracy: 0.956140350877193
Precision: 0.9583333333333334
Recall: 0.971830985915493
F1-Score: 0.965034965034965


In [5]:
# Example: Tune Random Forest
# Exhaustive grid: 2 * 3 * 2 = 12 parameter combinations.
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# The forest is seeded so that differences between grid points come from the
# hyperparameters rather than from run-to-run sampling noise, and so that
# best_params_ / best_score_ are the same on every re-run.
# Each combination is scored by 5-fold cross-validated F1 on the training data.
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='f1')
grid_search_rf.fit(X_train, y_train)

print("Best parameters (RF):", grid_search_rf.best_params_)
print("Best F1 Score (RF):", grid_search_rf.best_score_)


Best parameters (RF): {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Best F1 Score (RF): 0.9722550677337137


In [6]:
# Example: Tune XGBoost
# Candidate values for each hyperparameter; the randomized search below samples
# combinations from these lists instead of trying all of them.
param_dist_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
)

# Evaluate 10 sampled combinations, each scored by 5-fold cross-validated F1 on
# the training data. random_state is passed to the search so the sampling is seeded.
random_search_xgb = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='logloss'),
    param_distributions=param_dist_xgb,
    n_iter=10,
    scoring='f1',
    cv=5,
    random_state=42,
)
random_search_xgb.fit(X_train, y_train)

print("Best parameters (XGB):", random_search_xgb.best_params_)
print("Best F1 Score (XGB):", random_search_xgb.best_score_)


Best parameters (XGB): {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}
Best F1 Score (XGB): 0.9792195937429515


In [7]:
# Evaluate tuned XGBoost
# Take the winning estimator from the randomized search and predict the
# held-out test rows with it; `best_xgb` is reused by the summary cell.
best_xgb = random_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Per-class precision / recall / F1 plus the averaged rows.
print("\nTuned XGBoost Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))



Tuned XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [8]:
def _metrics_row(label, estimator, X_eval, y_true):
    """Score one already-fitted classifier and return a summary-table row.

    Parameters
    ----------
    label : str
        Display name stored under the "Model" key.
    estimator : object
        Fitted classifier exposing ``predict``.
    X_eval : array-like
        Feature rows to predict on.
    y_true : array-like
        Ground-truth labels for ``X_eval``.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "Precision", "Recall" and "F1-Score".
    """
    # Predict once and reuse the result for all four metrics.
    y_hat = estimator.predict(X_eval)
    return {
        "Model": label,
        "Accuracy": accuracy_score(y_true, y_hat),
        "Precision": precision_score(y_true, y_hat),
        "Recall": recall_score(y_true, y_hat),
        "F1-Score": f1_score(y_true, y_hat),
    }


# Baseline models first, then the tuned estimators, so the row index of every
# model in the resulting frame is the same as before.
candidates = [
    *models.items(),
    ("Tuned Random Forest", grid_search_rf.best_estimator_),
    ("Tuned XGBoost", best_xgb),
]

# One record per model, turned into a DataFrame in a single step.
summary = [_metrics_row(label, estimator, X_test, y_test) for label, estimator in candidates]

df_summary = pd.DataFrame(summary)
print("\nModel Comparison:")
print(df_summary.sort_values(by="F1-Score", ascending=False))



Model Comparison:
                 Model  Accuracy  Precision    Recall  F1-Score
2                  SVM  0.982456   0.972603  1.000000  0.986111
0  Logistic Regression  0.973684   0.972222  0.985915  0.979021
1        Random Forest  0.964912   0.958904  0.985915  0.972222
5        Tuned XGBoost  0.964912   0.958904  0.985915  0.972222
3              XGBoost  0.956140   0.958333  0.971831  0.965035
4  Tuned Random Forest  0.956140   0.958333  0.971831  0.965035
