# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    auc,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split, validation_curve
from xgboost import XGBClassifier

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the name bound
    # where it still exists without breaking the import on newer releases.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Hold out 20% of the data for testing (X and y must be supplied by the user;
# the fixed seed keeps the split reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Base estimator with default settings; the search overrides the grid keys.
xgb_model = XGBClassifier()

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
best_params, best_accuracy = grid_search.best_params_, grid_search.best_score_

# Train a model with the best parameters.
# evals_result() is only populated when fit() receives an eval_set, so both
# splits are monitored here: they are reported as 'validation_0' (train) and
# 'validation_1' (test), in the order given. 'error' is requested explicitly
# because it is the metric the learning curve below reads.
# The test split is used for monitoring only; it does not influence training.
best_xgb_model = XGBClassifier(eval_metric='error', **best_params)
best_xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False,  # do not log the metric for every boosting round
)

# Make predictions with the best model on the training and testing data
y_train_pred = best_xgb_model.predict(X_train)
y_test_pred = best_xgb_model.predict(X_test)

# Calculate accuracy on the training and testing data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Plot the learning curve (classification error per boosting round)
eval_history = best_xgb_model.evals_result()
plt.figure(figsize=(10, 5))
plt.plot(eval_history['validation_0']['error'], label='Train')
plt.plot(eval_history['validation_1']['error'], label='Test')
plt.xlabel('Iterations')
plt.ylabel('Error')
plt.title('XGBoost Learning Curve')
plt.legend()
plt.show()

# Plot feature importance ('weight' = number of times a feature is used to split).
# plot_importance() opens its own figure when no axes is passed, which would
# leave a separately created plt.figure(figsize=...) blank and drop the
# requested size; drawing on explicit axes keeps everything in one figure.
_, importance_ax = plt.subplots(figsize=(10, 5))
xgb.plot_importance(best_xgb_model, importance_type='weight', ax=importance_ax)
importance_ax.set_title('XGBoost Feature Importance')
plt.show()

# Treat a training accuracy above the test accuracy as overfitting and, in that
# case, retrain with 80% of the tuned number of trees.
# NOTE(review): this fires on any train/test gap, however small, and it uses
# the test set to steer the model - consider a tolerance or a validation split.
if test_accuracy < train_accuracy:
    print("Warning: Overfitting detected. Reducing n_estimators.")
    # The 0.8 shrink factor is arbitrary and can be tuned.
    best_params.update(n_estimators=int(0.8 * best_params['n_estimators']))
    best_xgb_model = XGBClassifier(**best_params).fit(X_train, y_train)

# Re-score the (possibly retrained) model on the held-out data.
y_test_pred = best_xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)

# ROC curve: probability of the positive class (column 1) against the true labels.
y_prob = best_xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

# Precision-recall curve, computed from the same positive-class probabilities.
precision, recall, _ = precision_recall_curve(y_test, y_prob)

pr_fig, pr_ax = plt.subplots()
pr_ax.step(recall, precision, color='b', where='post')
# Shade the area under the stepped curve.
pr_ax.fill_between(recall, precision, step='post', alpha=0.2)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_ylim([0.0, 1.05])
pr_ax.set_xlim([0.0, 1.0])
pr_ax.set_title('Precision-Recall curve')
plt.show()

# Plot the confusion matrix heatmap for the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2; it takes the same
# arguments and returns the same kind of display object (with an `ax_` attribute).
confusion_matrix_plot = ConfusionMatrixDisplay.from_estimator(
    best_xgb_model,
    X_test,
    y_test,
    cmap=plt.cm.Blues,
    display_labels=["Class 0", "Class 1"],
)
confusion_matrix_plot.ax_.set_title('Confusion Matrix Heatmap')
plt.show()

# Validation curve: 5-fold CV accuracy as n_estimators goes from 100 to 500 in steps of 50.
param_range = np.arange(100, 501, 50)
train_scores, test_scores = validation_curve(
    best_xgb_model,
    X_train,
    y_train,
    param_name="n_estimators",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
)

# Aggregate across folds (axis 1) for each candidate value.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

plt.figure(figsize=(10, 5))
plt.title("Validation Curve with XGBoost")
plt.xlabel("n_estimators")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
# Each series is drawn as a mean line with a +/- one standard deviation band.
# NOTE(review): the x-axis is logarithmic although param_range is linearly
# spaced - confirm this is intended.
for curve_mean, curve_std, curve_label, curve_color in (
    (train_scores_mean, train_scores_std, "Training score", "darkorange"),
    (test_scores_mean, test_scores_std, "Cross-validation score", "navy"),
):
    plt.semilogx(param_range, curve_mean, label=curve_label, color=curve_color, lw=lw)
    plt.fill_between(
        param_range,
        curve_mean - curve_std,
        curve_mean + curve_std,
        alpha=0.2,
        color=curve_color,
        lw=lw,
    )
plt.legend(loc="best")
plt.show()

# Summary: tuned parameters, cross-validated accuracy from the grid search,
# and accuracy of the final model on the held-out test data.
for summary_label, summary_value in (
    ("Best Parameters:", best_params),
    ("Best Accuracy:", best_accuracy),
    ("Test Accuracy:", accuracy),
):
    print(summary_label, summary_value)
