In [None]:
# model_training.ipynb

# Import necessary libraries
import logging
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
import yaml
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Load configuration
# The encoding is pinned so the YAML file is decoded the same way on every
# platform instead of following the locale's default text encoding.
with open("../config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Configure logger: INFO-level basic configuration plus the named logger
# used for the progress messages in the rest of this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Model Training")

# Load the cleaned data from the CSV path given in the configuration
cleaned_data_path = config["data"]["cleaned_data_path"]
data = pd.read_csv(cleaned_data_path)
logger.info(f"Cleaned data loaded successfully from {cleaned_data_path}")

# Split the data into training and test sets (80/20, fixed seed)
target_column = config["data"]["target_column"]
X = data.drop(columns=[target_column])
y = data[target_column]
# Stratify on the target so the train and test partitions keep the class
# proportions of the full dataset; an unstratified split can leave the test
# set with a skewed (or missing) minority class and distort the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
logger.info("Data split into training and test sets")

# Base estimator: binary logistic objective with AUC as the evaluation metric.
# NOTE(review): newer XGBoost releases reportedly deprecate
# `use_label_encoder` - confirm against the installed version before
# removing it.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    use_label_encoder=False,
)
logger.info("XGBoost model initialized")

# Search space: three candidate values for each of five hyperparameters
# (3**5 = 243 combinations).
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

# Grid search over the space above: 5-fold cross-validation on the training
# partition, candidates ranked by ROC AUC.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    verbose=2,
)
grid_search.fit(X_train, y_train)
logger.info("Grid search completed for hyperparameter tuning")

# Keep the estimator that achieved the best cross-validated score
best_model = grid_search.best_estimator_
logger.info(f"Best model parameters: {grid_search.best_params_}")

# Score the tuned model on the held-out test set.
# The second predict_proba column feeds the AUC (presumably the positive
# class for 0/1 labels - verify against best_model.classes_); the hard
# predictions feed accuracy, the confusion matrix and the report.
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the headline metrics, then the per-class breakdown
logger.info(f"Model Accuracy: {accuracy:.4f}")
logger.info(f"Model AUC Score: {auc:.4f}")
logger.info(f"Classification Report:\n{report}")

# Render the confusion matrix as an annotated heatmap with integer cell labels
plt.figure(figsize=(8, 6))
cm_axes = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
# Label the heatmap through the Axes object that seaborn returns
cm_axes.set_title("Confusion Matrix")
cm_axes.set_xlabel("Predicted")
cm_axes.set_ylabel("Actual")
plt.show()

# Plot the ten most important features of the best model, ranked by gain
importance_axes = xgb.plot_importance(
    best_model,
    importance_type="gain",
    max_num_features=10,
)
importance_axes.set_title("Top 10 Feature Importances by Gain")
plt.show()

# Save the trained model to the path given in the configuration
model_path = config["model"]["path"]
# Create the destination directory first: without it joblib.dump raises
# FileNotFoundError and the result of the whole grid search is lost.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
logger.info(f"Trained model saved to {model_path}")
