In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
from joblib import dump

# Load train and test datasets
X_train = pd.read_csv('/content/X_train.csv')
X_test = pd.read_csv('/content/X_test.csv')
y_train = pd.read_csv('/content/Y_train.csv').values.ravel()
y_test = pd.read_csv('/content/Y_test.csv').values.ravel()

# Define the pipeline with KNNImputer, StandardScaler, ADASYN, and XGBoost Classifier
pipeline = ImbPipeline([
    ('imputer', KNNImputer(n_neighbors=5)),     # Impute missing values with KNN
    ('scaler', StandardScaler()),               # Standardize features
    ('adasyn', ADASYN(random_state=42)),        # ADASYN oversampling
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))  # XGBoost Classifier
])

# Define the parameter grid for XGBoost
param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 4, 5],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0],
    'xgb__gamma': [0, 0.1, 0.2],
    'xgb__scale_pos_weight': [1, 2]
}

# Scoring metrics for evaluation
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Setup StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Apply GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=cv,
    n_jobs=-1
)

# Fit the model on training data
grid_search.fit(X_train, y_train)

# Get the best parameters
print(f"Best Parameters: {grid_search.best_params_}")

# Predict on the test set using the best estimator
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Display evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Generate and display the classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Save the best model to a file using joblib
dump(grid_search.best_estimator_, 'xgb_model_grid.joblib')
print("Model saved as 'xgb_model_grid.joblib'")


Best Parameters: {'xgb__colsample_bytree': 1.0, 'xgb__gamma': 0.2, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 5, 'xgb__n_estimators': 200, 'xgb__scale_pos_weight': 1, 'xgb__subsample': 0.8}
Accuracy: 0.8892
Precision: 0.8961
Recall: 0.8892
F1 Score: 0.8905
Confusion Matrix:
[[1059  151]
 [  59  627]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.88      0.91      1210
           1       0.81      0.91      0.86       686

    accuracy                           0.89      1896
   macro avg       0.88      0.89      0.88      1896
weighted avg       0.90      0.89      0.89      1896

Model saved as 'xgb_model_grid.joblib'
