In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from includes.constants import FN_UNIMPORTANT_FEATURES, FN_CPP_MEMORY_FEATURES, FN_TARGET_FEATURE, FN_EXPLICIT_EXCLUDE_FEATURES
from includes.helpers import get_function_train_test_set
from includes.constants import GLOBAL_RANDOM_STATE

In [None]:
# Fetch the four train/test partitions from the project helper
# (presumably feature frames X_* and target vectors y_* -- see includes.helpers).
(
    X_train,
    X_test,
    y_train,
    y_test,
) = get_function_train_test_set()

In [None]:
# Step 4: Feature scaling (currently disabled)
# No scaler is applied: the "*_scaled" names are plain aliases of the raw
# splits, kept so the cells below can use them unchanged.
# To re-enable scaling, import sklearn.preprocessing.StandardScaler, fit it on
# the training split only, and transform both splits:
#   scaler = StandardScaler()
#   X_train_scaled = scaler.fit_transform(X_train)
#   X_test_scaled = scaler.transform(X_test)
X_train_scaled, X_test_scaled = X_train, X_test

In [None]:
# Final classifier configuration. Every tuned value below is one of the
# candidates in the grid-search cell of this notebook (presumably the best
# combination it reported -- confirm against the grid-search output).
# Best Cross-validation Accuracy: 0.6840
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=GLOBAL_RANDOM_STATE,
    n_estimators=400,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
)
model = xgb.XGBClassifier(**xgb_params)

# Train on the (unscaled) training split; as the last expression of the cell,
# the fitted estimator is also what the notebook displays.
model.fit(X_train_scaled, y_train)


In [None]:
# Hard class labels for the held-out split (input to the threshold-based metrics).
y_pred = model.predict(X_test_scaled)

# Per-class probabilities; column index 1 is kept as the score that feeds the
# ranking-based metrics and curves (AUC-ROC, PR curve, calibration).
class_probabilities = model.predict_proba(X_test_scaled)
y_proba = class_probabilities[:, 1]


In [None]:
# Label-based scores are computed from the hard predictions (y_pred);
# AUC-ROC is computed from the predicted probabilities (y_proba).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)

# Report every scalar metric with four decimals, in a fixed order.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", auc_roc),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Raw confusion-matrix counts; `conf_matrix` is reused by the heatmap cell.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)



In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# False/true positive rates across the decision thresholds of y_proba.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'AUC = {auc_roc:.4f}')  # auc_roc comes from the metrics cell
ax.plot([0, 1], [0, 1], 'k--', label="Random Guess")  # chance-level diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()


In [None]:
import seaborn as sns

# Same tick labels on both axes: rows are actual classes, columns are predictions.
class_labels = ['Class 0', 'Class 1']

# sns.heatmap returns the Axes it drew on, so it can be labelled directly.
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # integer counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve

# Compute the precision-recall curve over all decision thresholds of y_proba.
# The arrays are stored under pr_* names so that the scalar `precision` and
# `recall` scores from the metrics cell and the ROC `thresholds` array are not
# silently replaced by objects of a different kind.
pr_precision, pr_recall, pr_thresholds = precision_recall_curve(y_test, y_proba)

# Plot precision (y) against recall (x)
plt.figure(figsize=(8, 6))
plt.plot(pr_recall, pr_precision, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()


In [None]:
import numpy as np

# Importance score per input feature, as exposed by the fitted model.
importance = model.feature_importances_

# Feature positions ordered from highest to lowest importance.
sorted_idx = np.argsort(importance)[::-1]

bar_positions = range(len(importance))
feature_labels = list(X_train.columns[sorted_idx])

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(bar_positions, importance[sorted_idx], align='center', color='blue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_labels)
ax.set_title('Feature Importance (XGBoost)')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
fig.tight_layout()
ax.invert_yaxis()  # bar 0 (the most important feature) ends up at the top
plt.show()


In [None]:
from sklearn.model_selection import learning_curve

# Refit the configured model on growing subsets of the training data with
# 5-fold cross-validation, scoring each fit by accuracy.
train_sizes, train_scores, test_scores = learning_curve(
    model,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Aggregate over the CV folds (axis 1): one mean/std per training-set size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# One line plus a +/- 1 std band per curve.
fig, ax = plt.subplots(figsize=(8, 6))
for curve_label, curve_color, curve_mean, curve_std in (
    ('Training Accuracy', 'blue', train_mean, train_std),
    ('Validation Accuracy', 'green', test_mean, test_std),
):
    ax.plot(train_sizes, curve_mean, label=curve_label, color=curve_color)
    ax.fill_between(
        train_sizes,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=curve_color,
        alpha=0.2,
    )
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Accuracy')
ax.set_title('Learning Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.calibration import calibration_curve

# Bin the predicted probabilities into 10 bins and compare, per bin, the mean
# prediction with the observed fraction of positive labels.
prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(prob_pred, prob_true, marker='o', label='Calibration Curve')
ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')  # identity reference line
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Calibration Curve')
ax.legend(loc='upper left')
plt.show()


In [None]:

# Distribution of the predicted probabilities, split by the true label.
# Both histograms use the same 50 equal-width bins spanning [0, 1]. With a bare
# `bins=50`, each call would derive its own bin edges from the min/max of its
# subset, so the bar widths and counts of the two classes would not be
# directly comparable.
shared_bins = dict(bins=50, range=(0, 1))

plt.figure(figsize=(8, 6))
plt.hist(y_proba[y_test == 0], alpha=0.6, label='Class 0', color='blue', **shared_bins)
plt.hist(y_proba[y_test == 1], alpha=0.6, label='Class 1', color='red', **shared_bins)
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Probabilities')
plt.legend(loc='upper center')
plt.show()


In [None]:
# Residual = predicted probability minus the true label, one value per test sample.
residuals = y_proba - y_test

# Scatter the residuals against their position in the test split.
sample_positions = range(len(residuals))

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(sample_positions, residuals, alpha=0.6, color='purple')
ax.axhline(0, color='black', linestyle='--')  # zero-error reference line
ax.set_xlabel('Sample Index')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Hyper-parameter optimisation for the XGBoost classifier

from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 3 * 3 * 3 * 2 * 2 = 108 candidates, each fitted on 5 folds.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 10],
    'n_estimators': [100, 200, 400],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Base estimator configured like the final model (same objective, eval metric
# and seed) so the search results carry over to it.
base_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=GLOBAL_RANDOM_STATE,
    # Parallelism is handled by GridSearchCV (n_jobs=-1 below). Keeping each
    # XGBoost fit single-threaded avoids thread contention between the two levels.
    n_jobs=1,
)

grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='accuracy',  # Or use other metrics like 'roc_auc'
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1  # Run candidate fits in parallel on all cores
)

grid_search.fit(X_train_scaled, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)
