In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# set the aesthetic style of the plots
# NOTE(review): no style name is passed here; confirm whether a specific
# seaborn style (e.g. 'whitegrid') was intended.
sns.set_style()

# filter warning messages
import warnings
warnings.filterwarnings('ignore')


# Lift pandas' display limits so printed DataFrames (such as the
# feature-importance tables further down) are shown in full.
# Passing None means: unlimited rows, unlimited columns, auto-detected
# overall width, and unlimited column width, respectively.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)


In [None]:
# Load the clustered dataset. Later cells expect it to contain the 'Cluster'
# column (used as the target) and a 'target_default' column (dropped from the
# features). Presumably written by an earlier clustering step — confirm.
X_processed = pd.read_csv('saved/clustered_data.csv')


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

def apply_smote(df, target_column, random_state=42):
    """Oversample the minority classes of ``df`` with SMOTE.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column.
    target_column : str, or list/tuple holding exactly one column name
        Name of the target column. A one-element list/tuple (e.g.
        ``['Cluster']``) is accepted for backward compatibility and is
        unwrapped to the plain column name.
    random_state : int, default 42
        Seed forwarded to ``SMOTE``.

    Returns
    -------
    pandas.DataFrame
        The resampled feature columns with the resampled target appended
        as the last column, under its original name.

    Raises
    ------
    ValueError
        If ``target_column`` is a list/tuple that does not contain exactly
        one column name.
    """
    # Unwrap a one-element list so that `y` below is a 1-D Series rather than
    # a single-column DataFrame, and so the target is written back with a
    # scalar key instead of a list key.
    if isinstance(target_column, (list, tuple)):
        if len(target_column) != 1:
            raise ValueError(
                "target_column must name exactly one column, got "
                f"{list(target_column)!r}"
            )
        target_column = target_column[0]

    # Separate features (X) and target (y)
    X = df.drop(columns=target_column)
    y = df[target_column]

    # Apply SMOTE
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Re-assemble a single DataFrame: features first, target last
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[target_column] = y_resampled

    return df_resampled

# Balance the 'Cluster' classes. The target is passed as a plain column name
# (not a one-element list) so the target handed to SMOTE is a 1-D Series.
# NOTE(review): this resamples the whole dataset before the train/test split
# performed later in this notebook, so synthetic rows interpolated from rows
# that end up in the test set can land in the training set and inflate the
# test metrics. Consider applying SMOTE to the training split only.
X_processed = apply_smote(X_processed, 'Cluster')

In [None]:
from sklearn.feature_selection import f_classif

# Feature matrix: everything except the clustering target and 'target_default'.
X = X_processed.drop(columns=['Cluster', "target_default"])
y = X_processed['Cluster']

# Sanity check that the target did not leak into the features. The first
# label previously said "df_normalized" although the frame being checked is
# X_processed.
print("Cluster in X_processed:", 'Cluster' in X_processed.columns)  # Should be True
print("Cluster in X:", 'Cluster' in X.columns)  # Should be False

# Univariate F-test of each feature against the cluster label.
# NOTE(review): this runs on the full (resampled) dataset, i.e. before the
# train/test split made later in the notebook.
f_scores, p_values = f_classif(X, y)

# Create a DataFrame to display results
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'F-Score': f_scores,
    'P-Value': p_values
})

# Sort by F-Score (higher F-Score means more important)
feature_importance = feature_importance.sort_values(by='F-Score', ascending=False)

print(feature_importance)

In [None]:
# Disabled experiment: drop the score-bin columns and (judging by their
# names) their pairwise interaction terms. Left commented out, so X is
# unchanged by this cell.
# X = X.drop(columns=["score_2_bin_x_fraud_score_bin", 
#                 "score_1_bin_x_fraud_score_bin",
#                 "score_1_bin_x_score_2_bin",
#                 "score_2_bin",
#                 "score_1_bin"])
# Number of distinct values per feature, sorted ascending, so columns with
# the fewest distinct values are listed first.
X.nunique().sort_values()

In [None]:
# Summary of the feature matrix: column dtypes, non-null counts, memory usage.
X.info()

In [None]:

# Hold out 10% of the rows for testing, stratified on the cluster label.
# A fixed seed (the same value apply_smote uses by default) makes the split,
# and therefore every metric below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, stratify=y, random_state=42
)
print(len(X_train), len(X_test))

# Sanity checks on the training split
print("NaN in X_train:", X_train.isna().sum().sum())
print("NaN in y_train:", y_train.isna().sum())
# This check previously inspected y_train while reporting on X; it now looks
# at the training features. The cast to float lets bool/int columns go
# through np.isinf.
print("Infinite values in X:", np.isinf(X_train.values.astype(float)).sum())

In [None]:
# No resampling happens in this cell: the "_rus" names are plain aliases of
# the training split.
X_train_rus = X_train
y_train_rus = y_train

# Univariate F-test of every feature against the target, training split only
f_scores, p_values = f_classif(X_train_rus, y_train_rus)

# Tabulate the per-feature statistics
yo = pd.DataFrame(
    {
        'Feature': X_train_rus.columns,
        'F-Score': f_scores,
        'P-Value': p_values,
    }
)

# Highest F-Score first (higher F-Score means more important)
feature_importance = yo.sort_values('F-Score', ascending=False)

# Show the ranked table
print(feature_importance)

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# Apply PCA
# n_components is capped at min(n_samples, n_features), the largest value a
# full PCA supports, so the cell also works when there are fewer rows than
# columns.
# NOTE(review): no scaling is applied in this cell, so the leading components
# are driven by whichever columns have the largest raw variance — confirm the
# features were standardized upstream.
pca = PCA(n_components=min(X_train_rus.shape))
pca.fit(X_train_rus)

# Get explained variance ratio (as a percentage)
explained_variance = pca.explained_variance_ratio_ * 100

# Create DataFrame. Each ratio belongs to a principal component (a linear
# combination of all features), not to an individual input column, so rows
# are labelled PC1, PC2, ... instead of with the original feature names.
component_labels = [f'PC{i}' for i in range(1, len(explained_variance) + 1)]
variance_df = pd.DataFrame({
    'Component': component_labels,
    'Explained Variance (%)': explained_variance,
})
variance_df = variance_df.sort_values(by='Explained Variance (%)', ascending=False)

print(variance_df)


In [None]:
# final XGBoost model
# Active hyperparameters are collected in one dict; the commented-out entries
# are overrides that are currently disabled, so the library defaults apply
# for them.
xgb_params = {
    'max_depth': 5,
    'learning_rate': 0.01,
    'n_estimators': 200,
    'gamma': 1,
    'min_child_weight': 6,
    # 'subsample': 0.8,
    # 'colsample_bytree': 0.8,
    # 'reg_lambda': 1,
    # 'reg_alpha': 0.1,
}
xgb = XGBClassifier(**xgb_params)

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# Disabled alternatives to the XGBoost model. Uncommenting either one rebinds
# the name `xgb` to a different estimator, which the rest of this cell would
# then fit and evaluate.
# xgb = RandomForestClassifier(
#     n_estimators=200,  # Number of trees in the forest
#     max_depth=8,        # Maximum depth of the trees
#     #... other hyperparameters (e.g., min_samples_split, min_samples_leaf, etc.)...
#     random_state=42,     # For reproducibility
# )

# xgb = AdaBoostClassifier(n_estimators=100, learning_rate=0.01, algorithm='SAMME', random_state=42)  # Adjust parameters
# Train on the training split (the "_rus" names alias X_train / y_train).
xgb.fit(X_train_rus, y_train_rus)
# prediction on the held-out split
# (no scaling is applied; the scaler.transform(X_test) step is disabled)
X_test_xgb = X_test
print(X_test_xgb.head(10))
y_pred_xgb = xgb.predict(X_test_xgb)

# classification report
report_text = classification_report(y_test, y_pred_xgb)
print(report_text)

# confusion matrix, normalized over the true labels (each row sums to 1)
cm_normalized = confusion_matrix(y_test, y_pred_xgb, normalize='true')
fig, ax = plt.subplots()
sns.heatmap(cm_normalized, annot=True, ax=ax)
ax.set(
    title='Confusion Matrix - XGBoost test',
    xlabel='Predicted Value',
    ylabel='Real Value',
)

plt.show()





from sklearn.preprocessing import label_binarize
# One-vs-rest ROC analysis on the held-out split.
# Probabilities get their own name so the label predictions stored in
# y_pred_xgb are not overwritten by an array of a different shape/meaning.
y_proba_xgb = xgb.predict_proba(X_test_xgb)

# Take the class labels from the fitted model instead of assuming they are
# 0..n-1; column j of predict_proba is matched to classes[j].
classes = np.asarray(xgb.classes_)
n_classes = len(classes)

# One indicator column per class. Built by direct comparison so the matrix
# always has n_classes columns, including the two-class case where
# label_binarize would return a single column and the per-class indexing
# below would fail.
y_test_binarized = (np.asarray(y_test)[:, None] == classes[None, :]).astype(int)

# Macro-averaged one-vs-rest AUC over all classes
auc_score = roc_auc_score(y_test_binarized, y_proba_xgb)
print("AUC Score:", auc_score)

# ROC curve and AUC for each class, keyed by the class' column index
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_proba_xgb[:, i])
    roc_auc[i] = roc_auc_score(y_test_binarized[:, i], y_proba_xgb[:, i])

# Plot ROC curves
plt.figure(figsize=(8, 6))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {classes[i]} (AUC = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve - XGBoost (Multiclass)')
plt.legend()
plt.show()

In [None]:
import pickle

# Save the fitted model (the object bound to `xgb`) to disk with pickle.
# The file is opened for binary writing, so an existing file at this path is
# overwritten. Assumes the 'saved/' directory already exists — TODO confirm.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
with open('saved/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb, f)