In [1]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pickle
def loadfile(filename):
    with open(f'{filename}.pickle', 'rb') as fp:
        data = pickle.load(fp)
    return data

In [9]:
import numpy as np

X_train = loadfile('X_train')
X_test = loadfile('X_test')
X_val = loadfile('X_val')
y_train = loadfile('y_train')
y_test = loadfile('y_test')
y_val = loadfile('y_val')

In [70]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score

model = XGBClassifier()
model.fit(X_train, y_train)
predict = model.predict(X_val)
print(classification_report(y_val, predict))
print(f'BAS:{balanced_accuracy_score(y_val, predict)}')
fbeta_score(y_val, predict, average='weighted', beta=2)

              precision    recall  f1-score   support

           0       0.69      0.20      0.31      1600
           1       0.93      0.99      0.96     17688

    accuracy                           0.93     19288
   macro avg       0.81      0.60      0.63     19288
weighted avg       0.91      0.93      0.91     19288

BAS:0.595560408186341


0.917226616376693

In [73]:
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

# Assuming X_train and y_train are your training data and labels
# X_train.shape should be (n_samples, n_features)
# y_train.shape should be (n_samples,)

# Define the undersampling ratio
desired_ratio = {0: np.sum(y_train == 0), 1: int(4*np.sum(y_train == 0))}

# Random undersampling
undersampler = RandomUnderSampler(sampling_strategy=desired_ratio, random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

In [77]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score

model = XGBClassifier()
model.fit(X_train_resampled, y_train_resampled)
predict = model.predict(X_val)
print(classification_report(y_val, predict))
print(f'BAS:{balanced_accuracy_score(y_val, predict)}')
fbeta_score(y_val, predict, average='weighted', beta=2)

              precision    recall  f1-score   support

           0       0.42      0.40      0.41      1600
           1       0.95      0.95      0.95     17688

    accuracy                           0.90     19288
   macro avg       0.68      0.68      0.68     19288
weighted avg       0.90      0.90      0.90     19288

BAS:0.677085736092266
ROC: 0.677085736092266


0.9041871449095543

In [None]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()
param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [200, 300, 400],
    'max_depth': [6, 7, 8],
    'min_child_weight': [50, 100, 200, 300, 400],
    'reg_alpha': [0.1, 0.3, 0.5, 0.7, 1.0],
    'reg_lambda': [0.3, 0.5, 0.7, 1.0, 1.2, 1.4]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring= 'f1_micro')
grid_search.fit(X_train_resampled, y_train_resampled)
print(grid_search.best_params_)

In [75]:
model = XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=50, n_estimators=300, reg_alpha=1.0, reg_lambda=1.0) #param o tren kia
model.fit(X_train_resampled, y_train_resampled)
predict = model.predict(X_val)
print(classification_report(y_val, predict))
print(f'BAS:{balanced_accuracy_score(y_val, predict)}')

              precision    recall  f1-score   support

           0       0.45      0.40      0.42      1600
           1       0.95      0.96      0.95     17688

    accuracy                           0.91     19288
   macro avg       0.70      0.68      0.69     19288
weighted avg       0.91      0.91      0.91     19288

BAS:0.6766713308457711


In [80]:
#Oversampling: SMOTE
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.metrics import roc_auc_score

X_resampled, y_resampled = SMOTE(random_state=27).fit_resample(X_train_resampled, y_train_resampled)

clf_smote = XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=50, n_estimators=300, reg_alpha=1.0, reg_lambda=1.0).fit(X_resampled, y_resampled)
predict = clf_smote.predict(X_val)

print(classification_report(y_val, predict))
print(f'f_0.5: {fbeta_score(y_test, predict, beta=0.5)}')
print(f'f_2: {fbeta_score(y_test, predict, beta=2)}')
print(f'ROC: {roc_auc_score(y_val, predict)}')


              precision    recall  f1-score   support

           0       0.41      0.47      0.44      1600
           1       0.95      0.94      0.94     17688

    accuracy                           0.90     19288
   macro avg       0.68      0.71      0.69     19288
weighted avg       0.91      0.90      0.90     19288

f_0.5: 0.9171979616553553
f_2: 0.9065704476668022
ROC: 0.7056093113975577


In [63]:
#Oversampling: ADASYN

X_resampled, y_resampled = ADASYN(random_state=27).fit_resample(X_train_resampled, y_train_resampled)

clf_smote = XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=50, n_estimators=300, reg_alpha=1.0, reg_lambda=1.0).fit(X_resampled, y_resampled)
predict = clf_smote.predict(X_val)

print(classification_report(y_val, predict))
print(f'ROC: {roc_auc_score(y_val, predict)}')

              precision    recall  f1-score   support

           0       0.41      0.46      0.43      1600
           1       0.95      0.94      0.94     17688

    accuracy                           0.90     19288
   macro avg       0.68      0.70      0.69     19288
weighted avg       0.91      0.90      0.90     19288

ROC: 0.6977073439620081


In [64]:
#Oversampling: RandomOverSampling

X_resampled, y_resampled = RandomOverSampler(random_state=27).fit_resample(X_train_resampled, y_train_resampled)
clf_smote = XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=50, n_estimators=300, reg_alpha=1.0, reg_lambda=1.0).fit(X_resampled, y_resampled)
predict = clf_smote.predict(X_val)

print(classification_report(y_val, predict))
print(f'ROC: {roc_auc_score(y_val, predict)}')

              precision    recall  f1-score   support

           0       0.24      0.71      0.36      1600
           1       0.97      0.80      0.88     17688

    accuracy                           0.79     19288
   macro avg       0.61      0.75      0.62     19288
weighted avg       0.91      0.79      0.84     19288

ROC: 0.7545836160108549


In [65]:
#Phạt mô hình
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(
    class_weight={0:0.55, 1: 0.45},
    y=y_train_resampled
)

model_pen = XGBClassifier(learning_rate=0.1,
                        max_depth=6,
                        min_child_weight=50,
                        n_estimators=300,
                        reg_alpha=1.0,
                        reg_lambda=1.0)
model_pen.fit(X_train_resampled, y_train_resampled, sample_weight=sample_weights)
predict = model_pen.predict(X_val)

print(classification_report(y_val, predict))
print(f'ROC: {roc_auc_score(y_val, predict)}')

              precision    recall  f1-score   support

           0       0.41      0.44      0.43      1600
           1       0.95      0.94      0.95     17688

    accuracy                           0.90     19288
   macro avg       0.68      0.69      0.69     19288
weighted avg       0.90      0.90      0.90     19288

ROC: 0.6912768543645409


In [58]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
selector = SelectKBest(f_classif, k=4)
X_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_val_new = selector.transform(X_val)

TEST TRÊN TẬP TEST

In [79]:
#Oversampling: SMOTE
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.metrics import roc_auc_score

X_resampled, y_resampled = SMOTE(random_state=27).fit_resample(X_train_resampled, y_train_resampled)

clf_smote = XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=50, n_estimators=300, reg_alpha=1.0, reg_lambda=1.0).fit(X_resampled, y_resampled)
predict = clf_smote.predict(X_test)

print(classification_report(y_test, predict))
print(f'f_0.5: {fbeta_score(y_test, predict, beta=0.5)}')
print(f'f_2: {fbeta_score(y_test, predict, beta=2)}')
print(f'ROC: {roc_auc_score(y_test, predict)}')


              precision    recall  f1-score   support

           0       0.38      0.44      0.41      1510
           1       0.95      0.94      0.95     17778

    accuracy                           0.90     19288
   macro avg       0.67      0.69      0.68     19288
weighted avg       0.91      0.90      0.90     19288

f_0.5: 0.9491619857639916
f_2: 0.9415818801182124
ROC: 0.6890774295784878
