In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LassoCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, StackingClassifier, VotingClassifier
)
# **Đánh giá hiệu suất mô hình (Model Evaluation)**
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, ConfusionMatrixDisplay, log_loss, precision_recall_curve, auc,
    cohen_kappa_score, matthews_corrcoef
)
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder

import joblib
from google.colab import drive
drive.mount('/content/drive')
import os

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Absolute path of the training CSV on the mounted Google Drive.
# NOTE(review): hard-coded, Colab-specific path — consider a configurable data directory.
file_path = r"/content/drive/MyDrive/MINH THÀNH 21416C/1. Code/Code Server/Data/1_df_train.csv"

# `df` keeps the frame exactly as read; `df_train` is an independent working copy.
df = pd.read_csv( file_path)
df_train = df.copy()
# Preview the first 20 rows.
df_train.head(20)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0
5,5,15771669,Genovese,588,Germany,Male,36.0,4,131778.58,1,1.0,0.0,136024.31,1
6,6,15692819,Ch'ang,593,France,Female,30.0,8,144772.69,1,1.0,0.0,29792.11,0
7,7,15669611,Chukwuebuka,678,Spain,Male,37.0,1,138476.41,1,1.0,0.0,106851.6,0
8,8,15691707,Manna,676,France,Male,43.0,4,0.0,2,1.0,0.0,142917.13,0
9,9,15591721,Cattaneo,583,Germany,Male,40.0,4,81274.33,1,1.0,1.0,170843.07,0


- DataTypeTransformer

In [None]:
class DataTypeTransformer(BaseEstimator, TransformerMixin):
    """Cast selected columns to integer dtype.

    ``Age`` is rounded with ``Series.round(0)`` before the cast, while
    ``HasCrCard`` and ``IsActiveMember`` are cast directly. Any of these
    columns that is absent from the input is skipped. Nothing is learned
    from the data.
    """

    def fit(self, X, y=None):
        """Stateless: return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the known columns converted to ``int``."""
        converted = X.copy()
        # (column name, round before casting?)
        casting_plan = (
            ('Age', True),
            ('HasCrCard', False),
            ('IsActiveMember', False),
        )
        for column, round_first in casting_plan:
            if column not in converted.columns:
                continue
            values = converted[column]
            if round_first:
                values = values.round(0)
            converted[column] = values.astype(int)
        return converted

- FeatureEngineering (age binning)

In [None]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Replace the numeric ``Age`` column with a categorical ``Age_Group``.

    ``fit`` learns two cut points from the ``Age`` column it is given: the
    median and the 75th percentile. ``transform`` uses them to label every
    row as ``'Young'`` (age <= median), ``'Middle-aged'`` (median < age <=
    75th percentile) or ``'Older'`` (age > 75th percentile), then drops the
    original ``Age`` column.
    """

    def fit(self, X, y=None):
        """Learn the age cut points from ``X['Age']``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain an ``Age`` column.
        y : ignored

        Returns
        -------
        self
        """
        self.age_50th = X['Age'].quantile(0.50)
        self.age_75th = X['Age'].quantile(0.75)
        return self

    def transform(self, X):
        """Return a new frame with ``Age`` replaced by ``Age_Group``.

        The input frame is not modified.
        """
        # The outer edges are open-ended on purpose. Using the maximum age of
        # the frame being transformed as the top edge makes the bins depend on
        # the incoming batch, and pd.cut raises when that maximum is not above
        # the fitted 75th percentile (e.g. when scoring a single young
        # customer). A lower edge of 0 would also turn an age of 0 into NaN.
        age_bins = [-np.inf, self.age_50th, self.age_75th, np.inf]
        age_group = pd.cut(
            X['Age'],
            bins=age_bins,
            labels=['Young', 'Middle-aged', 'Older']
        )
        # drop() and assign() both return new frames, so no in-place edits.
        return X.drop(columns=['Age']).assign(Age_Group=age_group)


- ArrayToDataFrame and ColumnDropper

In [None]:
# Transformer: turn the array produced by the ColumnTransformer back into a DataFrame
class ArrayToDataFrame(BaseEstimator, TransformerMixin):
    """Wrap an array in a ``DataFrame`` with meaningful column names.

    Parameters
    ----------
    transformer : ColumnTransformer
        The column transformer that produced the array. It must already be
        fitted when ``fit`` is called here and must contain a transformer
        registered under the name ``'categorical'``.
    feature_names : list of str, optional
        Explicit column names. When given they are used as-is; otherwise the
        names are derived from ``transformer`` (see ``get_feature_names``).

    Attributes
    ----------
    feature_names_ : list of str
        Column names resolved during ``fit``.
    """

    def __init__(self, transformer, feature_names=None):
        self.transformer = transformer
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Resolve the output column names and store them in ``feature_names_``."""
        # Constructor parameters must not be overwritten in fit(); doing so
        # breaks get_params()/clone(). The resolved names therefore live in a
        # separate fitted attribute.
        if self.feature_names is not None:
            self.feature_names_ = list(self.feature_names)
        else:
            self.feature_names_ = self.get_feature_names(X)
        return self

    def transform(self, X):
        """Return ``X`` as a ``DataFrame`` labelled with the resolved names."""
        # Fall back to the constructor value so an instance created with
        # explicit names still works when transform() is called without fit().
        columns = getattr(self, 'feature_names_', self.feature_names)
        return pd.DataFrame(X, columns=columns)

    def get_feature_names(self, X):
        """Build the column names in the order the ColumnTransformer emits them.

        Only the one-hot names are read from the fitted transformer; the
        remaining names are hard-coded and must mirror the ColumnTransformer
        configuration (scaled numeric columns, the ordinal age group, then
        the pass-through columns).
        """
        # One-hot column names come from the fitted 'categorical' transformer
        categorical_names = self.transformer.named_transformers_['categorical'].get_feature_names_out()
        numerical_names = ['CreditScore', 'Balance', 'EstimatedSalary', 'Tenure']
        ordinal_names = ['Age_Group']
        passthrough_names = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']
        return list(categorical_names) + numerical_names + ordinal_names + passthrough_names

# Transformer: remove columns that are not needed
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop a fixed set of columns from a DataFrame.

    Names listed in ``columns_to_drop`` that are not present in the input
    are silently ignored.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Stateless: return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        unwanted = self.columns_to_drop
        return X.drop(labels=unwanted, axis=1, errors='ignore')

- Specify Feature Categories

In [None]:
# Columns to process
categorical_features = ['Gender', 'Geography']
standard_scaled_features = ['CreditScore', 'Balance', 'EstimatedSalary']
minmax_scaled_features = ['Tenure']
# Order matters: ArrayToDataFrame labels the scaled numeric columns in this order.
numerical_features = standard_scaled_features + minmax_scaled_features

# Column-wise preprocessing
column_transformer = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(drop='first'), categorical_features),
        ('numerical_standard', StandardScaler(), standard_scaled_features),
        # This step is named "minmax" but previously instantiated
        # StandardScaler; it now uses the scaler its name announces.
        ('numerical_minmax', MinMaxScaler(), minmax_scaled_features),
        ('ordinal_age_group', OrdinalEncoder(categories=[['Young', 'Middle-aged', 'Older']]), ['Age_Group'])
    ],
    remainder='passthrough'  # Keep the columns that are not listed above
)

- Build the Preprocessing Pipeline

In [None]:
# Preprocessing pipeline: dtype fixes -> age binning -> encoding/scaling
# -> back to a labelled DataFrame -> drop unneeded columns.
preprocessing_pipeline = Pipeline(steps=[
    # Convert data types
    ('data_type_transform', DataTypeTransformer()),
    # Create the age group
    ('feature_engineering', FeatureEngineering()),
    # Encode / scale the columns
    ('preprocess', column_transformer),
    # Convert back to a DataFrame; this step receives the same
    # ColumnTransformer instance as the step above and reads its fitted
    # one-hot column names.
    ('to_dataframe', ArrayToDataFrame(column_transformer)),
    # Remove the column that is not needed
    ('drop_columns', ColumnDropper(columns_to_drop=['Geography_Spain'])),
])

- Create the Full Pipeline

In [None]:
# Full pipeline: the custom preprocessing steps followed by a
# gradient-boosting classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    (
        'classifier',
        GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
        ),
    ),
])

- Prepare the Data

In [None]:
# Identifier columns carry no predictive signal and are removed up front.
drop_columns = ['id', 'CustomerId', 'Surname']
df_model = df.drop(columns=drop_columns, errors='ignore')

# Hold out 20% of the rows so that a test set distinct from the training set
# exists (df_test used to be an exact copy of the training data). The split
# is stratified on the target to keep the churn rate equal in both parts,
# and seeded for reproducibility.
df_train, df_test = train_test_split(
    df_model,
    test_size=0.2,
    stratify=df_model['Exited'],
    random_state=42,
)

# Features and target used to fit the pipeline
X = df_train.drop(columns=['Exited'])
y = df_train['Exited']

- Fit the Pipeline

In [None]:
# Fit the whole pipeline (preprocessing steps + classifier) on the
# training features X and labels y.
pipeline.fit(X, y)

In [None]:
# Print the installed scikit-learn version; it is worth recording next to the
# saved model because the pipeline is serialized further down.
import sklearn
print(sklearn.__version__)
# 1.6.1 — marked "old" by the author; presumably the version in use before
# this run (TODO confirm).

1.4.2


In [None]:
# !pip install scikit-learn==1.4.2
# NOTE(review): dill is installed without a version pin, and `!pip` runs in a
# subshell; `%pip install dill==<version>` would target the kernel's
# environment and make the run reproducible.
!pip install dill



In [None]:
import dill
import os

# Destination of the serialized pipeline on the mounted Drive
save_path = "/content/drive/MyDrive/MINH THÀNH 21416C/1. Code/Model/churn_prediction_pipeline.pkl"

# Make sure the target folder exists before writing
model_dir = os.path.dirname(save_path)
os.makedirs(model_dir, exist_ok=True)

# Serialize the pipeline with dill
with open(save_path, "wb") as model_file:
    dill.dump(pipeline, model_file)

print(f"Pipeline đã được lưu vào: {save_path}")


Pipeline đã được lưu vào: /content/drive/MyDrive/MINH THÀNH 21416C/1. Code/Model/churn_prediction_pipeline.pkl


- Using the Pipeline on New Data

In [None]:
import dill

# Location of the serialized pipeline
load_path = "/content/drive/MyDrive/MINH THÀNH 21416C/1. Code/Model/churn_prediction_pipeline.pkl"

# Restore the pipeline with dill.
# NOTE: unpickling executes code stored in the file — only load files you trust.
with open(load_path, "rb") as model_file:
    pipeline = dill.load(model_file)

print("Pipeline đã được tải thành công!")


Pipeline đã được tải thành công!


In [None]:
# Build the evaluation set from df_test; it was previously taken from
# df_train even though the variables are named *_test.
# NOTE: the scores below are a held-out estimate only if df_test contains
# rows the pipeline was not fitted on.
X_test = df_test.drop(columns=['Exited'])
y_test = df_test['Exited']

# Predicted labels
y_pred = pipeline.predict(X_test)

# Predicted probability of the positive class (second column)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Label-based metrics use y_pred; ranking/probability metrics use y_proba
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
logloss = log_loss(y_test, y_proba)

# Report the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Log Loss: {logloss:.4f}")

Accuracy: 0.8622
Precision: 0.7448
Recall: 0.5304
F1-Score: 0.6196
ROC-AUC: 0.8855
Log Loss: 0.3261
