In [29]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import plotly.express as px
import warnings
import pymongo
from pymongo import MongoClient
from ydata_profiling import ProfileReport
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import warnings
import wandb


In [30]:
import warnings
warnings.filterwarnings('ignore')

In [31]:
# Login to Weights & Biases
wandb.login()



True

In [32]:
input_artifact="Bank-Marketing/raw_data.csv:latest"
artifact_name="preprocessed_data.csv"
artifact_type="clean_data"
artifact_description="Data after preprocessing"

In [33]:
# Create a new job_type
run = wandb.init(project="Bank-Marketing", job_type="process_data")

In [34]:
run = wandb.init(project='Bank-Marketing', job_type='process_data')
artifact = run.use_artifact('hangtn13-ssc-national-economics-university/Bank-Marketing/raw_data:v2', type='dataset')
artifact_dir = artifact.download()
df= pd.read_csv(os.path.join(artifact_dir, 'raw_data.csv'))

[34m[1mwandb[0m:   1 of 1 files downloaded.  


# **LOADING DATA**

## Data Description

Citation Request:
  This dataset is publicly available for research. The details are described in [Moro et al., 2014].
  Please include this citation if you plan to use this database:

  [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, In press, http://dx.doi.org/10.1016/j.dss.2014.03.001

  Available at: [pdf] http://dx.doi.org/10.1016/j.dss.2014.03.001
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2014-dss.txt

**1. Title: Bank Marketing (with social/economic context)**

**2. Sources**
   Created by: Sérgio Moro (ISCTE-IUL), Paulo Cortez (Univ. Minho) and Paulo Rita (ISCTE-IUL) @ 2014
   
**3. Past Usage:**

  The full dataset (bank-additional-full.csv) was described and analyzed in:

  S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems (2014), doi:10.1016/j.dss.2014.03.001.

**4. Relevant Information:**

   This dataset is based on "Bank Marketing" UCI dataset (please check the description at: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing).
   The data is enriched by the addition of five new social and economic features/attributes (national wide indicators from a ~10M population country), published by the Banco de Portugal and publicly available at: https://www.bportugal.pt/estatisticasweb.
   This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns).
   Using the rminer package and R tool (http://cran.r-project.org/web/packages/rminer/), we found that the addition of the five new social and economic attributes (made available here) lead to substantial improvement in the prediction of a success, even when the duration of the call is not included. Note: the file can be read in R using: d=read.table("bank-additional-full.csv",header=TRUE,sep=";")
   
   The zip file includes two datasets:
      1) bank-additional-full.csv with all examples, ordered by date (from May 2008 to November 2010).
      2) bank-additional.csv with 10% of the examples (4119), randomly selected from bank-additional-full.csv.
   The smallest dataset is provided to test more computationally demanding machine learning algorithms (e.g., SVM).

   The binary classification goal is to predict if the client will subscribe a bank term deposit (variable y).

**5. Number of Instances:** 41188 for bank-additional-full.csv

**6. Number of Attributes:** 20 + output attribute.

**7. Attribute information:**

   For more information, read [Moro et al., 2014].

   Input variables:
   **bank client data:**

   1 - age (numeric)

   2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")

   3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)

   4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")

   5 - default: has credit in default? (categorical: "no","yes","unknown")

   6 - housing: has housing loan? (categorical: "no","yes","unknown")

   7 - loan: has personal loan? (categorical: "no","yes","unknown")
**   related with the last contact of the current campaign:**
   8 - contact: contact communication type (categorical: "cellular","telephone")

   9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")

  10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")

  11 - duration: last contact duration, in seconds (numeric). Important note:  this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

**  other attributes:**
  12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

  13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)

  14 - previous: number of contacts performed before this campaign and for this client (numeric)

  15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")

**  social and economic context attributes**

  16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)

  17 - cons.price.idx: consumer price index - monthly indicator (numeric)

  18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)     

  19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
  
  20 - nr.employed: number of employees - quarterly indicator (numeric)

**  Output variable (desired target):**
  21 - y - has the client subscribed a term deposit? (binary: "yes","no")

**8. Missing Attribute Values:** There are several missing values in some categorical attributes, all coded with the "unknown" label. These missing values can be treated as a possible class label or using deletion or imputation techniques.



In [35]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp_var_rate    41188 non-null  float64
 16  cons_price_idx  41188 non-null  float64
 17  cons_conf_idx   41188 non-null 

In [37]:
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [38]:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64

In [39]:
df.duplicated().sum()

12

In [40]:
df.drop_duplicates(inplace=True)

- Xoá cột duration
Lí do: duration (thời lượng cuộc gọi cuối cùng) chỉ được biết sau khi cuộc gọi kết thúc, và tại thời điểm đó, kết quả y (khách hàng có đăng ký hay không) cũng đã được biết. Hơn nữa, nếu duration=0, thì y='no'
=> Có thể gây nên data leakage.

In [41]:
df.drop(columns='duration', inplace=True)

# **DATA CLEANING**

In [42]:
df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Tạo pipeline xử lí các lỗi liên quan đến text
- `JobAdminCleaner` (clean_job_admin): Sửa lỗi nhập liệu "admin." thành "admin" trong cột job.
- `EducationPunctuationCleaner` (clean_education_punct): Thay thế dấu chấm "." bằng khoảng trắng " " trong cột education (ví dụ: "basic.9y" -> "basic 9y", "unknown." -> "unknown "). Điều này giúp chuẩn hóa giá trị và xử lý lỗi nhập liệu.
- Chuyển đổi dạng text sang số trong cột month (tháng) và cột days_of_week (ngày trong tuần)

In [43]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import warnings # Để cảnh báo nếu cột không tồn tại

# --- Transformer tùy chỉnh cho cột 'job' (Giữ nguyên từ trước) ---
class JobAdminCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, column='job'):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_transformed = X.copy()
        if self.column in X_transformed.columns:
            X_transformed.loc[:, self.column] = X_transformed[self.column].astype(str).replace('admin.', 'admin')
        else:
            warnings.warn(f"Cảnh báo: Cột '{self.column}' không tìm thấy. Bỏ qua bước JobAdminCleaner.")
        return X_transformed

# --- Transformer tùy chỉnh cho cột 'education' (Giữ nguyên từ trước) ---
class EducationDotCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, column='education'):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_transformed = X.copy()
        if self.column in X_transformed.columns:
            X_transformed.loc[:, self.column] = X_transformed[self.column].astype(str).str.replace('.', ' ', regex=False)
        else:
            warnings.warn(f"Cảnh báo: Cột '{self.column}' không tìm thấy. Bỏ qua bước EducationDotCleaner.")
        return X_transformed

# --- Transformer tùy chỉnh chung cho việc Mapping ---
class MappingTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer tùy chỉnh để áp dụng mapping dựa trên dictionary cho một cột cụ thể.
    """
    def __init__(self, column, mapping):
        """
        Khởi tạo transformer.
        Args:
            column (str): Tên của cột cần áp dụng mapping.
            mapping (dict): Dictionary chứa các cặp giá trị cũ: giá trị mới.
        """
        self.column = column
        self.mapping = mapping

    def fit(self, X, y=None):
        """
        Phương thức fit không làm gì vì mapping đã được cung cấp.
        """
        return self

    def transform(self, X):
        """
        Áp dụng mapping trên cột được chỉ định.
        Args:
            X (pd.DataFrame): DataFrame đầu vào.
        Returns:
            pd.DataFrame: DataFrame đã được biến đổi.
        """
        X_transformed = X.copy()
        if self.column in X_transformed.columns:
            # Áp dụng map, các giá trị không có trong mapping sẽ thành NaN
            # Nếu muốn xử lý khác (ví dụ: giữ nguyên giá trị cũ hoặc gán giá trị mặc định),
            # logic ở đây cần phức tạp hơn.
            X_transformed.loc[:, self.column] = X_transformed[self.column].map(self.mapping)
            # Optional: Xử lý NaN nếu cần, ví dụ: fillna(-1) hoặc giá trị khác
            # X_transformed[self.column] = X_transformed[self.column].fillna(-1) # Ví dụ
        else:
            warnings.warn(f"Cảnh báo: Cột '{self.column}' không tìm thấy. Bỏ qua bước MappingTransformer.")

        return X_transformed

# --- Định nghĩa các mapping dictionaries ---
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
    'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
    'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}

weekday_map = {
    'mon': 2, 'tue': 3, 'wed': 4, # Thường thứ Hai là 1, ..., Chủ Nhật là 7 hoặc 0
    'thu': 5, 'fri': 6, 'sat': 7, 'sun': 8, # Sửa lại map cho hợp lý hơn (1-7)
}

# --- Xây dựng Pipeline bao gồm cả các bước mapping ---
cleaning_pipeline = Pipeline([
    ('clean_job_admin', JobAdminCleaner(column='job')),
    ('clean_education_dot', EducationDotCleaner(column='education')),
    ('map_month', MappingTransformer(column='month', mapping=month_map)),
    ('map_day_of_week', MappingTransformer(column='day_of_week', mapping=weekday_map))
    # Thêm các bước khác nếu cần
])

# --- Áp dụng Pipeline ---
df_cleaned = cleaning_pipeline.fit_transform(df)
# --- Kiểm tra kết quả ---
print("\n--- Dữ liệu sau khi qua Pipeline ---")
print(df_cleaned)
print("\nGiá trị duy nhất cột 'job' sau khi xử lý:", df_cleaned['job'].unique())
print("Giá trị duy nhất cột 'education' sau khi xử lý:", df_cleaned['education'].unique())
print("Giá trị duy nhất cột 'month' sau khi xử lý:", df_cleaned['month'].unique())
print("Giá trị duy nhất cột 'day_of_week' sau khi xử lý:", df_cleaned['day_of_week'].unique())
print("\nKiểu dữ liệu các cột sau khi xử lý:")
print(df_cleaned.dtypes) # Kiểm tra kiểu dữ liệu cột month và day_of_week


--- Dữ liệu sau khi qua Pipeline ---
       age          job  marital            education  default housing loan  \
0       56    housemaid  married             basic 4y       no      no   no   
1       57     services  married          high school  unknown      no   no   
2       37     services  married          high school       no     yes   no   
3       40        admin  married             basic 6y       no      no   no   
4       56     services  married          high school       no      no  yes   
...    ...          ...      ...                  ...      ...     ...  ...   
41183   73      retired  married  professional course       no     yes   no   
41184   46  blue-collar  married  professional course       no      no   no   
41185   56      retired  married    university degree       no     yes   no   
41186   44   technician  married  professional course       no      no   no   
41187   74      retired  married  professional course       no     yes   no   

         cont

In [44]:
df_cleaned

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic 4y,no,no,no,telephone,5,2,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high school,unknown,no,no,telephone,5,2,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high school,no,yes,no,telephone,5,2,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin,married,basic 6y,no,no,no,telephone,5,2,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high school,no,no,yes,telephone,5,2,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional course,no,yes,no,cellular,11,6,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional course,no,no,no,cellular,11,6,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university degree,no,yes,no,cellular,11,6,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional course,no,no,no,cellular,11,6,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [45]:
# --- Transformer tùy chỉnh để xử lý cột 'pdays' (Đã sửa) ---
class PdaysHandler(BaseEstimator, TransformerMixin):
    """
    Xử lý giá trị 999 trong cột 'pdays'.
    1. Tạo cột nhị phân mới 'was_contacted_previously'.
    2. Thay thế giá trị 999 trong cột 'pdays' gốc bằng NaN (hoặc giá trị khác).
    """
    def __init__(self, pdays_col='pdays', new_col_suffix='_contacted_status', replace_999_with=np.nan):
        self.pdays_col = pdays_col
        self.new_col_name = f"{pdays_col}{new_col_suffix}"
        self.replace_999_with = replace_999_with

    def fit(self, X, y=None):
        # Không cần kiểm tra type ở đây nữa vì ta sẽ chuyển đổi nó trong transform
        return self

    def transform(self, X):
        X_transformed = X.copy()
        if self.pdays_col in X_transformed.columns:
            print(f"Xử lý cột '{self.pdays_col}': Tạo cột '{self.new_col_name}' và thay thế 999 bằng {self.replace_999_with}")

            # Bước 1: Tạo cột mới (làm trước khi thay đổi cột gốc)
            X_transformed[self.new_col_name] = np.where(X_transformed[self.pdays_col] == 999, 0, 1)

            # Bước 2: Chuyển đổi kiểu dữ liệu cột pdays gốc sang float một cách tường minh *trước* khi thay thế
            # Điều này giải quyết FutureWarning
            if pd.api.types.is_numeric_dtype(X_transformed[self.pdays_col]): # Chỉ chuyển nếu là số
                X_transformed[self.pdays_col] = X_transformed[self.pdays_col].astype(float)
            else:
                # Nếu không phải số, thử chuyển sang số, lỗi nếu không thể
                try:
                    X_transformed[self.pdays_col] = pd.to_numeric(X_transformed[self.pdays_col], errors='raise').astype(float)
                except (ValueError, TypeError):
                     warnings.warn(f"Cảnh báo: Không thể chuyển đổi cột '{self.pdays_col}' sang kiểu float. Bỏ qua thay thế 999.")
                     return X_transformed # Trả về mà không thay đổi pdays nếu không chuyển được

            # Bước 3: Thực hiện thay thế 999 bằng giá trị mong muốn (bây giờ kiểu đã tương thích)
            X_transformed.loc[:, self.pdays_col] = X_transformed[self.pdays_col].replace(999.0, self.replace_999_with) # Thay thế 999.0 (vì đã là float)

        else:
            warnings.warn(f"Cảnh báo: Cột '{self.pdays_col}' không tìm thấy. Bỏ qua bước PdaysHandler.")
        return X_transformed

# --- Chạy lại Cell 5 với Transformer đã sửa ---
# (Dùng lại df từ Cell 4)
print("\n--- Bước 4 (Lặp lại): Xử lý cột 'pdays' với Transformer đã sửa ---")
print("Giá trị cột 'pdays' trước khi xử lý:\n", df[['pdays']].head()) # df này pdays vẫn là int từ lần tạo đầu
pdays_handler_fixed = PdaysHandler(pdays_col='pdays', replace_999_with=np.nan)
df_cleaned = pdays_handler_fixed.fit_transform(df_cleaned) # Tạo df mới để thấy rõ sự khác biệt
print("\nGiá trị các cột liên quan sau khi xử lý (đã sửa):\n", df_cleaned[['pdays', 'pdays_contacted_status']].head())
print("Giá trị duy nhất cột 'pdays' sau xử lý (đã sửa):", df_cleaned['pdays'].unique())
print("Giá trị duy nhất cột 'pdays_contacted_status' mới (đã sửa):", df_cleaned['pdays_contacted_status'].unique())
print("Kiểu dữ liệu cột 'pdays' sau xử lý (đã sửa):", df_cleaned['pdays'].dtype)


--- Bước 4 (Lặp lại): Xử lý cột 'pdays' với Transformer đã sửa ---
Giá trị cột 'pdays' trước khi xử lý:
    pdays
0    999
1    999
2    999
3    999
4    999
Xử lý cột 'pdays': Tạo cột 'pdays_contacted_status' và thay thế 999 bằng nan

Giá trị các cột liên quan sau khi xử lý (đã sửa):
    pdays  pdays_contacted_status
0    NaN                       0
1    NaN                       0
2    NaN                       0
3    NaN                       0
4    NaN                       0
Giá trị duy nhất cột 'pdays' sau xử lý (đã sửa): [nan  6.  4.  3.  5.  1.  0. 10.  7.  8.  9. 11.  2. 12. 13. 14. 15. 16.
 21. 17. 18. 22. 25. 26. 19. 27. 20.]
Giá trị duy nhất cột 'pdays_contacted_status' mới (đã sửa): [0 1]
Kiểu dữ liệu cột 'pdays' sau xử lý (đã sửa): float64


In [46]:
df_cleaned['month'].astype(int)

0         5
1         5
2         5
3         5
4         5
         ..
41183    11
41184    11
41185    11
41186    11
41187    11
Name: month, Length: 41176, dtype: int32

In [47]:
df_cleaned['day_of_week'].astype(int)

0        2
1        2
2        2
3        2
4        2
        ..
41183    6
41184    6
41185    6
41186    6
41187    6
Name: day_of_week, Length: 41176, dtype: int32

In [48]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41176 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     41176 non-null  int64  
 1   job                     41176 non-null  object 
 2   marital                 41176 non-null  object 
 3   education               41176 non-null  object 
 4   default                 41176 non-null  object 
 5   housing                 41176 non-null  object 
 6   loan                    41176 non-null  object 
 7   contact                 41176 non-null  object 
 8   month                   41176 non-null  object 
 9   day_of_week             41176 non-null  object 
 10  campaign                41176 non-null  int64  
 11  pdays                   1515 non-null   float64
 12  previous                41176 non-null  int64  
 13  poutcome                41176 non-null  object 
 14  emp_var_rate            41176 non-null  flo

In [49]:
df_cleaned.shape

(41176, 21)

In [50]:
df_cleaned.describe()


Unnamed: 0,age,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,pdays_contacted_status
count,41176.0,41176.0,1515.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0
mean,40.0238,2.567879,6.014521,0.173013,0.081922,93.57572,-40.502863,3.621293,5167.03487,0.036793
std,10.42068,2.770318,3.824906,0.494964,1.570883,0.578839,4.62786,1.734437,72.251364,0.188256
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0
25%,32.0,1.0,3.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,0.0
50%,38.0,2.0,6.0,0.0,1.1,93.749,-41.8,4.857,5191.0,0.0
75%,47.0,3.0,7.0,0.0,1.4,93.994,-36.4,4.961,5228.1,0.0
max,98.0,56.0,27.0,7.0,1.4,94.767,-26.9,5.045,5228.1,1.0


In [51]:
df_cleaned.dtypes.value_counts()

object     11
float64     6
int64       3
int32       1
Name: count, dtype: int64

In [55]:
# Delete duplicated rows
df_cleaned.drop_duplicates(inplace=True)

# Generate a "clean data file"
df_cleaned.to_csv(artifact_name,index=False)

In [56]:
# Create a new artifact and configure with the necessary arguments
artifact = wandb.Artifact(name=artifact_name,
                          type=artifact_type,
                          description=artifact_description)
artifact.add_file(artifact_name)

ArtifactManifestEntry(path='preprocessed_data.csv', digest='f7XkuTwzXI79rhEJZaEmqw==', size=4395701, local_path='C:\\Users\\Admin\\AppData\\Local\\wandb\\wandb\\artifacts\\staging\\tmphmn5cega', skip_cache=False)

In [57]:
# Upload the artifact to Wandb
run.log_artifact(artifact)

<Artifact preprocessed_data.csv>

In [58]:
# close the run
# waiting a while after run the previous cell before execute this
run.finish()

In [95]:
# --- 10. Export ra file CSV ---
output_filename = 'bank_data_cleaned.csv'
# index=False: Không lưu cột index của DataFrame vào file CSV
# encoding='utf-8': Đảm bảo tương thích ký tự (thường là mặc định tốt)
df_cleaned.to_csv(output_filename, index=False, encoding='utf-8')

print(f"\nĐã lưu DataFrame cuối cùng vào file '{output_filename}'")


Đã lưu DataFrame cuối cùng vào file 'bank_data_cleaned.csv'


# **DATA PREPROCESSING**

In [96]:
class QuantileCapper(BaseEstimator, TransformerMixin):
    """
    Transformer to cap values based on quantiles.
    Handles both pandas DataFrame and numpy array inputs.
    """
    def __init__(self, columns=None, lower_quantile=0.01, upper_quantile=0.99):
        """
        Args:
            columns (list, optional): List of column names or indices to apply capping.
                                     If None, applies to all columns.
            lower_quantile (float): Lower quantile threshold.
            upper_quantile (float): Upper quantile threshold.
        """
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.bounds_ = {}  # Will store calculated thresholds during fit

    def fit(self, X, y=None):
        """
        Calculate lower and upper quantile thresholds from training data X.
        """
        # Convert to DataFrame if it's a numpy array
        is_numpy = isinstance(X, np.ndarray)
        
        if is_numpy:
            # For numpy arrays, convert to DataFrame with generic column names
            column_names = [f"feature_{i}" for i in range(X.shape[1])]
            X_df = pd.DataFrame(X, columns=column_names)
        else:
            X_df = X
        
        # Determine which columns to process
        if self.columns is None:
            # If no columns specified, process all columns
            self.columns_to_process_ = list(range(X.shape[1])) if is_numpy else X_df.columns.tolist()
        else:
            # Use provided columns
            self.columns_to_process_ = self.columns
        
        # Calculate and store bounds
        self.bounds_ = {}
        
        for i, col in enumerate(self.columns_to_process_):
            # For numpy arrays, convert column index to column name
            col_name = f"feature_{col}" if is_numpy and isinstance(col, int) else col
            col_idx = col if is_numpy and isinstance(col, int) else i
            
            # Get column data
            if is_numpy:
                col_data = X[:, col_idx] if col_idx < X.shape[1] else None
            else:
                col_data = X_df[col] if col in X_df.columns else None
            
            if col_data is not None:
                lower_bound = np.nanquantile(col_data, self.lower_quantile)
                upper_bound = np.nanquantile(col_data, self.upper_quantile)
                self.bounds_[col_idx if is_numpy else col] = {'lower': lower_bound, 'upper': upper_bound}
                print(f"Column '{col_name}': lower={lower_bound:.2f}, upper={upper_bound:.2f}")
        
        return self

    def transform(self, X):
        """
        Apply capping using thresholds calculated during fit.
        """
        if not hasattr(self, 'bounds_') or not self.bounds_:
            raise RuntimeError("Transformer is not fitted yet. Call fit first.")
        
        # Check input type
        is_numpy = isinstance(X, np.ndarray)
        
        # Create a copy to avoid modifying the original X
        X_transformed = X.copy()
        
        # Apply bounds
        for col_id, bounds in self.bounds_.items():
            if is_numpy:
                # For numpy arrays, use column index
                if isinstance(col_id, int) and col_id < X.shape[1]:
                    X_transformed[:, col_id] = np.clip(
                        X_transformed[:, col_id], 
                        bounds['lower'], 
                        bounds['upper']
                    )
            else:
                # For DataFrames, use column name
                if col_id in X_transformed.columns:
                    X_transformed[col_id] = np.clip(
                        X_transformed[col_id], 
                        bounds['lower'], 
                        bounds['upper']
                    )
        
        return X_transformed
    
    def get_feature_names_out(self, input_features=None):
        """
        Return input feature names unchanged, since QuantileCapper does not change column names.
        """
        if input_features is None:
            # If input_features not given, use self.columns_to_process_
            return np.array(self.columns_to_process_)  # or fallback generic names
        return np.array(input_features)



In [97]:


# --- 1. Mã hóa biến mục tiêu (y) riêng biệt ---
target_column = 'y'
if target_column in df_cleaned.columns:
    print(f"\nMã hóa biến mục tiêu '{target_column}'...")
    y_target = df_cleaned[target_column].map({'yes': 1, 'no': 0})
    X_features = df_cleaned.drop(columns=[target_column])
    print("Target 'y' đã được mã hóa và tách riêng.")
else:
    print(f"Cảnh báo: Không tìm thấy cột mục tiêu '{target_column}'. Xử lý toàn bộ DataFrame như features.")
    X_features = df_cleaned.copy()
    y_target = None


Mã hóa biến mục tiêu 'y'...
Target 'y' đã được mã hóa và tách riêng.


In [98]:
# --- 2. Xác định các cột số và phân loại trong Features (X_features) ---
numerical_cols = X_features.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_features.select_dtypes(include=['object', 'category']).columns.tolist()

print("\nNumerical columns (will be Imputed, Outlier-capped, and Scaled):", numerical_cols)
print("Categorical columns (will be One-Hot Encoded):", categorical_cols)


Numerical columns (will be Imputed, Outlier-capped, and Scaled): ['age', 'campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'pdays_contacted_status']
Categorical columns (will be One-Hot Encoded): ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


In [99]:
# Visualize outliers in numerical columns before handling
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols[:min(9, len(numerical_cols))]):  # Show up to 9 columns
    plt.subplot(3, 3, i+1)
    sns.boxplot(x=X_features[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.savefig('outliers_before_handling.png')
plt.close()

In [100]:
# Pipeline for numerical columns: Impute NaN, Handle Outliers, Scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_handler', QuantileCapper(lower_quantile=0.01, upper_quantile=0.99)),
    ('scaler', StandardScaler())
])

# Pipeline for categorical columns: One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Create ColumnTransformer to apply appropriate transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

In [101]:
# Create final pipeline
final_pipeline = Pipeline([
    ('preprocessing', preprocessor)
])

# Apply pipeline
print("\nApplying preprocessing pipeline with outlier handling...")
X_processed_array = final_pipeline.fit_transform(X_features)


# --- 7. Lấy tên cột sau khi biến đổi ---
try:
    feature_names_out = final_pipeline.named_steps['preprocessing'].get_feature_names_out()
except AttributeError:
    print("Warning: Older sklearn version, trying to get column names manually.")
    # Get names from child transformers
    num_feature_names = numerical_cols  # Imputer, Capper, and Scaler don't change basic names
    cat_feature_names = final_pipeline.named_steps['preprocessing'] \
                        .named_transformers_['cat'].named_steps['onehot'] \
                        .get_feature_names_out(categorical_cols)
    feature_names_out = list(num_feature_names) + list(cat_feature_names)


Applying preprocessing pipeline with outlier handling...
Column 'feature_0': lower=23.00, upper=71.00
Column 'feature_1': lower=1.00, upper=14.00
Column 'feature_2': lower=3.00, upper=6.00
Column 'feature_3': lower=0.00, upper=2.00
Column 'feature_4': lower=-3.40, upper=1.40
Column 'feature_5': lower=92.20, upper=94.47
Column 'feature_6': lower=-49.50, upper=-26.90
Column 'feature_7': lower=0.66, upper=4.97
Column 'feature_8': lower=4963.60, upper=5228.10
Column 'feature_9': lower=0.00, upper=1.00


In [102]:
feature_names_out  # Convert to list if it's not already

['age',
 'campaign',
 'pdays',
 'previous',
 'emp_var_rate',
 'cons_price_idx',
 'cons_conf_idx',
 'euribor3m',
 'nr_employed',
 'pdays_contacted_status',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_married',
 'marital_single',
 'marital_unknown',
 'education_basic 6y',
 'education_basic 9y',
 'education_high school',
 'education_illiterate',
 'education_professional course',
 'education_university degree',
 'education_unknown',
 'default_unknown',
 'default_yes',
 'housing_unknown',
 'housing_yes',
 'loan_unknown',
 'loan_yes',
 'contact_telephone',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'poutcome_nonexistent',
 'poutcome_success']

In [103]:
print("X_processed_array shape:", X_processed_array.shape)
print("Number of feature names:", len(feature_names_out))


X_processed_array shape: (41188, 53)
Number of feature names: 53


In [104]:
# Create DataFrame from processed features
X_processed_df = pd.DataFrame(
    X_processed_array,
    columns=feature_names_out,
    index=X_features.index
)
print("DataFrame created for processed features.")

# Combine processed features and encoded target
if y_target is not None:
    df_final_export = pd.concat([X_processed_df, y_target], axis=1)
    print("Combined processed features and encoded target.")
else:
    df_final_export = X_processed_df  # If no target
    print("Only processed features (no target).")

print("\n--- Final DataFrame ready for export ---")
print(df_final_export.head())
print("\nFinal NaN check:")
print(df_final_export.isnull().sum().sum())  # Should be 0


DataFrame created for processed features.
Combined processed features and encoded target.

--- Final DataFrame ready for export ---
        age  campaign     pdays  previous  emp_var_rate  cons_price_idx  \
0  1.580503 -0.651855  0.128949 -0.374858      0.648092        0.729047   
1  1.679140 -0.651855  0.128949 -0.374858      0.648092        0.729047   
2 -0.293608 -0.651855  0.128949 -0.374858      0.648092        0.729047   
3  0.002304 -0.651855  0.128949 -0.374858      0.648092        0.729047   
4  1.580503 -0.651855  0.128949 -0.374858      0.648092        0.729047   

   cons_conf_idx  euribor3m  nr_employed  pdays_contacted_status  ...  \
0       0.887767   0.712505      0.33168               -0.195415  ...   
1       0.887767   0.712505      0.33168               -0.195415  ...   
2       0.887767   0.712505      0.33168               -0.195415  ...   
3       0.887767   0.712505      0.33168               -0.195415  ...   
4       0.887767   0.712505      0.33168            

In [110]:
output_filename = 'bank_data_cleaned_imputed_encoded_outliers.csv'
df_final_export.to_csv(output_filename, index=False, encoding='utf-8')
print(f"\nSaved final DataFrame to '{output_filename}'")
client = MongoClient("mongodb+srv://hang3c18:zOjY7hfnMxhupvnv@cluster0.yu4ohfs.mongodb.net/")
db = client['bank']  # MongoDB connection URI # Database name
collection = db['bank_data_cleaned_imputed_encoded_outliers']  # Collection name

# Convert the DataFrame to a list of dictionaries (documents)
data_dict = df_final_export.to_dict(orient='records')

# Insert data into MongoDB collection
insert_result = collection.insert_many(data_dict)

# Output number of inserted documents
print(f"{len(insert_result.inserted_ids)} documents inserted into MongoDB.")

# Close the MongoDB connection
client.close()



Saved final DataFrame to 'bank_data_cleaned_imputed_encoded_outliers.csv'
41188 documents inserted into MongoDB.


In [None]:


# Let's also visualize the distribution of numerical columns after outlier handling
# Extract the numerical columns from the processed DataFrame
numerical_cols_processed = [col for col in X_processed_df.columns if col in numerical_cols]

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols_processed[:min(9, len(numerical_cols_processed))]):
    plt.subplot(3, 3, i+1)
    sns.histplot(X_processed_df[col], kde=True)
    plt.title(f'Distribution of {col} after processing')
plt.tight_layout()
plt.savefig('distributions_after_processing.png')
plt.close()

print("\nOutlier handling complete. Visualizations saved to 'outliers_before_handling.png' and 'distributions_after_processing.png'")


Saved final DataFrame to 'bank_data_cleaned_imputed_encoded_no_outliers.csv'

Outlier handling complete. Visualizations saved to 'outliers_before_handling.png' and 'distributions_after_processing.png'


In [106]:
df_final_export

Unnamed: 0,age,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,pdays_contacted_status,...,month_10,month_11,month_12,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,poutcome_nonexistent,poutcome_success,y
0,1.580503,-0.651855,0.128949,-0.374858,0.648092,0.729047,0.887767,0.712505,0.331680,-0.195415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,1.679140,-0.651855,0.128949,-0.374858,0.648092,0.729047,0.887767,0.712505,0.331680,-0.195415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,-0.293608,-0.651855,0.128949,-0.374858,0.648092,0.729047,0.887767,0.712505,0.331680,-0.195415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,0.002304,-0.651855,0.128949,-0.374858,0.648092,0.729047,0.887767,0.712505,0.331680,-0.195415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,1.580503,-0.651855,0.128949,-0.374858,0.648092,0.729047,0.887767,0.712505,0.331680,-0.195415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,3.060064,-0.651855,0.128949,-0.374858,-0.752343,1.546722,-1.952126,-1.495465,-2.815697,-0.195415,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
41184,0.594128,-0.651855,0.128949,-0.374858,-0.752343,1.546722,-1.952126,-1.495465,-2.815697,-0.195415,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
41185,1.580503,-0.217292,0.128949,-0.374858,-0.752343,1.546722,-1.952126,-1.495465,-2.815697,-0.195415,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
41186,0.396854,-0.651855,0.128949,-0.374858,-0.752343,1.546722,-1.952126,-1.495465,-2.815697,-0.195415,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1


In [107]:
# --- 10. Export ra file CSV ---
output_filename = 'bank_data_cleaned_imputed_encoded.csv'
# index=False: Không lưu cột index của DataFrame vào file CSV
# encoding='utf-8': Đảm bảo tương thích ký tự (thường là mặc định tốt)
df_final_export.to_csv(output_filename, index=False, encoding='utf-8')

print(f"\nĐã lưu DataFrame cuối cùng vào file '{output_filename}'")


Đã lưu DataFrame cuối cùng vào file 'bank_data_cleaned_imputed_encoded.csv'


- Tạo pipeline xử lí các vấn đề khác

In [108]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# Cần cài đặt thư viện imbalanced-learn: pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# imblearn cũng có Pipeline riêng (imblearn.pipeline.Pipeline) có thể chứa sampler
# nhưng ở đây ta chỉ định nghĩa các thành phần sklearn hoặc custom
import warnings

# === 1. Định nghĩa Transformer Tùy Chỉnh cho Outlier Handling (Quantile Capping) ===

class QuantileCapper(BaseEstimator, TransformerMixin):
    """
    Transformer tùy chỉnh để giới hạn giá trị dựa trên quantile.
    Tính toán quantile trên tập train (khi fit) và áp dụng cho train/test (khi transform).
    """
    def __init__(self, columns=None, lower_quantile=0.01, upper_quantile=0.99):
        """
        Args:
            columns (list, optional): Danh sách tên cột cần áp dụng.
                                      Nếu None, áp dụng cho tất cả cột số.
            lower_quantile (float): Quantile dưới.
            upper_quantile (float): Quantile trên.
        """
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.bounds_ = {} # Sẽ lưu trữ các ngưỡng tính được khi fit

    def fit(self, X, y=None):
        """
        Tính toán ngưỡng lower và upper quantile *chỉ* từ dữ liệu huấn luyện X.
        """
        if isinstance(X, pd.DataFrame):
            X_ = X
        else:
            # Cố gắng chuyển đổi nếu là NumPy array, giả sử không có tên cột
             warnings.warn("Input is not a Pandas DataFrame. Applying capping to all columns.")
             X_ = pd.DataFrame(X) # Cần tên cột để lưu bounds_

        if self.columns is None:
            # Nếu không chỉ định cột, lấy tất cả cột số
            self.columns_to_process_ = X_.select_dtypes(include=np.number).columns.tolist()
        else:
            self.columns_to_process_ = [col for col in self.columns if col in X_.columns and pd.api.types.is_numeric_dtype(X_[col])]
            if len(self.columns_to_process_) != len(self.columns):
                 warnings.warn("Some specified columns for QuantileCapper not found or are not numeric.")

        if not self.columns_to_process_:
             warnings.warn("No numeric columns found to apply QuantileCapper.")
             return self

        # Tính toán và lưu trữ bounds
        self.bounds_ = {}
        for col in self.columns_to_process_:
            lower_bound = X_[col].quantile(self.lower_quantile)
            upper_bound = X_[col].quantile(self.upper_quantile)
            self.bounds_[col] = {'lower': lower_bound, 'upper': upper_bound}

        return self

    def transform(self, X):
        """
        Áp dụng capping sử dụng các ngưỡng đã tính toán trong fit.
        """
        if not hasattr(self, 'bounds_') or not self.bounds_:
            raise RuntimeError("Transformer is not fitted yet. Call fit first.")

        # Tạo bản sao để không thay đổi X gốc
        if isinstance(X, pd.DataFrame):
             X_transformed = X.copy()
        else:
             # Nếu là NumPy, cần đảm bảo xử lý đúng cột nếu fit bằng DataFrame
             # Để đơn giản, giả sử transform nhận cùng kiểu với fit
             X_transformed = X.copy() # Nên dùng DataFrame để an toàn nhất

        for col, bounds in self.bounds_.items():
            if col in X_transformed.columns: # Kiểm tra nếu cột tồn tại trong dữ liệu transform
                 # print(f"Applying capping to column '{col}': lower={bounds['lower']}, upper={bounds['upper']}") # Debug
                 # Sử dụng np.clip để giới hạn giá trị
                 X_transformed[col] = np.clip(X_transformed[col], bounds['lower'], bounds['upper'])
            # else: # Bỏ qua nếu cột không có trong dữ liệu transform
                 # print(f"Column '{col}' not found in data for transform. Skipping.") # Debug
        return X_transformed


# === 2. Khởi tạo các Đối tượng Sampler (Cho Balancing) ===
# Lưu ý: Balancing thường thực hiện bằng fit_resample trên train set,
# nó không nằm trong pipeline sklearn chuẩn xử lý features X một cách trực tiếp.
# Bạn sẽ áp dụng nó SAU KHI chia train/test và TRƯỚC KHI huấn luyện model.

print("\n--- Khởi tạo các Sampler (Balancing) ---")
print("Lưu ý: Áp dụng `fit_resample(X_train, y_train)` riêng biệt.")
smote_sampler = SMOTE(random_state=42)
print(f"Đã khởi tạo SMOTE: {smote_sampler}")
# random_undersampler = RandomUnderSampler(random_state=42)
# print(f"Đã khởi tạo RandomUnderSampler: {random_undersampler}")
print("-" * 30)


# === 3. Định nghĩa các Pipeline Kết hợp Outlier Handling và Scaling ===
# Các pipeline này chỉ chứa các bước xử lý features (X).

print("\n--- Định nghĩa các Feature Processing Pipelines (Outlier + Scaling) ---")

# Lựa chọn 1: Capping + StandardScaler
pipeline_cap_standard = Pipeline([
    ('outlier_capper', QuantileCapper(lower_quantile=0.01, upper_quantile=0.99)), # Có thể chỉ định cột nếu muốn
    ('scaler', StandardScaler())
])
print(f"Đã định nghĩa Pipeline (QuantileCapping + StandardScaler):\n{pipeline_cap_standard}")

ModuleNotFoundError: No module named 'imblearn'