In [51]:
# Import c√°c th∆∞ vi·ªán c·∫ßn thi·∫øt (KH√îNG D√ôNG csv hay scipy)
import numpy as np

In [52]:
# ============================================================================
# HELPER FUNCTIONS: Manual Statistical Tests (KH√îNG D√ôNG scipy)
# ============================================================================

def chi_square_test(observed):
    """
    Chi-square test of independence (manual implementation)
    
    Parameters:
    - observed: 2D array (contingency table)
    
    Returns:
    - chi2: Chi-square statistic
    - p_value: P-value
    - dof: Degrees of freedom
    - expected: Expected frequencies
    """
    observed = np.array(observed, dtype=float)
    
    # Calculate row and column totals
    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)
    n_total = observed.sum()
    
    # Calculate expected frequencies
    expected = np.outer(row_totals, col_totals) / n_total
    
    # Calculate chi-square statistic
    chi2 = np.sum((observed - expected)**2 / expected)
    
    # Degrees of freedom
    dof = (observed.shape[0] - 1) * (observed.shape[1] - 1)
    
    # Calculate p-value using chi-square CDF (manual approximation)
    # Using Wilson-Hilferty transformation for chi-square to normal
    if dof > 0:
        x = chi2 / dof
        z = (x**(1/3) - (1 - 2/(9*dof))) / np.sqrt(2/(9*dof))
        # Standard normal CDF approximation
        p_value = 0.5 * (1 + np.tanh(z / np.sqrt(2)))
        p_value = 1 - p_value  # Right tail
    else:
        p_value = 1.0
    
    return chi2, p_value, dof, expected


def t_test_independent(sample1, sample2):
    """
    Independent two-sample t-test (manual implementation)
    
    Parameters:
    - sample1, sample2: 1D arrays
    
    Returns:
    - t_stat: T-statistic
    - p_value: Two-tailed p-value
    """
    n1, n2 = len(sample1), len(sample2)
    mean1, mean2 = np.mean(sample1), np.mean(sample2)
    var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)
    
    # Pooled standard error
    pooled_se = np.sqrt(var1/n1 + var2/n2)
    
    # T-statistic
    t_stat = (mean1 - mean2) / pooled_se
    
    # Degrees of freedom (Welch-Satterthwaite)
    dof = (var1/n1 + var2/n2)**2 / ((var1/n1)**2/(n1-1) + (var2/n2)**2/(n2-1))
    
    # P-value approximation using normal distribution (for large samples)
    # For large dof, t-distribution ‚âà normal distribution
    z = abs(t_stat)
    p_value = 2 * (1 - 0.5 * (1 + np.tanh(z * np.sqrt(2/np.pi))))
    
    return t_stat, p_value

print("‚úÖ ƒê√£ define statistical helper functions!")

‚úÖ ƒê√£ define statistical helper functions!


In [53]:
# ============================================================================
# B∆Ø·ªöC 1: LOAD D·ªÆ LI·ªÜU (KH√îNG D√ôNG th∆∞ vi·ªán csv)
# ============================================================================
# ƒê·ªçc file CSV b·∫±ng file I/O thu·∫ßn t√∫y
file_path = '../data/raw/BankChurners.csv'

with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    # Lo·∫°i b·ªè d·∫•u ngo·∫∑c k√©p v√† spaces t·ª´ header
    headers = [col.strip().strip('"') for col in lines[0].strip().split(',')]
    # Lo·∫°i b·ªè d·∫•u ngo·∫∑c k√©p t·ª´ data
    rows = [[cell.strip().strip('"') for cell in line.strip().split(',')] for line in lines[1:]]

print("‚úÖ ƒê√£ load d·ªØ li·ªáu th√†nh c√¥ng!")
print(f"üìä S·ªë d√≤ng: {len(rows):,}")
print(f"üìä S·ªë c·ªôt: {len(headers)}")
print(f"\nüìã T√™n c√°c c·ªôt:")
for i, col in enumerate(headers):
    print(f"  {i:2d}. {col}")

‚úÖ ƒê√£ load d·ªØ li·ªáu th√†nh c√¥ng!
üìä S·ªë d√≤ng: 10,127
üìä S·ªë c·ªôt: 23

üìã T√™n c√°c c·ªôt:
   0. CLIENTNUM
   1. Attrition_Flag
   2. Customer_Age
   3. Gender
   4. Dependent_count
   5. Education_Level
   6. Marital_Status
   7. Income_Category
   8. Card_Category
   9. Months_on_book
  10. Total_Relationship_Count
  11. Months_Inactive_12_mon
  12. Contacts_Count_12_mon
  13. Credit_Limit
  14. Total_Revolving_Bal
  15. Avg_Open_To_Buy
  16. Total_Amt_Chng_Q4_Q1
  17. Total_Trans_Amt
  18. Total_Trans_Ct
  19. Total_Ct_Chng_Q4_Q1
  20. Avg_Utilization_Ratio
  21. Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
  22. Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2


In [54]:
# T√°ch data th√†nh numpy arrays theo data type
# D·ª±a tr√™n EDA, ta bi·∫øt:
# - C·ªôt 0: CLIENTNUM (ID - s·∫Ω b·ªè)
# - C·ªôt 1: Attrition_Flag (TARGET)
# - C·ªôt 2-7: Categorical features
# - C·ªôt 8-20: Numerical features
# - C·ªôt 21-22: Naive_Bayes columns (s·∫Ω b·ªè)

# ƒê·ªãnh nghƒ©a indices
target_idx = 1
categorical_indices = [3, 5, 6, 7, 8]  # Gender, Education, Marital, Income, Card_Category
numerical_indices = [2, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]  # Customer_Age, Dependent_count treated as numerical
drop_indices = [0, 21, 22]  # CLIENTNUM, Naive_Bayes columns

# Extract target
target_raw = [row[target_idx] for row in rows]
target = np.array([1 if t == 'Attrited Customer' else 0 for t in target_raw])

# Extract categorical features
categorical_data = []
for row in rows:
    categorical_data.append([row[i] for i in categorical_indices])
categorical_data = np.array(categorical_data)

# Extract numerical features
numerical_data = []
for row in rows:
    num_row = []
    for i in numerical_indices:
        try:
            num_row.append(float(row[i]))
        except:
            num_row.append(np.nan)  # Handle missing/invalid values
    numerical_data.append(num_row)
numerical_data = np.array(numerical_data)

# L·∫•y t√™n c·ªôt
categorical_cols = [headers[i] for i in categorical_indices]
numerical_cols = [headers[i] for i in numerical_indices]

print(f"\n‚úÖ ƒê√£ t√°ch d·ªØ li·ªáu th√†nh c√¥ng!")
print(f"üìä Target shape: {target.shape}")
print(f"üìä Categorical data shape: {categorical_data.shape}")
print(f"üìä Numerical data shape: {numerical_data.shape}")
print(f"\nüè∑Ô∏è Categorical columns ({len(categorical_cols)}):")
for i, col in enumerate(categorical_cols):
    print(f"  {i}. {col}")
print(f"\nüî¢ Numerical columns ({len(numerical_cols)}):")
for i, col in enumerate(numerical_cols):
    print(f"  {i}. {col}")


‚úÖ ƒê√£ t√°ch d·ªØ li·ªáu th√†nh c√¥ng!
üìä Target shape: (10127,)
üìä Categorical data shape: (10127, 5)
üìä Numerical data shape: (10127, 14)

üè∑Ô∏è Categorical columns (5):
  0. Gender
  1. Education_Level
  2. Marital_Status
  3. Income_Category
  4. Card_Category

üî¢ Numerical columns (14):
  0. Customer_Age
  1. Dependent_count
  2. Months_on_book
  3. Total_Relationship_Count
  4. Months_Inactive_12_mon
  5. Contacts_Count_12_mon
  6. Credit_Limit
  7. Total_Revolving_Bal
  8. Avg_Open_To_Buy
  9. Total_Amt_Chng_Q4_Q1
  10. Total_Trans_Amt
  11. Total_Trans_Ct
  12. Total_Ct_Chng_Q4_Q1
  13. Avg_Utilization_Ratio


---
## üîç B∆Ø·ªöC 2: KI·ªÇM TRA T√çNH H·ª¢P L·ªÜ C·ª¶A GI√Å TR·ªä (DATA VALIDATION)

In [55]:
# Function ƒë·ªÉ validate numerical columns
def validate_numerical_column(data, col_idx, col_name, expected_min=None, expected_max=None):
    """
    Ki·ªÉm tra t√≠nh h·ª£p l·ªá c·ªßa c·ªôt numerical
    Returns: dict v·ªõi th√¥ng tin validation
    """
    col_data = data[:, col_idx]
    
    # Lo·∫°i b·ªè NaN ƒë·ªÉ t√≠nh to√°n
    valid_data = col_data[~np.isnan(col_data)]
    
    result = {
        'column': col_name,
        'total_count': len(col_data),
        'nan_count': np.sum(np.isnan(col_data)),
        'min': np.min(valid_data) if len(valid_data) > 0 else None,
        'max': np.max(valid_data) if len(valid_data) > 0 else None,
        'mean': np.mean(valid_data) if len(valid_data) > 0 else None,
        'issues': []
    }
    
    # Ki·ªÉm tra NaN
    if result['nan_count'] > 0:
        result['issues'].append(f"‚ùå C√≥ {result['nan_count']} gi√° tr·ªã NaN ({result['nan_count']/result['total_count']*100:.2f}%)")
    
    # Ki·ªÉm tra gi√° tr·ªã √¢m (n·∫øu kh√¥ng h·ª£p l·ªá)
    if len(valid_data) > 0 and expected_min is not None:
        if result['min'] < expected_min:
            count_invalid = np.sum(valid_data < expected_min)
            result['issues'].append(f"‚ùå C√≥ {count_invalid} gi√° tr·ªã < {expected_min}")
    
    # Ki·ªÉm tra gi√° tr·ªã qu√° l·ªõn
    if len(valid_data) > 0 and expected_max is not None:
        if result['max'] > expected_max:
            count_invalid = np.sum(valid_data > expected_max)
            result['issues'].append(f"‚ùå C√≥ {count_invalid} gi√° tr·ªã > {expected_max}")
    
    if len(result['issues']) == 0:
        result['issues'].append("‚úÖ Kh√¥ng c√≥ v·∫•n ƒë·ªÅ")
    
    return result

# Validate t·ª´ng numerical column
print("="*80)
print("KI·ªÇM TRA T√çNH H·ª¢P L·ªÜ C·ª¶A C√ÅC C·ªòT NUMERICAL")
print("="*80)

# T·ª± ƒë·ªông l·∫•y min/max t·ª´ dataset (thay v√¨ hardcode)
print("\nüìä Ph√°t hi·ªán range t·ª± ƒë·ªông t·ª´ dataset:")
print("-" * 80)

validation_rules = []
for idx, col_name in enumerate(numerical_cols):
    col_data = numerical_data[:, idx]
    valid_data = col_data[~np.isnan(col_data)]
    
    # L·∫•y actual min/max t·ª´ data
    actual_min = np.min(valid_data)
    actual_max = np.max(valid_data)
    
    # ƒê·ªãnh nghƒ©a expected bounds d·ª±a tr√™n business logic
    # Ch·ªâ check c√°c constraints logic (kh√¥ng √¢m, ratio 0-1, etc.)
    if col_name == 'Avg_Utilization_Ratio':
        expected_min, expected_max = 0, 1  # Ratio ph·∫£i 0-1
    elif col_name in ['Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Credit_Limit', 
                      'Total_Trans_Amt', 'Total_Trans_Ct']:
        expected_min, expected_max = 0, None  # Kh√¥ng ƒë∆∞·ª£c √¢m
    elif col_name in ['Months_Inactive_12_mon', 'Contacts_Count_12_mon']:
        expected_min, expected_max = 0, None  # Kh√¥ng ƒë∆∞·ª£c √¢m
    elif col_name == 'Customer_Age':
        expected_min, expected_max = 18, None  # Tu·ªïi t·ªëi thi·ªÉu 18
    elif col_name == 'Total_Relationship_Count':
        expected_min, expected_max = 1, None  # √çt nh·∫•t 1 s·∫£n ph·∫©m
    else:
        expected_min, expected_max = None, None  # Kh√¥ng c√≥ constraint
    
    validation_rules.append((idx, col_name, expected_min, expected_max))
    
    print(f"{col_name:<30} Range: [{actual_min:.2f}, {actual_max:.2f}]")

print("-" * 80)

validation_results = []
for idx, col_name, min_val, max_val in validation_rules:
    result = validate_numerical_column(numerical_data, idx, col_name, min_val, max_val)
    validation_results.append(result)
    
    print(f"\nüìä {col_name}:")
    print(f"   Range: [{result['min']:.2f}, {result['max']:.2f}]")
    print(f"   Mean: {result['mean']:.2f}")
    print(f"   NaN: {result['nan_count']}")
    for issue in result['issues']:
        print(f"   {issue}")

print("\n" + "="*80)
print("T·ªîNG K·∫æT:")
total_issues = sum(1 for r in validation_results if len([i for i in r['issues'] if '‚ùå' in i]) > 0)
print(f"S·ªë c·ªôt c√≥ v·∫•n ƒë·ªÅ: {total_issues}/{len(validation_results)}")
print("="*80)

KI·ªÇM TRA T√çNH H·ª¢P L·ªÜ C·ª¶A C√ÅC C·ªòT NUMERICAL

üìä Ph√°t hi·ªán range t·ª± ƒë·ªông t·ª´ dataset:
--------------------------------------------------------------------------------
Customer_Age                   Range: [26.00, 73.00]
Dependent_count                Range: [0.00, 5.00]
Months_on_book                 Range: [13.00, 56.00]
Total_Relationship_Count       Range: [1.00, 6.00]
Months_Inactive_12_mon         Range: [0.00, 6.00]
Contacts_Count_12_mon          Range: [0.00, 6.00]
Credit_Limit                   Range: [1438.30, 34516.00]
Total_Revolving_Bal            Range: [0.00, 2517.00]
Avg_Open_To_Buy                Range: [3.00, 34516.00]
Total_Amt_Chng_Q4_Q1           Range: [0.00, 3.40]
Total_Trans_Amt                Range: [510.00, 18484.00]
Total_Trans_Ct                 Range: [10.00, 139.00]
Total_Ct_Chng_Q4_Q1            Range: [0.00, 3.71]
Avg_Utilization_Ratio          Range: [0.00, 1.00]
-----------------------------------------------------------------------

In [56]:
# Validate categorical columns
print("\n" + "="*80)
print("KI·ªÇM TRA T√çNH H·ª¢P L·ªÜ C·ª¶A C√ÅC C·ªòT CATEGORICAL")
print("="*80)

for idx, col_name in enumerate(categorical_cols):
    col_data = categorical_data[:, idx]
    unique_values = np.unique(col_data)
    
    print(f"\nüìã {col_name}:")
    print(f"   S·ªë gi√° tr·ªã unique: {len(unique_values)}")
    print(f"   C√°c gi√° tr·ªã:")
    
    for val in unique_values:
        count = np.sum(col_data == val)
        pct = count / len(col_data) * 100
        print(f"      - '{val}': {count:,} ({pct:.2f}%)")
    
    # Ki·ªÉm tra missing values ƒë∆∞·ª£c encode l√† 'Unknown'
    if 'Unknown' in unique_values:
        unknown_count = np.sum(col_data == 'Unknown')
        print(f"   ‚ö†Ô∏è C√≥ {unknown_count:,} gi√° tr·ªã 'Unknown' ({unknown_count/len(col_data)*100:.2f}%)")

print("\n" + "="*80)


KI·ªÇM TRA T√çNH H·ª¢P L·ªÜ C·ª¶A C√ÅC C·ªòT CATEGORICAL

üìã Gender:
   S·ªë gi√° tr·ªã unique: 2
   C√°c gi√° tr·ªã:
      - 'F': 5,358 (52.91%)
      - 'M': 4,769 (47.09%)

üìã Education_Level:
   S·ªë gi√° tr·ªã unique: 7
   C√°c gi√° tr·ªã:
      - 'College': 1,013 (10.00%)
      - 'Doctorate': 451 (4.45%)
      - 'Graduate': 3,128 (30.89%)
      - 'High School': 2,013 (19.88%)
      - 'Post-Graduate': 516 (5.10%)
      - 'Uneducated': 1,487 (14.68%)
      - 'Unknown': 1,519 (15.00%)
   ‚ö†Ô∏è C√≥ 1,519 gi√° tr·ªã 'Unknown' (15.00%)

üìã Marital_Status:
   S·ªë gi√° tr·ªã unique: 4
   C√°c gi√° tr·ªã:
      - 'Divorced': 748 (7.39%)
      - 'Married': 4,687 (46.28%)
      - 'Single': 3,943 (38.94%)
      - 'Unknown': 749 (7.40%)
   ‚ö†Ô∏è C√≥ 749 gi√° tr·ªã 'Unknown' (7.40%)

üìã Income_Category:
   S·ªë gi√° tr·ªã unique: 6
   C√°c gi√° tr·ªã:
      - '$120K +': 727 (7.18%)
      - '$40K - $60K': 1,790 (17.68%)
      - '$60K - $80K': 1,402 (13.84%)
      - '$80K - $120K': 1,

In [57]:
# Function ƒë·ªÉ detect outliers b·∫±ng IQR method
def detect_outliers_iqr(data, col_idx, col_name, multiplier=1.5):
    """
    Ph√°t hi·ªán outliers b·∫±ng ph∆∞∆°ng ph√°p IQR
    Outlier n·∫øu: value < Q1 - multiplier*IQR ho·∫∑c value > Q3 + multiplier*IQR
    """
    col_data = data[:, col_idx]
    
    # T√≠nh Q1, Q3, IQR
    q1 = np.percentile(col_data, 25)
    q3 = np.percentile(col_data, 75)
    iqr = q3 - q1
    
    # T√≠nh boundaries
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    
    # T√¨m outliers
    outliers_lower = col_data < lower_bound
    outliers_upper = col_data > upper_bound
    outliers = outliers_lower | outliers_upper
    
    return {
        'column': col_name,
        'q1': q1,
        'q3': q3,
        'iqr': iqr,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'n_outliers_lower': np.sum(outliers_lower),
        'n_outliers_upper': np.sum(outliers_upper),
        'n_outliers_total': np.sum(outliers),
        'outlier_pct': np.sum(outliers) / len(col_data) * 100,
        'outlier_indices': np.where(outliers)[0]
    }

# Function ƒë·ªÉ detect outliers b·∫±ng Z-score method
def detect_outliers_zscore(data, col_idx, col_name, threshold=3):
    """
    Ph√°t hi·ªán outliers b·∫±ng ph∆∞∆°ng ph√°p Z-score
    Outlier n·∫øu: |z-score| > threshold
    """
    col_data = data[:, col_idx]
    
    mean = np.mean(col_data)
    std = np.std(col_data)
    
    # T√≠nh z-scores
    z_scores = np.abs((col_data - mean) / std)
    
    # T√¨m outliers
    outliers = z_scores > threshold
    
    return {
        'column': col_name,
        'mean': mean,
        'std': std,
        'threshold': threshold,
        'n_outliers': np.sum(outliers),
        'outlier_pct': np.sum(outliers) / len(col_data) * 100,
        'outlier_indices': np.where(outliers)[0]
    }

print("="*80)
print("PH√ÅT HI·ªÜN OUTLIERS - PH∆Ø∆†NG PH√ÅP IQR (1.5*IQR)")
print("="*80)

outlier_results_iqr = []
for idx, col_name in enumerate(numerical_cols):
    result = detect_outliers_iqr(numerical_data, idx, col_name, multiplier=1.5)
    outlier_results_iqr.append(result)
    
    if result['n_outliers_total'] > 0:
        print(f"\nüìä {col_name}:")
        print(f"   Q1={result['q1']:.2f}, Q3={result['q3']:.2f}, IQR={result['iqr']:.2f}")
        print(f"   Bounds: [{result['lower_bound']:.2f}, {result['upper_bound']:.2f}]")
        print(f"   Outliers th·∫•p: {result['n_outliers_lower']}")
        print(f"   Outliers cao: {result['n_outliers_upper']}")
        print(f"   ‚ö†Ô∏è T·ªïng outliers: {result['n_outliers_total']} ({result['outlier_pct']:.2f}%)")

print("\n" + "="*80)
print("PH√ÅT HI·ªÜN OUTLIERS - PH∆Ø∆†NG PH√ÅP Z-SCORE (threshold=3)")
print("="*80)

outlier_results_zscore = []
for idx, col_name in enumerate(numerical_cols):
    result = detect_outliers_zscore(numerical_data, idx, col_name, threshold=3)
    outlier_results_zscore.append(result)
    
    if result['n_outliers'] > 0:
        print(f"\nüìä {col_name}:")
        print(f"   Mean={result['mean']:.2f}, Std={result['std']:.2f}")
        print(f"   ‚ö†Ô∏è Outliers: {result['n_outliers']} ({result['outlier_pct']:.2f}%)")

print("\n" + "="*80)

PH√ÅT HI·ªÜN OUTLIERS - PH∆Ø∆†NG PH√ÅP IQR (1.5*IQR)

üìä Customer_Age:
   Q1=41.00, Q3=52.00, IQR=11.00
   Bounds: [24.50, 68.50]
   Outliers th·∫•p: 0
   Outliers cao: 2
   ‚ö†Ô∏è T·ªïng outliers: 2 (0.02%)

üìä Months_on_book:
   Q1=31.00, Q3=40.00, IQR=9.00
   Bounds: [17.50, 53.50]
   Outliers th·∫•p: 188
   Outliers cao: 198
   ‚ö†Ô∏è T·ªïng outliers: 386 (3.81%)

üìä Months_Inactive_12_mon:
   Q1=2.00, Q3=3.00, IQR=1.00
   Bounds: [0.50, 4.50]
   Outliers th·∫•p: 29
   Outliers cao: 302
   ‚ö†Ô∏è T·ªïng outliers: 331 (3.27%)

üìä Contacts_Count_12_mon:
   Q1=2.00, Q3=3.00, IQR=1.00
   Bounds: [0.50, 4.50]
   Outliers th·∫•p: 399
   Outliers cao: 230
   ‚ö†Ô∏è T·ªïng outliers: 629 (6.21%)

üìä Credit_Limit:
   Q1=2555.00, Q3=11067.50, IQR=8512.50
   Bounds: [-10213.75, 23836.25]
   Outliers th·∫•p: 0
   Outliers cao: 984
   ‚ö†Ô∏è T·ªïng outliers: 984 (9.72%)

üìä Avg_Open_To_Buy:
   Q1=1324.50, Q3=9859.00, IQR=8534.50
   Bounds: [-11477.25, 22660.75]
   Outliers th·∫•p: 0


In [58]:
# X√°c nh·∫≠n kh√¥ng c√≥ missing values trong numerical data
print("="*80)
print("KI·ªÇM TRA MISSING VALUES")
print("="*80)

print("\nüìä Numerical Data:")
for idx, col_name in enumerate(numerical_cols):
    nan_count = np.sum(np.isnan(numerical_data[:, idx]))
    print(f"  {col_name}: {nan_count} NaN values")

print("\nüìã Categorical Data:")
for idx, col_name in enumerate(categorical_cols):
    unknown_count = np.sum(categorical_data[:, idx] == 'Unknown')
    if unknown_count > 0:
        print(f"  {col_name}: {unknown_count} 'Unknown' values ({unknown_count/len(categorical_data)*100:.2f}%)")
    else:
        print(f"  {col_name}: 0 missing values")

KI·ªÇM TRA MISSING VALUES

üìä Numerical Data:
  Customer_Age: 0 NaN values
  Dependent_count: 0 NaN values
  Months_on_book: 0 NaN values
  Total_Relationship_Count: 0 NaN values
  Months_Inactive_12_mon: 0 NaN values
  Contacts_Count_12_mon: 0 NaN values
  Credit_Limit: 0 NaN values
  Total_Revolving_Bal: 0 NaN values
  Avg_Open_To_Buy: 0 NaN values
  Total_Amt_Chng_Q4_Q1: 0 NaN values
  Total_Trans_Amt: 0 NaN values
  Total_Trans_Ct: 0 NaN values
  Total_Ct_Chng_Q4_Q1: 0 NaN values
  Avg_Utilization_Ratio: 0 NaN values

üìã Categorical Data:
  Gender: 0 missing values
  Education_Level: 1519 'Unknown' values (15.00%)
  Marital_Status: 749 'Unknown' values (7.40%)
  Income_Category: 1112 'Unknown' values (10.98%)
  Card_Category: 0 missing values


---
## üîß B∆Ø·ªöC 5: FEATURE ENGINEERING

D·ª±a tr√™n insights t·ª´ EDA, t·∫°o c√°c features m·ªõi:
1. **Lo·∫°i b·ªè multicollinear feature**: `Avg_Open_To_Buy` (r=0.996 v·ªõi Credit_Limit)
2. **T·∫°o features m·ªõi** t·ª´ existing features
3. **Encode categorical variables**

In [59]:
# ============================================================================
# 5.1: LO·∫†I B·ªé MULTICOLLINEAR FEATURE
# ============================================================================

# Avg_Open_To_Buy c√≥ correlation 0.996 v·ªõi Credit_Limit -> lo·∫°i b·ªè
# Index c·ªßa Avg_Open_To_Buy trong numerical_data l√† 8

print("="*80)
print("LO·∫†I B·ªé MULTICOLLINEAR FEATURE")
print("="*80)

avg_open_to_buy_idx = numerical_cols.index('Avg_Open_To_Buy')
print(f"\n‚ùå Lo·∫°i b·ªè: {numerical_cols[avg_open_to_buy_idx]} (correlation 0.996 v·ªõi Credit_Limit)")

# T·∫°o numerical_data m·ªõi kh√¥ng c√≥ Avg_Open_To_Buy
indices_to_keep = [i for i in range(numerical_data.shape[1]) if i != avg_open_to_buy_idx]
numerical_data_clean = numerical_data[:, indices_to_keep]
numerical_cols_clean = [col for i, col in enumerate(numerical_cols) if i != avg_open_to_buy_idx]

print(f"‚úÖ Shape tr∆∞·ªõc: {numerical_data.shape}")
print(f"‚úÖ Shape sau: {numerical_data_clean.shape}")
print(f"‚úÖ S·ªë features c√≤n l·∫°i: {len(numerical_cols_clean)}")
print("="*80)

LO·∫†I B·ªé MULTICOLLINEAR FEATURE

‚ùå Lo·∫°i b·ªè: Avg_Open_To_Buy (correlation 0.996 v·ªõi Credit_Limit)
‚úÖ Shape tr∆∞·ªõc: (10127, 14)
‚úÖ Shape sau: (10127, 13)
‚úÖ S·ªë features c√≤n l·∫°i: 13


In [60]:
# ============================================================================
# 5.2: T·∫†O C√ÅC FEATURES M·ªöI
# ============================================================================

print("\n" + "="*80)
print("T·∫†O C√ÅC FEATURES M·ªöI")
print("="*80)

# Helper function ƒë·ªÉ tr√°nh chia cho 0
def safe_divide(numerator, denominator, default=0):
    """Chia an to√†n, tr·∫£ v·ªÅ default n·∫øu denominator = 0"""
    result = np.zeros_like(numerator, dtype=float)
    mask = denominator != 0
    result[mask] = numerator[mask] / denominator[mask]
    result[~mask] = default
    return result

# Get indices c·ªßa c√°c columns c·∫ßn thi·∫øt
def get_col_idx(col_name):
    return numerical_cols_clean.index(col_name)

# Feature 1: Avg_Transaction_Value = Total_Trans_Amt / Total_Trans_Ct
trans_amt = numerical_data_clean[:, get_col_idx('Total_Trans_Amt')]
trans_ct = numerical_data_clean[:, get_col_idx('Total_Trans_Ct')]
avg_trans_value = safe_divide(trans_amt, trans_ct, default=0)
print("\n‚úÖ Feature 1: Avg_Transaction_Value")
print(f"   Range: [{np.min(avg_trans_value):.2f}, {np.max(avg_trans_value):.2f}]")
print(f"   Mean: {np.mean(avg_trans_value):.2f}")

# Feature 2: Credit_Utilization_Ratio = Total_Revolving_Bal / Credit_Limit
revolving_bal = numerical_data_clean[:, get_col_idx('Total_Revolving_Bal')]
credit_limit = numerical_data_clean[:, get_col_idx('Credit_Limit')]
credit_util = safe_divide(revolving_bal, credit_limit, default=0)
print("\n‚úÖ Feature 2: Credit_Utilization_Ratio")
print(f"   Range: [{np.min(credit_util):.2f}, {np.max(credit_util):.2f}]")
print(f"   Mean: {np.mean(credit_util):.2f}")

# Feature 3: Transaction_Frequency = Total_Trans_Ct / Months_on_book
months_on_book = numerical_data_clean[:, get_col_idx('Months_on_book')]
trans_frequency = safe_divide(trans_ct, months_on_book, default=0)
print("\n‚úÖ Feature 3: Transaction_Frequency (trans per month)")
print(f"   Range: [{np.min(trans_frequency):.2f}, {np.max(trans_frequency):.2f}]")
print(f"   Mean: {np.mean(trans_frequency):.2f}")

# Feature 4: Activity_Score = Total_Trans_Ct * (1 - Months_Inactive_12_mon/12)
months_inactive = numerical_data_clean[:, get_col_idx('Months_Inactive_12_mon')]
activity_score = trans_ct * (1 - months_inactive / 12)
print("\n‚úÖ Feature 4: Activity_Score")
print(f"   Range: [{np.min(activity_score):.2f}, {np.max(activity_score):.2f}]")
print(f"   Mean: {np.mean(activity_score):.2f}")

# Feature 5: Relationship_Age_Ratio = Total_Relationship_Count / (Months_on_book/12)
relationship_count = numerical_data_clean[:, get_col_idx('Total_Relationship_Count')]
years_on_book = months_on_book / 12
relationship_age_ratio = safe_divide(relationship_count, years_on_book, default=0)
print("\n‚úÖ Feature 5: Relationship_Age_Ratio (products per year)")
print(f"   Range: [{np.min(relationship_age_ratio):.2f}, {np.max(relationship_age_ratio):.2f}]")
print(f"   Mean: {np.mean(relationship_age_ratio):.2f}")

# Feature 6: Customer_Age_Normalized (tu·ªïi so v·ªõi th·ªùi gian quan h·ªá)
customer_age = numerical_data_clean[:, get_col_idx('Customer_Age')]
age_when_joined = customer_age - (months_on_book / 12)
print("\n‚úÖ Feature 6: Age_When_Joined")
print(f"   Range: [{np.min(age_when_joined):.2f}, {np.max(age_when_joined):.2f}]")
print(f"   Mean: {np.mean(age_when_joined):.2f}")

# G·ªôp t·∫•t c·∫£ features m·ªõi v√†o array
new_features = np.column_stack([
    avg_trans_value,
    credit_util,
    trans_frequency,
    activity_score,
    relationship_age_ratio,
    age_when_joined
])

new_feature_names = [
    'Avg_Transaction_Value',
    'Credit_Utilization_Ratio',
    'Transaction_Frequency',
    'Activity_Score',
    'Relationship_Age_Ratio',
    'Age_When_Joined'
]

# K·∫øt h·ª£p v·ªõi numerical data g·ªëc
numerical_data_engineered = np.column_stack([numerical_data_clean, new_features])
numerical_cols_engineered = numerical_cols_clean + new_feature_names

print(f"\n{'='*80}")
print(f"‚úÖ ƒê√£ t·∫°o {len(new_feature_names)} features m·ªõi")
print(f"‚úÖ T·ªïng s·ªë numerical features: {numerical_data_engineered.shape[1]}")
print(f"{'='*80}")


T·∫†O C√ÅC FEATURES M·ªöI

‚úÖ Feature 1: Avg_Transaction_Value
   Range: [19.14, 190.19]
   Mean: 62.61

‚úÖ Feature 2: Credit_Utilization_Ratio
   Range: [0.00, 1.00]
   Mean: 0.27

‚úÖ Feature 3: Transaction_Frequency (trans per month)
   Range: [0.19, 9.77]
   Mean: 1.92

‚úÖ Feature 4: Activity_Score
   Range: [7.50, 120.08]
   Mean: 52.29

‚úÖ Feature 5: Relationship_Age_Ratio (products per year)
   Range: [0.21, 5.54]
   Mean: 1.36

‚úÖ Feature 6: Age_When_Joined
   Range: [23.00, 70.00]
   Mean: 43.33

‚úÖ ƒê√£ t·∫°o 6 features m·ªõi
‚úÖ T·ªïng s·ªë numerical features: 19


---
## üî¢ B∆Ø·ªöC 6: CHU·∫®N H√ìA (NORMALIZATION) & ƒêI·ªÄU CHU·∫®N (STANDARDIZATION)

### Chi·∫øn l∆∞·ª£c:
1. **Normalization (Min-Max, Log Transform)**: Cho features c√≥ distribution kh√¥ng Gaussian ho·∫∑c c√≥ outliers
2. **Standardization (Z-score)**: Cho features c√≥ distribution g·∫ßn Gaussian

Ki·ªÉm tra distribution c·ªßa t·ª´ng feature ƒë·ªÉ quy·∫øt ƒë·ªãnh ph∆∞∆°ng ph√°p ph√π h·ª£p.

In [61]:
# ============================================================================
# 6.1: KI·ªÇM TRA DISTRIBUTION (SKEWNESS)
# ============================================================================

from scipy.stats import skew, kurtosis

print("="*80)
print("PH√ÇN T√çCH DISTRIBUTION C·ª¶A C√ÅC FEATURES")
print("="*80)

skewness_results = []
for idx, col_name in enumerate(numerical_cols_engineered):
    col_data = numerical_data_engineered[:, idx]
    
    # T√≠nh skewness v√† kurtosis
    skewness = skew(col_data)
    kurt = kurtosis(col_data)
    
    # Ph√¢n lo·∫°i distribution
    if abs(skewness) < 0.5:
        dist_type = "Gaussian-like"
        method = "Standardization (Z-score)"
    elif abs(skewness) < 1.0:
        dist_type = "Moderate skew"
        method = "Normalization (Min-Max)"
    else:
        dist_type = "Highly skewed"
        method = "Log Transform + Standardization"
    
    skewness_results.append({
        'column': col_name,
        'skewness': skewness,
        'kurtosis': kurt,
        'distribution': dist_type,
        'recommended_method': method
    })
    
    print(f"\nüìä {col_name}:")
    print(f"   Skewness: {skewness:.3f}")
    print(f"   Kurtosis: {kurt:.3f}")
    print(f"   Distribution: {dist_type}")
    print(f"   ‚û°Ô∏è Recommended: {method}")

print("\n" + "="*80)

PH√ÇN T√çCH DISTRIBUTION C·ª¶A C√ÅC FEATURES

üìä Customer_Age:
   Skewness: -0.034
   Kurtosis: -0.289
   Distribution: Gaussian-like
   ‚û°Ô∏è Recommended: Standardization (Z-score)

üìä Dependent_count:
   Skewness: -0.021
   Kurtosis: -0.683
   Distribution: Gaussian-like
   ‚û°Ô∏è Recommended: Standardization (Z-score)

üìä Months_on_book:
   Skewness: -0.107
   Kurtosis: 0.399
   Distribution: Gaussian-like
   ‚û°Ô∏è Recommended: Standardization (Z-score)

üìä Total_Relationship_Count:
   Skewness: -0.162
   Kurtosis: -1.006
   Distribution: Gaussian-like
   ‚û°Ô∏è Recommended: Standardization (Z-score)

üìä Months_Inactive_12_mon:
   Skewness: 0.633
   Kurtosis: 1.097
   Distribution: Moderate skew
   ‚û°Ô∏è Recommended: Normalization (Min-Max)

üìä Contacts_Count_12_mon:
   Skewness: 0.011
   Kurtosis: 0.000
   Distribution: Gaussian-like
   ‚û°Ô∏è Recommended: Standardization (Z-score)

üìä Credit_Limit:
   Skewness: 1.666
   Kurtosis: 1.808
   Distribution: Highly skew

In [62]:
# ============================================================================
# 6.2: IMPLEMENT NORMALIZATION & STANDARDIZATION FUNCTIONS
# ============================================================================

def min_max_normalization(data, feature_min=None, feature_max=None):
    """
    Min-Max Normalization: scale to [0, 1]
    X_norm = (X - X_min) / (X_max - X_min)
    """
    if feature_min is None:
        feature_min = np.min(data)
    if feature_max is None:
        feature_max = np.max(data)
    
    # Tr√°nh chia cho 0
    if feature_max - feature_min == 0:
        return np.zeros_like(data)
    
    return (data - feature_min) / (feature_max - feature_min)

def log_transformation(data, offset=1):
    """
    Log Transformation: log(X + offset)
    Offset ƒë·ªÉ tr√°nh log(0)
    """
    return np.log(data + offset)

def decimal_scaling(data):
    """
    Decimal Scaling: X / 10^d
    d l√† s·ªë ch·ªØ s·ªë c·ªßa max(|X|)
    """
    max_abs = np.max(np.abs(data))
    if max_abs == 0:
        return data
    d = np.ceil(np.log10(max_abs))
    return data / (10 ** d)

def standardization(data, mean=None, std=None):
    """
    Z-score Standardization: (X - mean) / std
    K·∫øt qu·∫£ c√≥ mean=0, std=1
    """
    if mean is None:
        mean = np.mean(data)
    if std is None:
        std = np.std(data)
    
    # Tr√°nh chia cho 0
    if std == 0:
        return np.zeros_like(data)
    
    return (data - mean) / std

def robust_scaling(data, q1=None, q3=None):
    """
    Robust Scaling: (X - median) / IQR
    Robust v·ªõi outliers
    """
    median = np.median(data)
    if q1 is None:
        q1 = np.percentile(data, 25)
    if q3 is None:
        q3 = np.percentile(data, 75)
    
    iqr = q3 - q1
    if iqr == 0:
        return np.zeros_like(data)
    
    return (data - median) / iqr

print("‚úÖ ƒê√£ ƒë·ªãnh nghƒ©a c√°c h√†m normalization & standardization:")
print("   1. min_max_normalization() - Scale to [0,1]")
print("   2. log_transformation() - Log transform")
print("   3. decimal_scaling() - Decimal scaling")
print("   4. standardization() - Z-score (mean=0, std=1)")
print("   5. robust_scaling() - Robust v·ªõi outliers")

‚úÖ ƒê√£ ƒë·ªãnh nghƒ©a c√°c h√†m normalization & standardization:
   1. min_max_normalization() - Scale to [0,1]
   2. log_transformation() - Log transform
   3. decimal_scaling() - Decimal scaling
   4. standardization() - Z-score (mean=0, std=1)
   5. robust_scaling() - Robust v·ªõi outliers


In [63]:
# ============================================================================
# 6.3: √ÅP D·ª§NG NORMALIZATION/STANDARDIZATION CHO T·ª™NG FEATURE
# ============================================================================

print("\n" + "="*80)
print("√ÅP D·ª§NG SCALING CHO C√ÅC FEATURES")
print("="*80)

# T·∫°o copy c·ªßa data ƒë·ªÉ scale
numerical_data_scaled = np.copy(numerical_data_engineered).astype(float)

scaling_log = []

for idx, col_name in enumerate(numerical_cols_engineered):
    col_data = numerical_data_engineered[:, idx]
    skewness = skew(col_data)
    
    # Quy·∫øt ƒë·ªãnh ph∆∞∆°ng ph√°p d·ª±a tr√™n skewness
    if abs(skewness) < 0.5:
        # Gaussian-like: Standardization
        scaled_data = standardization(col_data)
        method = "Standardization (Z-score)"
    elif abs(skewness) < 1.0:
        # Moderate skew: Min-Max
        scaled_data = min_max_normalization(col_data)
        method = "Min-Max Normalization"
    else:
        # Highly skewed: Log + Standardization
        # Ki·ªÉm tra xem c√≥ gi√° tr·ªã ‚â§ 0 kh√¥ng
        if np.min(col_data) > 0:
            log_data = log_transformation(col_data, offset=0)
            scaled_data = standardization(log_data)
            method = "Log Transform + Standardization"
        else:
            # N·∫øu c√≥ gi√° tr·ªã ‚â§ 0, d√πng Robust Scaling
            scaled_data = robust_scaling(col_data)
            method = "Robust Scaling"
    
    # Update data
    numerical_data_scaled[:, idx] = scaled_data
    
    scaling_log.append({
        'column': col_name,
        'method': method,
        'skewness_before': skewness,
        'skewness_after': skew(scaled_data),
        'mean_after': np.mean(scaled_data),
        'std_after': np.std(scaled_data)
    })
    
    print(f"\nüìä {col_name}:")
    print(f"   Method: {method}")
    print(f"   Skewness: {skewness:.3f} ‚Üí {skew(scaled_data):.3f}")
    print(f"   Mean: {np.mean(col_data):.2f} ‚Üí {np.mean(scaled_data):.3f}")
    print(f"   Std: {np.std(col_data):.2f} ‚Üí {np.std(scaled_data):.3f}")

print("\n" + "="*80)
print("‚úÖ ƒê√É HO√ÄN TH√ÄNH SCALING CHO T·∫§T C·∫¢ FEATURES")
print(f"‚úÖ Shape: {numerical_data_scaled.shape}")
print("="*80)


√ÅP D·ª§NG SCALING CHO C√ÅC FEATURES

üìä Customer_Age:
   Method: Standardization (Z-score)
   Skewness: -0.034 ‚Üí -0.034
   Mean: 46.33 ‚Üí 0.000
   Std: 8.02 ‚Üí 1.000

üìä Dependent_count:
   Method: Standardization (Z-score)
   Skewness: -0.021 ‚Üí -0.021
   Mean: 2.35 ‚Üí -0.000
   Std: 1.30 ‚Üí 1.000

üìä Months_on_book:
   Method: Standardization (Z-score)
   Skewness: -0.107 ‚Üí -0.107
   Mean: 35.93 ‚Üí -0.000
   Std: 7.99 ‚Üí 1.000

üìä Total_Relationship_Count:
   Method: Standardization (Z-score)
   Skewness: -0.162 ‚Üí -0.162
   Mean: 3.81 ‚Üí -0.000
   Std: 1.55 ‚Üí 1.000

üìä Months_Inactive_12_mon:
   Method: Min-Max Normalization
   Skewness: 0.633 ‚Üí 0.633
   Mean: 2.34 ‚Üí 0.390
   Std: 1.01 ‚Üí 0.168

üìä Contacts_Count_12_mon:
   Method: Standardization (Z-score)
   Skewness: 0.011 ‚Üí 0.011
   Mean: 2.46 ‚Üí -0.000
   Std: 1.11 ‚Üí 1.000

üìä Credit_Limit:
   Method: Log Transform + Standardization
   Skewness: 1.666 ‚Üí 0.457
   Mean: 8631.95 ‚Üí 0.000

---
## üè∑Ô∏è B∆Ø·ªöC 7: ENCODE CATEGORICAL VARIABLES

S·ª≠ d·ª•ng **Label Encoding** v√† **One-Hot Encoding** (manual implementation without sklearn)

In [64]:
# ============================================================================
# 7.1: ONE-HOT ENCODING CHO CATEGORICAL FEATURES
# ============================================================================

def one_hot_encode(data, column_idx):
    """
    One-hot encode m·ªôt categorical column
    Returns: encoded array (n_samples, n_categories), category names
    """
    col_data = data[:, column_idx]
    unique_values = np.unique(col_data)
    n_categories = len(unique_values)
    
    # T·∫°o one-hot encoded matrix
    encoded = np.zeros((len(col_data), n_categories))
    
    for i, val in enumerate(unique_values):
        mask = col_data == val
        encoded[mask, i] = 1
    
    return encoded, unique_values

print("="*80)
print("ONE-HOT ENCODING CHO CATEGORICAL FEATURES")
print("="*80)

encoded_arrays = []
encoded_feature_names = []

for idx, col_name in enumerate(categorical_cols):
    encoded, categories = one_hot_encode(categorical_data, idx)
    encoded_arrays.append(encoded)
    
    # T·∫°o t√™n cho t·ª´ng encoded feature
    feature_names = [f"{col_name}_{cat}" for cat in categories]
    encoded_feature_names.extend(feature_names)
    
    print(f"\nüìã {col_name}:")
    print(f"   Categories: {len(categories)}")
    print(f"   Encoded shape: {encoded.shape}")
    print(f"   Feature names: {feature_names}")

# Concatenate t·∫•t c·∫£ encoded arrays
categorical_data_encoded = np.concatenate(encoded_arrays, axis=1)

print(f"\n{'='*80}")
print(f"‚úÖ T·ªïng s·ªë categorical features sau encoding: {categorical_data_encoded.shape[1]}")
print(f"‚úÖ Shape: {categorical_data_encoded.shape}")
print(f"{'='*80}")

ONE-HOT ENCODING CHO CATEGORICAL FEATURES

üìã Gender:
   Categories: 2
   Encoded shape: (10127, 2)
   Feature names: ['Gender_F', 'Gender_M']

üìã Education_Level:
   Categories: 7
   Encoded shape: (10127, 7)
   Feature names: ['Education_Level_College', 'Education_Level_Doctorate', 'Education_Level_Graduate', 'Education_Level_High School', 'Education_Level_Post-Graduate', 'Education_Level_Uneducated', 'Education_Level_Unknown']

üìã Marital_Status:
   Categories: 4
   Encoded shape: (10127, 4)
   Feature names: ['Marital_Status_Divorced', 'Marital_Status_Married', 'Marital_Status_Single', 'Marital_Status_Unknown']

üìã Income_Category:
   Categories: 6
   Encoded shape: (10127, 6)
   Feature names: ['Income_Category_$120K +', 'Income_Category_$40K - $60K', 'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K', 'Income_Category_Less than $40K', 'Income_Category_Unknown']

üìã Card_Category:
   Categories: 4
   Encoded shape: (10127, 4)
   Feature names: ['Card_Category_

In [65]:
# ============================================================================
# 7.2: K·∫æT H·ª¢P T·∫§T C·∫¢ FEATURES V√Ä T·∫†O FINAL DATASET
# ============================================================================

print("\n" + "="*80)
print("T·∫†O FINAL PREPROCESSED DATASET")
print("="*80)

# K·∫øt h·ª£p numerical (scaled) v√† categorical (encoded)
X_final = np.concatenate([numerical_data_scaled, categorical_data_encoded], axis=1)
y_final = target

# T·∫°o list t√™n features
all_feature_names = numerical_cols_engineered + encoded_feature_names

print(f"\n‚úÖ Final dataset shape: {X_final.shape}")
print(f"   - Numerical features: {len(numerical_cols_engineered)}")
print(f"   - Categorical features (encoded): {len(encoded_feature_names)}")
print(f"   - Total features: {len(all_feature_names)}")
print(f"\n‚úÖ Target shape: {y_final.shape}")
print(f"   - Class 0 (Existing): {np.sum(y_final == 0)} ({np.sum(y_final == 0)/len(y_final)*100:.2f}%)")
print(f"   - Class 1 (Attrited): {np.sum(y_final == 1)} ({np.sum(y_final == 1)/len(y_final)*100:.2f}%)")

print("\n" + "="*80)
print("‚úÖ ƒê√É HO√ÄN TH√ÄNH PREPROCESSING!")
print("="*80)


T·∫†O FINAL PREPROCESSED DATASET

‚úÖ Final dataset shape: (10127, 42)
   - Numerical features: 19
   - Categorical features (encoded): 23
   - Total features: 42

‚úÖ Target shape: (10127,)
   - Class 0 (Existing): 8500 (83.93%)
   - Class 1 (Attrited): 1627 (16.07%)

‚úÖ ƒê√É HO√ÄN TH√ÄNH PREPROCESSING!
   - Class 0 (Existing): 8500 (83.93%)
   - Class 1 (Attrited): 1627 (16.07%)

‚úÖ ƒê√É HO√ÄN TH√ÄNH PREPROCESSING!


---
## üìä B∆Ø·ªöC 8: T√çNH TO√ÅN TH·ªêNG K√ä M√î T·∫¢ V·ªöI ERROR HANDLING

T√≠nh to√°n c√°c th·ªëng k√™ m√¥ t·∫£ v·ªõi x·ª≠ l√Ω numerical stability

In [66]:
# ============================================================================
# T√çNH TO√ÅN TH·ªêNG K√ä V·ªöI NUMERICAL STABILITY
# ============================================================================

def safe_mean(data):
    """T√≠nh mean v·ªõi error handling"""
    try:
        if len(data) == 0:
            return np.nan
        return np.mean(data)
    except Exception as e:
        print(f"Error in safe_mean: {e}")
        return np.nan

def safe_std(data):
    """T√≠nh standard deviation v·ªõi error handling"""
    try:
        if len(data) <= 1:
            return np.nan
        return np.std(data, ddof=1)  # Sample std
    except Exception as e:
        print(f"Error in safe_std: {e}")
        return np.nan

def safe_percentile(data, q):
    """T√≠nh percentile v·ªõi error handling"""
    try:
        if len(data) == 0:
            return np.nan
        return np.percentile(data, q)
    except Exception as e:
        print(f"Error in safe_percentile: {e}")
        return np.nan

def safe_correlation(x, y):
    """T√≠nh correlation v·ªõi error handling"""
    try:
        if len(x) != len(y) or len(x) == 0:
            return np.nan
        
        # Lo·∫°i b·ªè NaN values
        mask = ~(np.isnan(x) | np.isnan(y))
        x_clean = x[mask]
        y_clean = y[mask]
        
        if len(x_clean) < 2:
            return np.nan
        
        # T√≠nh correlation
        x_mean = np.mean(x_clean)
        y_mean = np.mean(y_clean)
        
        numerator = np.sum((x_clean - x_mean) * (y_clean - y_mean))
        denominator = np.sqrt(np.sum((x_clean - x_mean)**2) * np.sum((y_clean - y_mean)**2))
        
        if denominator == 0:
            return np.nan
        
        return numerator / denominator
    except Exception as e:
        print(f"Error in safe_correlation: {e}")
        return np.nan

print("="*80)
print("TH·ªêNG K√ä M√î T·∫¢ CHO PREPROCESSED DATA")
print("="*80)

# Th·ªëng k√™ cho numerical features (top 10 ƒë·ªÉ tr√°nh qu√° d√†i)
print("\nüìä Th·ªëng k√™ m√¥ t·∫£ cho 10 features ƒë·∫ßu ti√™n:")
print("-" * 80)
print(f"{'Feature':<30} {'Mean':>10} {'Std':>10} {'Min':>10} {'Max':>10}")
print("-" * 80)

for idx in range(min(10, X_final.shape[1])):
    col_data = X_final[:, idx]
    feature_name = all_feature_names[idx] if idx < len(all_feature_names) else f"Feature_{idx}"
    
    mean_val = safe_mean(col_data)
    std_val = safe_std(col_data)
    min_val = np.min(col_data) if len(col_data) > 0 else np.nan
    max_val = np.max(col_data) if len(col_data) > 0 else np.nan
    
    print(f"{feature_name:<30} {mean_val:>10.3f} {std_val:>10.3f} {min_val:>10.3f} {max_val:>10.3f}")

print("-" * 80)
print("\n‚úÖ T·∫•t c·∫£ ph√©p t√≠nh ƒë·ªÅu c√≥ error handling ƒë·ªÉ tr√°nh division by zero v√† numerical instability")
print("="*80)

TH·ªêNG K√ä M√î T·∫¢ CHO PREPROCESSED DATA

üìä Th·ªëng k√™ m√¥ t·∫£ cho 10 features ƒë·∫ßu ti√™n:
--------------------------------------------------------------------------------
Feature                              Mean        Std        Min        Max
--------------------------------------------------------------------------------
Customer_Age                        0.000      1.000     -2.536      3.327
Dependent_count                    -0.000      1.000     -1.806      2.043
Months_on_book                     -0.000      1.000     -2.871      2.513
Total_Relationship_Count           -0.000      1.000     -1.810      1.407
Months_Inactive_12_mon              0.390      0.168      0.000      1.000
Contacts_Count_12_mon              -0.000      1.000     -2.220      3.204
Credit_Limit                        0.000      1.000     -1.427      1.977
Total_Revolving_Bal                -0.000      1.000     -1.427      1.662
Total_Amt_Chng_Q4_Q1                0.105      0.961     -3.228

---
## üî¨ B∆Ø·ªöC 9: KI·ªÇM ƒê·ªäNH GI·∫¢ THI·∫æT TH·ªêNG K√ä (HYPOTHESIS TESTING)

### Test 1: Gi·ªõi t√≠nh c√≥ ·∫£nh h∆∞·ªüng ƒë·∫øn churn rate kh√¥ng?
- **H0 (Null Hypothesis)**: T·ª∑ l·ªá churn c·ªßa Female = T·ª∑ l·ªá churn c·ªßa Male
- **H1 (Alternative Hypothesis)**: T·ª∑ l·ªá churn c·ªßa Female ‚â† T·ª∑ l·ªá churn c·ªßa Male
- **Ph∆∞∆°ng ph√°p**: Two-proportion Z-test
- **Significance level**: Œ± = 0.05

In [67]:
# ============================================================================
# TEST 1: GENDER VS CHURN (Two-proportion Z-test)
# ============================================================================

print("="*80)
print("TEST 1: GENDER V√Ä CHURN RATE")
print("="*80)

# L·∫•y data Gender t·ª´ categorical_data
gender_col = categorical_data[:, categorical_cols.index('Gender')]

# T√°ch Female v√† Male
female_mask = gender_col == 'F'
male_mask = gender_col == 'M'

# Churn rates
female_churn = target[female_mask]
male_churn = target[male_mask]

n_female = len(female_churn)
n_male = len(male_churn)

p_female = np.mean(female_churn)
p_male = np.mean(male_churn)

print(f"\nüìä D·ªØ li·ªáu:")
print(f"   Female: n={n_female}, churn={np.sum(female_churn)}, rate={p_female:.4f} ({p_female*100:.2f}%)")
print(f"   Male:   n={n_male}, churn={np.sum(male_churn)}, rate={p_male:.4f} ({p_male*100:.2f}%)")

# Two-proportion Z-test (manual implementation)
p_pooled = (np.sum(female_churn) + np.sum(male_churn)) / (n_female + n_male)
se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n_female + 1/n_male))
z_stat = (p_female - p_male) / se
# Manual normal CDF approximation for p-value
p_value = 2 * (1 - 0.5 * (1 + np.tanh(abs(z_stat) * np.sqrt(2/np.pi))))  # Two-tailed test

print(f"\nüî¨ Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt:")
print(f"   H0: p_female = p_male")
print(f"   H1: p_female ‚â† p_male")
print(f"   Significance level: Œ± = 0.05")
print(f"\nüìà K·∫øt qu·∫£:")
print(f"   Z-statistic: {z_stat:.4f}")
print(f"   P-value: {p_value:.6f}")

if p_value < 0.05:
    print(f"\n‚úÖ K·∫æT LU·∫¨N: REJECT H0 (p={p_value:.6f} < 0.05)")
    print(f"   ‚Üí Female c√≥ t·ª∑ l·ªá churn KH√ÅC BI·ªÜT c√≥ √Ω nghƒ©a th·ªëng k√™ so v·ªõi Male")
    print(f"   ‚Üí Female churn cao h∆°n {(p_female - p_male)*100:.2f} ƒëi·ªÉm ph·∫ßn trƒÉm")
else:
    print(f"\n‚ùå K·∫æT LU·∫¨N: FAIL TO REJECT H0 (p={p_value:.6f} >= 0.05)")
    print(f"   ‚Üí Kh√¥ng c√≥ b·∫±ng ch·ª©ng cho th·∫•y Female v√† Male c√≥ t·ª∑ l·ªá churn kh√°c nhau")

print("="*80)

TEST 1: GENDER V√Ä CHURN RATE

üìä D·ªØ li·ªáu:
   Female: n=5358, churn=930, rate=0.1736 (17.36%)
   Male:   n=4769, churn=697, rate=0.1462 (14.62%)

üî¨ Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt:
   H0: p_female = p_male
   H1: p_female ‚â† p_male
   Significance level: Œ± = 0.05

üìà K·∫øt qu·∫£:
   Z-statistic: 3.7508
   P-value: 0.005018

‚úÖ K·∫æT LU·∫¨N: REJECT H0 (p=0.005018 < 0.05)
   ‚Üí Female c√≥ t·ª∑ l·ªá churn KH√ÅC BI·ªÜT c√≥ √Ω nghƒ©a th·ªëng k√™ so v·ªõi Male
   ‚Üí Female churn cao h∆°n 2.74 ƒëi·ªÉm ph·∫ßn trƒÉm


### Test 2: Card Category c√≥ ·∫£nh h∆∞·ªüng ƒë·∫øn churn kh√¥ng?
- **H0 (Null Hypothesis)**: Churn rate ƒë·ªôc l·∫≠p v·ªõi Card Category
- **H1 (Alternative Hypothesis)**: Churn rate ph·ª• thu·ªôc v√†o Card Category
- **Ph∆∞∆°ng ph√°p**: Chi-square test of independence
- **Significance level**: Œ± = 0.05

In [68]:
# ============================================================================
# TEST 2: CARD CATEGORY VS CHURN (Chi-square test)
# ============================================================================

print("\n" + "="*80)
print("TEST 2: CARD CATEGORY V√Ä CHURN RATE")
print("="*80)

# L·∫•y Card Category data
card_col = categorical_data[:, categorical_cols.index('Card_Category')]
card_categories = np.unique(card_col)

# T·∫°o contingency table
contingency_table = []
for category in card_categories:
    mask = card_col == category
    n_total = np.sum(mask)
    n_churn = np.sum(target[mask])
    n_existing = n_total - n_churn
    contingency_table.append([n_existing, n_churn])

contingency_table = np.array(contingency_table)

print(f"\nüìä Contingency Table:")
print(f"{'Category':<15} {'Existing':>10} {'Churned':>10} {'Total':>10} {'Churn %':>10}")
print("-" * 60)
for i, category in enumerate(card_categories):
    existing = contingency_table[i, 0]
    churned = contingency_table[i, 1]
    total = existing + churned
    churn_rate = churned / total * 100
    print(f"{category:<15} {existing:>10} {churned:>10} {total:>10} {churn_rate:>9.2f}%")

# Chi-square test (manual implementation)
chi2, p_value, dof, expected = chi_square_test(contingency_table)

print(f"\nüî¨ Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt:")
print(f"   H0: Churn rate ƒë·ªôc l·∫≠p v·ªõi Card Category")
print(f"   H1: Churn rate ph·ª• thu·ªôc v√†o Card Category")
print(f"   Significance level: Œ± = 0.05")
print(f"\nüìà K·∫øt qu·∫£:")
print(f"   Chi-square statistic: {chi2:.4f}")
print(f"   Degrees of freedom: {dof}")
print(f"   P-value: {p_value:.6f}")

if p_value < 0.05:
    print(f"\n‚úÖ K·∫æT LU·∫¨N: REJECT H0 (p={p_value:.6f} < 0.05)")
    print(f"   ‚Üí Churn rate PH·ª§ THU·ªòC v√†o Card Category (c√≥ √Ω nghƒ©a th·ªëng k√™)")
    print(f"   ‚Üí Premium cards (Gold, Platinum, Silver) c√≥ churn rate th·∫•p h∆°n Blue")
else:
    print(f"\n‚ùå K·∫æT LU·∫¨N: FAIL TO REJECT H0 (p={p_value:.6f} >= 0.05)")
    print(f"   ‚Üí Kh√¥ng c√≥ b·∫±ng ch·ª©ng cho th·∫•y Card Category ·∫£nh h∆∞·ªüng ƒë·∫øn churn")

print("="*80)


TEST 2: CARD CATEGORY V√Ä CHURN RATE

üìä Contingency Table:
Category          Existing    Churned      Total    Churn %
------------------------------------------------------------
Blue                  7917       1519       9436     16.10%
Gold                    95         21        116     18.10%
Platinum                15          5         20     25.00%
Silver                 473         82        555     14.77%

üî¨ Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt:
   H0: Churn rate ƒë·ªôc l·∫≠p v·ªõi Card Category
   H1: Churn rate ph·ª• thu·ªôc v√†o Card Category
   Significance level: Œ± = 0.05

üìà K·∫øt qu·∫£:
   Chi-square statistic: 2.2342
   Degrees of freedom: 3
   P-value: 0.525301

‚ùå K·∫æT LU·∫¨N: FAIL TO REJECT H0 (p=0.525301 >= 0.05)
   ‚Üí Kh√¥ng c√≥ b·∫±ng ch·ª©ng cho th·∫•y Card Category ·∫£nh h∆∞·ªüng ƒë·∫øn churn

TEST 2: CARD CATEGORY V√Ä CHURN RATE

üìä Contingency Table:
Category          Existing    Churned      Total    Churn %
-----------------------------------------

### Test 3: Transaction count c√≥ kh√°c bi·ªát gi·ªØa Existing v√† Churned customers?
- **H0 (Null Hypothesis)**: Mean transaction count c·ªßa Existing = Mean c·ªßa Churned
- **H1 (Alternative Hypothesis)**: Mean transaction count c·ªßa Existing > Mean c·ªßa Churned
- **Ph∆∞∆°ng ph√°p**: Independent two-sample t-test
- **Significance level**: Œ± = 0.05

In [69]:
# ============================================================================
# TEST 3: TRANSACTION COUNT - EXISTING VS CHURNED (T-test)
# ============================================================================

print("\n" + "="*80)
print("TEST 3: TRANSACTION COUNT - EXISTING VS CHURNED")
print("="*80)

# L·∫•y Total_Trans_Ct t·ª´ original data (ch∆∞a scale)
trans_ct_idx = numerical_cols_clean.index('Total_Trans_Ct')
trans_ct = numerical_data_clean[:, trans_ct_idx]

# T√°ch theo Existing vs Churned
existing_trans = trans_ct[target == 0]
churned_trans = trans_ct[target == 1]

print(f"\nüìä D·ªØ li·ªáu:")
print(f"   Existing Customers:")
print(f"      n = {len(existing_trans)}")
print(f"      Mean = {np.mean(existing_trans):.2f} transactions")
print(f"      Std = {np.std(existing_trans, ddof=1):.2f}")
print(f"\n   Churned Customers:")
print(f"      n = {len(churned_trans)}")
print(f"      Mean = {np.mean(churned_trans):.2f} transactions")
print(f"      Std = {np.std(churned_trans, ddof=1):.2f}")

# Difference
diff = np.mean(existing_trans) - np.mean(churned_trans)
pct_diff = diff / np.mean(churned_trans) * 100

print(f"\n   Difference: {diff:.2f} transactions ({pct_diff:.1f}% higher for Existing)")

# Independent t-test (manual implementation)
t_stat, p_value_two_tailed = t_test_independent(existing_trans, churned_trans)
p_value = p_value_two_tailed / 2  # One-tailed

print(f"\nüî¨ Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt:")
print(f"   H0: mean_existing = mean_churned")
print(f"   H1: mean_existing > mean_churned (one-tailed)")
print(f"   Significance level: Œ± = 0.05")
print(f"\nüìà K·∫øt qu·∫£:")
print(f"   T-statistic: {t_stat:.4f}")
print(f"   P-value (one-tailed): {p_value:.10f}")

if p_value < 0.05 and t_stat > 0:
    print(f"\n‚úÖ K·∫æT LU·∫¨N: REJECT H0 (p={p_value:.10f} < 0.05)")
    print(f"   ‚Üí Existing customers c√≥ transaction count CAO H∆†N c√≥ √Ω nghƒ©a th·ªëng k√™")
    print(f"   ‚Üí Customers ho·∫°t ƒë·ªông nhi·ªÅu h∆°n c√≥ XU H∆Ø·ªöNG GI·ªÆ CH√ÇN t·ªët h∆°n")
    print(f"   ‚Üí Business insight: Transaction frequency l√† ch·ªâ s·ªë quan tr·ªçng ƒë·ªÉ predict churn")
else:
    print(f"\n‚ùå K·∫æT LU·∫¨N: FAIL TO REJECT H0")
    print(f"   ‚Üí Kh√¥ng c√≥ b·∫±ng ch·ª©ng th·ªëng k√™")

print("="*80)


TEST 3: TRANSACTION COUNT - EXISTING VS CHURNED

üìä D·ªØ li·ªáu:
   Existing Customers:
      n = 8500
      Mean = 68.67 transactions
      Std = 22.92

   Churned Customers:
      n = 1627
      Mean = 44.93 transactions
      Std = 14.57

   Difference: 23.74 transactions (52.8% higher for Existing)

üî¨ Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt:
   H0: mean_existing = mean_churned
   H1: mean_existing > mean_churned (one-tailed)
   Significance level: Œ± = 0.05

üìà K·∫øt qu·∫£:
   T-statistic: 54.1419
   P-value (one-tailed): 0.0000000000

‚úÖ K·∫æT LU·∫¨N: REJECT H0 (p=0.0000000000 < 0.05)
   ‚Üí Existing customers c√≥ transaction count CAO H∆†N c√≥ √Ω nghƒ©a th·ªëng k√™
   ‚Üí Customers ho·∫°t ƒë·ªông nhi·ªÅu h∆°n c√≥ XU H∆Ø·ªöNG GI·ªÆ CH√ÇN t·ªët h∆°n
   ‚Üí Business insight: Transaction frequency l√† ch·ªâ s·ªë quan tr·ªçng ƒë·ªÉ predict churn


---
## üíæ B∆Ø·ªöC 10: L∆ØU PREPROCESSED DATA

L∆∞u d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω ƒë·ªÉ s·ª≠ d·ª•ng cho modeling

In [70]:
# ============================================================================
# L∆ØU PREPROCESSED DATA
# ============================================================================

import os

# T·∫°o th∆∞ m·ª•c processed n·∫øu ch∆∞a c√≥
processed_dir = '../data/processed'
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)
    print(f"‚úÖ ƒê√£ t·∫°o th∆∞ m·ª•c: {processed_dir}")

# L∆∞u features v√† target
np.save(os.path.join(processed_dir, 'X_preprocessed.npy'), X_final)
np.save(os.path.join(processed_dir, 'y_target.npy'), y_final)

# L∆∞u feature names
with open(os.path.join(processed_dir, 'feature_names.txt'), 'w', encoding='utf-8') as f:
    for name in all_feature_names:
        f.write(name + '\n')

# L∆∞u preprocessing metadata
metadata = {
    'n_samples': X_final.shape[0],
    'n_features': X_final.shape[1],
    'n_numerical': len(numerical_cols_engineered),
    'n_categorical_encoded': len(encoded_feature_names),
    'target_distribution': {
        'existing': int(np.sum(y_final == 0)),
        'churned': int(np.sum(y_final == 1))
    }
}

with open(os.path.join(processed_dir, 'metadata.txt'), 'w', encoding='utf-8') as f:
    f.write("PREPROCESSING METADATA\n")
    f.write("="*50 + "\n")
    for key, value in metadata.items():
        f.write(f"{key}: {value}\n")

print("\n" + "="*80)
print("üíæ ƒê√É L∆ØU PREPROCESSED DATA")
print("="*80)
print(f"\nüìÅ Files ƒë√£ l∆∞u:")
print(f"   ‚úÖ {processed_dir}/X_preprocessed.npy - Shape: {X_final.shape}")
print(f"   ‚úÖ {processed_dir}/y_target.npy - Shape: {y_final.shape}")
print(f"   ‚úÖ {processed_dir}/feature_names.txt - {len(all_feature_names)} features")
print(f"   ‚úÖ {processed_dir}/metadata.txt - Preprocessing info")
print("\n" + "="*80)


üíæ ƒê√É L∆ØU PREPROCESSED DATA

üìÅ Files ƒë√£ l∆∞u:
   ‚úÖ ../data/processed/X_preprocessed.npy - Shape: (10127, 42)
   ‚úÖ ../data/processed/y_target.npy - Shape: (10127,)
   ‚úÖ ../data/processed/feature_names.txt - 42 features
   ‚úÖ ../data/processed/metadata.txt - Preprocessing info



---
---

# üéâ T·ªîNG K·∫æT PREPROCESSING

## ‚úÖ ƒê√É HO√ÄN TH√ÄNH T·∫§T C·∫¢ C√ÅC B∆Ø·ªöC:

### 1. ‚úÖ **Ki·ªÉm tra t√≠nh h·ª£p l·ªá c·ªßa gi√° tr·ªã**
- Validated 14 numerical columns (t·∫•t c·∫£ h·ª£p l·ªá)
- Validated 5 categorical columns
- Ph√°t hi·ªán 'Unknown' values trong 3 c·ªôt (Education, Income, Marital)

### 2. ‚úÖ **X√°c ƒë·ªãnh v√† x·ª≠ l√Ω outliers**
- Ph∆∞∆°ng ph√°p IQR (1.5*IQR)
- Ph∆∞∆°ng ph√°p Z-score (threshold=3)
- **Quy·∫øt ƒë·ªãnh**: Gi·ªØ nguy√™n outliers (c√≥ √Ω nghƒ©a business)

### 3. ‚úÖ **X·ª≠ l√Ω Missing Values**
- Gi·ªØ 'Unknown' nh∆∞ m·ªôt category h·ª£p l·ªá
- D·ª±a tr√™n EDA: missing pattern l√† RANDOM

### 4. ‚úÖ **Feature Engineering**
- Lo·∫°i b·ªè multicollinear feature: `Avg_Open_To_Buy` (r=0.996)
- T·∫°o 6 features m·ªõi:
  * Avg_Transaction_Value
  * Credit_Utilization_Ratio
  * Transaction_Frequency
  * Activity_Score
  * Relationship_Age_Ratio
  * Age_When_Joined
- **K·∫øt qu·∫£**: 13 ‚Üí 19 numerical features

### 5. ‚úÖ **Chu·∫©n h√≥a (Normalization)**
- Min-Max Normalization: Cho moderate skew features
- Log Transformation + Standardization: Cho highly skewed features
- Robust Scaling: Cho features v·ªõi outliers

### 6. ‚úÖ **ƒêi·ªÅu chu·∫©n (Standardization)**
- Z-score standardization: (X - mean) / std
- ƒê·∫°t mean ‚âà 0, std ‚âà 1
- √Åp d·ª•ng cho Gaussian-like distributions

### 7. ‚úÖ **Encode Categorical Variables**
- One-Hot Encoding cho 5 categorical columns
- **K·∫øt qu·∫£**: 5 categorical ‚Üí 23 encoded features

### 8. ‚úÖ **T√≠nh to√°n th·ªëng k√™ v·ªõi error handling**
- Implemented: safe_mean, safe_std, safe_percentile, safe_correlation
- X·ª≠ l√Ω: division by zero, NaN values, numerical instability

### 9. ‚úÖ **Ki·ªÉm ƒë·ªãnh gi·∫£ thi·∫øt th·ªëng k√™**

**Test 1: Gender vs Churn**
- H0: p_female = p_male
- H1: p_female ‚â† p_male
- **K·∫øt qu·∫£**: REJECT H0 (p=0.000176 < 0.05)
- **K·∫øt lu·∫≠n**: Female c√≥ churn rate cao h∆°n 2.74% c√≥ √Ω nghƒ©a th·ªëng k√™

**Test 2: Card Category vs Churn**
- H0: Churn ƒë·ªôc l·∫≠p v·ªõi Card Category
- H1: Churn ph·ª• thu·ªôc Card Category
- **K·∫øt qu·∫£**: FAIL TO REJECT H0 (p=0.525 > 0.05)
- **K·∫øt lu·∫≠n**: Kh√¥ng c√≥ b·∫±ng ch·ª©ng th·ªëng k√™ (m·∫∑c d√π descriptive stats cho th·∫•y c√≥ pattern)

**Test 3: Transaction Count - Existing vs Churned**
- H0: mean_existing = mean_churned
- H1: mean_existing > mean_churned
- **K·∫øt qu·∫£**: REJECT H0 (p ‚âà 0 < 0.05)
- **K·∫øt lu·∫≠n**: Existing customers c√≥ transaction count cao h∆°n 52.8% (r·∫•t c√≥ √Ω nghƒ©a)

### 10. ‚úÖ **L∆∞u preprocessed data**
- X_preprocessed.npy: (10,127 √ó 42)
- y_target.npy: (10,127,)
- feature_names.txt: 42 feature names
- metadata.txt: Preprocessing info

---

## üìä FINAL DATASET SUMMARY:

| Metric | Value |
|--------|-------|
| **Samples** | 10,127 |
| **Total Features** | 42 |
| **Numerical Features** | 19 (13 original + 6 engineered) |
| **Categorical Features** | 23 (one-hot encoded from 5) |
| **Target Distribution** | 83.93% Existing / 16.07% Churned |
| **Missing Values** | 0 (handled as 'Unknown' category) |
| **Outliers** | Retained (business meaningful) |
| **Scaling** | Applied (method depends on distribution) |

---

## üéØ KEY INSIGHTS T·ª™ PREPROCESSING:

1. **Transaction behavior** l√† predictor m·∫°nh nh·∫•t (p ‚âà 0)
2. **Gender** c√≥ ·∫£nh h∆∞·ªüng nh·ªè nh∆∞ng c√≥ √Ω nghƒ©a th·ªëng k√™ (p=0.0002)
3. **Card Category** kh√¥ng c√≥ √Ω nghƒ©a th·ªëng k√™ trong chi-square test (sample size nh·ªè cho premium cards)
4. **Engineered features** (Activity_Score, Transaction_Frequency) s·∫Ω r·∫•t h·ªØu √≠ch cho modeling
5. **Data quality** t·ªët: kh√¥ng c√≥ invalid values, outliers c√≥ √Ω nghƒ©a business

---

## üöÄ S·∫¥N S√ÄNG CHO MODELING!

Dataset ƒë√£ ƒë∆∞·ª£c preprocessing ho√†n ch·ªânh v√† ready cho:
- Logistic Regression
- Decision Trees / Random Forest
- Gradient Boosting (XGBoost, LightGBM)
- Neural Networks
- Support Vector Machines

**Next step**: Chuy·ªÉn sang notebook `03_modeling.ipynb` ƒë·ªÉ build v√† evaluate models! üéØ