In [54]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

# Configuration for a clean workflow
RAW_DATA_PATH = '/Users/ramesh/Desktop/RestConnect/backend/data/processed/bank_churn_cleaned_eda.csv' # Adjust based on your structure
PROCESSED_DATA_DIR = '../data/processed/'
RANDOM_STATE = 42

# Ensure the output directory exists
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Load the raw dataset
df = pd.read_csv(RAW_DATA_PATH)
print(f"Dataset loaded successfully. Initial Shape: {df.shape}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ramesh/Desktop/RestConnect/backend/data/processed/bank_churn_cleaned_eda.csv'

In [None]:
def standardize_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans string-based columns by removing leading/trailing whitespace 
    and ensuring consistent Title Case.
    
    Args:
        df: Input DataFrame.
    Returns:
        DataFrame with standardized string columns.
    """
    # Identify object-type (string) columns
    str_cols = df.select_dtypes(include=['object']).columns
    
    for col in str_cols:
        # Step-by-step: Strip whitespace and capitalize first letter of each word
        df[col] = df[col].str.strip().str.title()
        
    print(f"Standardized columns: {list(str_cols)}")
    return df

df = standardize_categories(df)

Standardized columns: ['country']


In [None]:
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Handles null values using strategic imputation.
    - Numerical: Median imputation (robust to outliers).
    - Categorical: 'Unknown' label to capture the 'Missingness' as a feature.
    """
    # Numerical Imputation
    num_cols = df.select_dtypes(include=[np.number]).columns
    num_imputer = SimpleImputer(strategy='median')
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
    
    # Categorical Imputation
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        df[col] = df[col].fillna('Unknown')
        
    print("Missing values handled using Median/Unknown strategy.")
    return df

df = handle_missing_values(df)

Missing values handled using Median/Unknown strategy.


In [None]:
def detect_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """
    Uses Isolation Forest to detect and remove statistical anomalies.
    We assume a 1% contamination rate based on the initial audit.
    """
    # Select key financial features for anomaly detection
    features = ['age', 'balance', 'credit_score', 'estimated_salary']
    
    iso = IsolationForest(contamination=0.01, random_state=RANDOM_STATE)
    # -1 = outlier, 1 = inlier
    is_inlier = iso.fit_predict(df[features])
    
    # Filter for inliers only
    df_clean = df[is_inlier == 1].copy()
    
    print(f"Anomalies removed: {np.sum(is_inlier == -1)} rows dropped.")
    return df_clean

df = detect_anomalies(df)

Anomalies removed: 100 rows dropped.


In [None]:
def apply_domain_logic(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies business-rule constraints to the dataset.
    - Age must be between 18 and 100.
    - Tenure cannot be greater than Age - 15.
    """
    # Filter for legal banking age
    df = df[(df['age'] >= 18) & (df['age'] <= 100)]
    
    # Correct illogical tenure (if any)
    # logic: Work/Banking typically starts after 15 in this context
    illogical_tenure = df['tenure'] > (df['age'] - 15)
    df.loc[illogical_tenure, 'tenure'] = df['age'] - 15
    
    print("Domain logic and constraints applied successfully.")
    return df

df = apply_domain_logic(df)

Domain logic and constraints applied successfully.


In [None]:
def drop_noisy_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops identifiers and redundant columns to prevent model overfitting.
    """
    cols_to_drop = ['customer_id']
    
    # Only drop if the column exists to prevent script errors
    existing_drops = [c for c in cols_to_drop if c in df.columns]
    df = df.drop(columns=existing_drops)
    
    print(f"Dropped non-predictive features: {existing_drops}")
    return df

df = drop_noisy_features(df)

Dropped non-predictive features: []


In [55]:
# Final validation check
assert df.isnull().sum().sum() == 0, "ERROR: Null values detected in final dataset!"
assert df.duplicated().sum() == 0, "ERROR: Duplicate rows detected in final dataset!"

# Export to Processed layer
output_path = os.path.join(PROCESSED_DATA_DIR, 'cleaned_churn_data.csv')
df.to_csv(output_path, index=False)

print("-" * 30)
print(f"PIPELINE COMPLETE")
print(f"Final Row Count: {len(df)}")
print(f"Cleaned dataset saved to: {output_path}")
print("-" * 30)

------------------------------
PIPELINE COMPLETE
Final Row Count: 9900
Cleaned dataset saved to: ../data/processed/cleaned_churn_data.csv
------------------------------
