In [None]:
# Data gathering
import pandas as pd
import msoffcrypto
import io

def load_encrypted_excel(file_path: str, password: str) -> pd.DataFrame:
    with open(file_path, 'rb') as f:
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        decrypted = io.BytesIO()
        office_file.decrypt(decrypted)
        decrypted.seek(0)
        return pd.read_excel(decrypted)

# File configurations
files = [
    # Core Client & FNA Process Tables
    {"name": "client", "path": "client.xlsx", "password": "_XlN@a9)EVy1"},
    {"name": "emfc2fna", "path": "emfc2fna.xlsx", "password": "dQq9T%pC^?22"},
    {"name": "emfc2personalinformation", "path": "emfc2personalinformation.xlsx", "password": "ZqYmaFgC@Zv3"},
    {"name": "emfc2", "path": "emfc2.xlsx", "password": "79GYEd%l(2Bf"},
    {"name": "EMFC2Assets", "path": "EMFC2Assets.xlsx", "password": "!suNZ=%YA13k"},
    {"name": "emfc2portofolioinsurance", "path": "emfc2portofolioinsurance.xlsx", "password": "BcxM>wz*(hxF"},

    # Product & Solution Workflow
    {"name": "emfc2productsolution", "path": "emfc2productsolution.xlsx", "password": "@OFn7oA5!Joe"},
    {"name": "EMFC2ProductIntegrationApplication", "path": "EMFC2ProductIntegrationApplication.xlsx", "password": "(FZsw7#vz-bN"},
    {"name": "EMFC2ProductIntegrationLog", "path": "EMFC2ProductIntegrationLog.xlsx", "password": "?wcAx*P4n=9&"},

    # Product & Category Lookup Tables
    {"name": "ProductMainPlan", "path": "ProductMainPlan.xlsx", "password": ")XQ4ZDssowrA"},
    {"name": "ProductType", "path": "ProductType.xlsx", "password": "#9zCw?^-xTO?"},
    {"name": "ProductCategory", "path": "ProductCategory.xlsx", "password": "#F)cdAEOVJ@4"},
    {"name": "productsubcategory", "path": "productsubcategory.xlsx", "password": "y-^t$N9>%S%C"}
]

# Load all datasets into memory
datasets = {}

print("=== LOADING ALL DATASETS ===")
for file in files:
    print(f"Loading {file['name']}...", end=" ")
    try:
        datasets[file['name']] = load_encrypted_excel(file["path"], file["password"])
        shape = datasets[file['name']].shape
        print(f"✓ ({shape[0]:,} rows, {shape[1]} columns)")
    except Exception as e:
        print(f"✗ Error: {e}")

print(f"\nSuccessfully loaded {len(datasets)} datasets")
print("Available datasets:", list(datasets.keys()))


=== LOADING ALL DATASETS ===
Loading client... ✓ (45,688 rows, 49 columns)
Loading emfc2fna... ✓ (51,772 rows, 31 columns)
Loading emfc2personalinformation... ✓ (51,837 rows, 37 columns)
Loading emfc2... ✓ (51,769 rows, 8 columns)
Loading EMFC2Assets... ✓ (50,500 rows, 39 columns)
Loading emfc2portofolioinsurance... ✓ (27,437 rows, 25 columns)
Loading emfc2productsolution... ✓ (61,733 rows, 25 columns)
Loading emfc2productsolutiondetail... ✓ (154,286 rows, 5 columns)
Loading EMFC2ProductIntegrationApplication... ✓ (560 rows, 14 columns)
Loading EMFC2ProductIntegrationLog... ✓ (977 rows, 21 columns)
Loading ProductMainPlan... ✓ (1,532 rows, 22 columns)
Loading ProductType... ✓ (4 rows, 8 columns)
Loading ProductCategory... ✓ (10 rows, 6 columns)
Loading productsubcategory... ✓ (39 rows, 13 columns)

Successfully loaded 14 datasets
Available datasets: ['client', 'emfc2fna', 'emfc2personalinformation', 'emfc2', 'EMFC2Assets', 'emfc2portofolioinsurance', 'emfc2productsolution', 'emfc2produ

In [7]:
print("\\n=== COLUMN HEADERS FOR EACH DATASET ===\\n")
for name, df in datasets.items():
    print(f"📄 Dataset: {name}")
    print(f"🧾 Columns ({len(df.columns)}):")
    for col in df.columns:
        print(f"  - {col} ({df[col].dtype})")
    print("-" * 40)


\n=== COLUMN HEADERS FOR EACH DATASET ===\n
📄 Dataset: client
🧾 Columns (49):
  - # (int64)
  - ClientId (object)
  - ClientName (object)
  - ClientMobileNumber (object)
  - ClientMNVerified (bool)
  - ClientMNVeriCode (float64)
  - ClientMNVeriCodeTime (datetime64[ns])
  - ClientEmail (object)
  - ClientContactPreferences (object)
  - ClientGender (object)
  - ClientDOB (datetime64[ns])
  - ClientCPFContributionCategoryId (object)
  - IDNumber (object)
  - Nationality (object)
  - SpokenLanguage (object)
  - WrittenLanguage (object)
  - Education (object)
  - EmploymentStatus (object)
  - Occupation (object)
  - MaritalStatus (object)
  - PrimaryAddress (object)
  - CorrespondingAddress (object)
  - IncomeRange (object)
  - AccompaniedbyTrustedIndividual (float64)
  - ClientInvitedDate (datetime64[ns])
  - ClientStatus (object)
  - RiskProfile (object)
  - RiskProfileSubmissionDate (datetime64[ns])
  - CKAProfile (object)
  - CARProfile (object)
  - CKACARSubmissionDate (datetime64[ns

In [4]:
"""
=== ENHANCED PHASE 1: DATA UNIFICATION WITH ADVANCED FEATURE ENGINEERING ===
Enhanced version with temporal features, behavioral scoring, and improved portfolio analysis
"""

import pandas as pd
import numpy as np
from datetime import datetime
import msoffcrypto
import io
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

print("=== ENHANCED UNIFIED DATASET CREATION ===")

def load_encrypted_excel(file_path: str, password: str) -> pd.DataFrame:
    """Load password-protected Excel files"""
    with open(file_path, 'rb') as f:
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        decrypted = io.BytesIO()
        office_file.decrypt(decrypted)
        decrypted.seek(0)
        return pd.read_excel(decrypted)

# Load all datasets
files = [
    {"name": "client", "path": "client.xlsx", "password": "_XlN@a9)EVy1"},
    {"name": "emfc2personalinformation", "path": "emfc2personalinformation.xlsx", "password": "ZqYmaFgC@Zv3"},
    {"name": "emfc2", "path": "emfc2.xlsx", "password": "79GYEd%l(2Bf"},
    {"name": "EMFC2Assets", "path": "EMFC2Assets.xlsx", "password": "!suNZ=%YA13k"},
    {"name": "emfc2portofolioinsurance", "path": "emfc2portofolioinsurance.xlsx", "password": "BcxM>wz*(hxF"},
    {"name": "emfc2fna", "path": "emfc2fna.xlsx", "password": "dQq9T%pC^?22"},  # Added FNA table
    {"name": "emfc2productsolution", "path": "emfc2productsolution.xlsx", "password": "@OFn7oA5!Joe"}  # Added for temporal features
]

datasets = {}
print("=== LOADING DATASETS ===")
for file in files:
    print(f"Loading {file['name']}...", end=" ")
    try:
        datasets[file['name']] = load_encrypted_excel(file["path"], file["password"])
        shape = datasets[file['name']].shape
        print(f"✓ ({shape[0]:,} rows, {shape[1]} columns)")
    except Exception as e:
        print(f"✗ Error: {e}")

def create_enhanced_unified_dataset(datasets):
    """Create unified dataset with enhanced feature engineering"""
    
    print("\n🚀 Starting enhanced dataset creation...")
    start_time = datetime.now()
    
    # Get base datasets
    client_df = datasets['client'].copy()
    personal_df = datasets['emfc2personalinformation'].copy()
    emfc2_df = datasets['emfc2'].copy()
    assets_df = datasets['EMFC2Assets'].copy()
    portfolio_df = datasets['emfc2portofolioinsurance'].copy()
    
    # Start with client as base
    unified_df = client_df.copy()
    
    # Add unique columns from personal info
    personal_unique_cols = ['PersonalInformationId', 'EMFC2FNAId', 'ClientAge', 'ClientRetrieval']
    personal_merge_cols = ['ClientId'] + personal_unique_cols
    available_personal_cols = [col for col in personal_merge_cols if col in personal_df.columns]
    
    unified_df = unified_df.merge(
        personal_df[available_personal_cols],
        on='ClientId',
        how='left',
        suffixes=('', '_personal')
    )
    
    # Smart replacement for overlapping columns
    overlap_cols = [
        'ClientGender', 'IncomeRange', 'MaritalStatus', 'Education', 
        'EmploymentStatus', 'RiskProfile', 'CKAProfile', 'CARProfile',
        'Nationality', 'SpokenLanguage', 'WrittenLanguage'
    ]
    
    for col in overlap_cols:
        if col in personal_df.columns and col in unified_df.columns:
            personal_updates = personal_df[['ClientId', col]].dropna()
            
            if len(personal_updates) > 0:
                for _, row in personal_updates.iterrows():
                    client_id = row['ClientId']
                    new_value = row[col]
                    
                    mask = unified_df['ClientId'] == client_id
                    if mask.any():
                        null_mask = mask & unified_df[col].isnull()
                        if null_mask.any():
                            unified_df.loc[null_mask, col] = new_value
    
    # Add EMFC2Id bridge with temporal features
    emfc2_bridge = emfc2_df.sort_values(['ClientId', 'EMFCSubmitDate'], na_position='last')
    emfc2_bridge = emfc2_bridge.groupby('ClientId').agg({
        'EMFC2Id': 'last',
        'EMFCStartDate': ['last', 'count'],  # Added count for frequency
        'EMFCSubmitDate': ['last', 'first'],  # Added first for duration
        'EMFCStatus': 'last'
    }).reset_index()
    
    # Flatten column names
    emfc2_bridge.columns = ['ClientId', 'EMFC2Id', 'EMFCStartDate', 'EMFC_Count', 
                           'EMFCSubmitDate', 'First_EMFC_Date', 'EMFCStatus']
    
    unified_df = unified_df.merge(emfc2_bridge, on='ClientId', how='left')
    
    # Add financial assets
    asset_cols = [
        'EMFC2Id', 'SavingsAccounts', 'FixedDepositsAccount', 'HomeAsset', 
        'MotorAsset', 'InsuranceCashValues', 'StocksPortofolio', 'BondPortofolio',
        'UTFEquityAsset', 'ETFs', 'InvestmentProperties', 'CPFOABalance', 
        'CPFSABalance', 'CPFMABalance', 'SRSEquityAsset'
    ]
    
    available_asset_cols = [col for col in asset_cols if col in assets_df.columns]
    unified_df = unified_df.merge(assets_df[available_asset_cols], on='EMFC2Id', how='left')
    
    # Enhanced portfolio analysis
    portfolio_analysis = portfolio_df.groupby('EMFC2Id').agg({
        'PolicyNumber': 'count',
        'PlanType': lambda x: list(x.dropna().unique()),
        'SumAssuredforLossofLife': 'sum',
        'SumAssuredforTPD': 'sum',
        'SumAssuredforCI': 'sum',
        'SumAssuredforHospitalIncome': 'sum',
        'SumAssuredforLongTermCare': 'sum',
        'AnnualCashPremium': 'sum',
        'InsurerCompanyName': lambda x: list(x.dropna().unique()),
        'CoverageStartAge': ['min', 'mean'],  # Added for temporal analysis
        'PaymentEndAge': 'max'  # Added for premium duration
    }).reset_index()
    
    # Flatten and rename columns
    portfolio_analysis.columns = [
        'EMFC2Id', 'Total_Policies', 'Plan_Types', 'Total_Life_Coverage',
        'Total_TPD_Coverage', 'Total_CI_Coverage', 'Total_Hospital_Income',
        'Total_LTC_Coverage', 'Total_Annual_Premium', 'Insurance_Companies',
        'First_Policy_Age', 'Avg_Policy_Start_Age', 'Max_Payment_End_Age'
    ]
    
    unified_df = unified_df.merge(portfolio_analysis, on='EMFC2Id', how='left')
    
    # Create all enhanced features
    unified_df = create_temporal_features(unified_df)
    unified_df = create_behavioral_features(unified_df)
    unified_df = create_business_driven_features(unified_df)
    unified_df = create_interaction_features(unified_df)
    unified_df = create_portfolio_sequence_features(unified_df, portfolio_df)
    
    # Original derived features (kept for compatibility)
    unified_df = create_original_derived_features(unified_df)
    
    end_time = datetime.now()
    processing_time = (end_time - start_time).total_seconds()
    
    print(f"\n✅ ENHANCED UNIFICATION COMPLETE!")
    print(f"   Final records: {len(unified_df):,}")
    print(f"   Final columns: {len(unified_df.columns)}")
    print(f"   Processing time: {processing_time:.1f} seconds")
    
    return unified_df

def create_temporal_features(df):
    """Create time-based features"""
    print("\n⏰ Creating temporal features...")
    
    # Current date for calculations
    current_date = pd.Timestamp.now()
    
    # Client relationship duration
    if 'ClientInvitedDate' in df.columns:
        df['Client_Tenure_Days'] = (current_date - pd.to_datetime(df['ClientInvitedDate'])).dt.days
        df['Client_Tenure_Years'] = df['Client_Tenure_Days'] / 365.25
    
    # Time since last FNA
    if 'EMFCSubmitDate' in df.columns:
        df['Days_Since_Last_FNA'] = (current_date - pd.to_datetime(df['EMFCSubmitDate'])).dt.days
        df['Months_Since_Last_FNA'] = df['Days_Since_Last_FNA'] / 30.44
    
    # Client engagement frequency
    if 'EMFC_Count' in df.columns and 'Client_Tenure_Days' in df.columns:
        df['FNA_Frequency'] = df['EMFC_Count'] / (df['Client_Tenure_Days'] / 365.25 + 1)
    
    # Insurance holding duration
    if 'First_Policy_Age' in df.columns and 'ClientAge' in df.columns:
        df['Years_With_Insurance'] = df['ClientAge'] - df['First_Policy_Age']
        df['Years_With_Insurance'] = df['Years_With_Insurance'].clip(lower=0)
    
    print("   ✅ Created temporal features")
    return df

def create_behavioral_features(df):
    """Create behavioral and engagement features"""
    print("\n🎯 Creating behavioral features...")
    
    # Client engagement score
    df['Engagement_Score'] = 0
    if 'ClientEmailVerified' in df.columns:
        df['Engagement_Score'] += df['ClientEmailVerified'].astype(float) * 0.25
    if 'ClientMNVerified' in df.columns:
        df['Engagement_Score'] += df['ClientMNVerified'].astype(float) * 0.25
    if 'RiskProfile' in df.columns:
        df['Engagement_Score'] += df['RiskProfile'].notna().astype(float) * 0.25
    if 'EMFC_Count' in df.columns:
        df['Engagement_Score'] += (df['EMFC_Count'] > 1).astype(float) * 0.25
    
    # Financial maturity index
    if 'Total_Investments' in df.columns and 'Total_Liquid_Assets' in df.columns:
        df['Financial_Maturity_Index'] = np.where(
            df['Total_Liquid_Assets'] > 0,
            df['Total_Investments'] / (df['Total_Liquid_Assets'] + df['Total_Investments'] + 1),
            0
        )
    
    # Digital adoption score
    contact_prefs = df.get('ClientContactPreferences', pd.Series(['[]'] * len(df)))
    df['Digital_Adoption_Score'] = contact_prefs.apply(
        lambda x: 1 if 'Email' in str(x) or 'Text' in str(x) else 0
    )
    
    # Protection gap score
    df['Protection_Gap_Score'] = 0
    if 'Life_Coverage_Gap' in df.columns:
        df['Protection_Gap_Score'] += df['Life_Coverage_Gap'] * 0.4
    if 'CI_Coverage_Gap' in df.columns:
        df['Protection_Gap_Score'] += df['CI_Coverage_Gap'] * 0.3
    if 'Has_Hospital_Coverage' in df.columns:
        df['Protection_Gap_Score'] += (df['Has_Hospital_Coverage'] == 0).astype(float) * 0.3
    
    print("   ✅ Created behavioral features")
    return df

def create_business_driven_features(df):
    """Create features based on business logic and insurance needs"""
    print("\n💼 Creating business-driven features...")
    
    # Underinsured indicator
    if 'Life_Coverage_Multiple' in df.columns and 'Income_Numeric' in df.columns:
        df['Underinsured'] = (
            (df['Life_Coverage_Multiple'] < 8) & 
            (df['Income_Numeric'] > 50000)
        ).astype(int)
    
    # Retirement planning need
    if 'ClientAge' in df.columns and 'Total_CPF' in df.columns and 'Income_Numeric' in df.columns:
        df['Retirement_Planning_Need'] = (
            (df['ClientAge'] > 40) & 
            (df['Total_CPF'] < df['Income_Numeric'] * 3)
        ).astype(int)
    
    # Wealth accumulation opportunity
    if 'Age_Group' in df.columns and 'Investment_Ratio' in df.columns:
        df['Wealth_Accumulation_Opp'] = (
            (df['ClientAge'].between(35, 55)) &
            (df['Investment_Ratio'] < 0.3) &
            (df['Income_Numeric'] > 75000)
        ).astype(int)
    
    # Critical illness vulnerability
    if 'ClientAge' in df.columns and 'Has_CI_Coverage' in df.columns:
        df['CI_Vulnerability'] = (
            (df['ClientAge'] > 35) &
            (df['Has_CI_Coverage'] == 0)
        ).astype(int)
    
    # Family protection priority
    if 'MaritalStatus' in df.columns and 'ClientAge' in df.columns:
        df['Family_Protection_Priority'] = (
            (df['MaritalStatus'].isin(['Married', 'Divorced'])) &
            (df['ClientAge'].between(25, 50)) &
            (df.get('Life_Coverage_Multiple', 0) < 10)
        ).astype(int)
    
    # Premium affordability ratio
    if 'Total_Annual_Premium' in df.columns and 'Income_Numeric' in df.columns:
        df['Premium_Affordability_Buffer'] = np.where(
            df['Income_Numeric'] > 0,
            1 - (df['Total_Annual_Premium'] / (df['Income_Numeric'] * 0.15)),  # 15% income rule
            1
        ).clip(0, 1)
    
    print("   ✅ Created business-driven features")
    return df

def create_interaction_features(df):
    """Create interaction features between key variables"""
    print("\n🔗 Creating interaction features...")
    
    # Age × Income interaction
    if 'ClientAge' in df.columns and 'Income_Numeric' in df.columns:
        df['Age_Income_Interaction'] = df['ClientAge'] * np.log1p(df['Income_Numeric'])
    
    # Insurance × Tenure interaction
    if 'Has_Insurance' in df.columns and 'Years_With_Insurance' in df.columns:
        df['Insurance_Tenure_Interaction'] = df['Has_Insurance'] * df['Years_With_Insurance']
    
    # Sophistication × Gap interaction
    if 'Financial_Sophistication' in df.columns and 'Protection_Gap_Score' in df.columns:
        df['Sophistication_Gap_Interaction'] = (
            df['Financial_Sophistication'].map({'High': 2, 'Medium': 1, 'Low': 0}).fillna(0) * 
            df['Protection_Gap_Score']
        )
    
    # Wealth × Age interaction
    if 'Estimated_Net_Worth' in df.columns and 'Age_Group' in df.columns:
        age_numeric = df['ClientAge'].fillna(40)
        df['Wealth_Age_Ratio'] = np.log1p(df['Estimated_Net_Worth']) / (age_numeric / 10)
    
    print("   ✅ Created interaction features")
    return df

def create_portfolio_sequence_features(df, portfolio_df):
    """Create features based on product sequence patterns"""
    print("\n📊 Creating portfolio sequence features...")
    
    # Product affinity mapping
    product_sequences = {
        'Term': ['Whole Life', 'Investment-Linked', 'Universal Life'],
        'Integrated Shield': ['Critical Illness', 'Hospital Income', 'Disability Income'],
        'Whole Life': ['Investment-Linked', 'Endowment', 'Annuity'],
        'Endowment': ['Investment-Linked', 'Annuity'],
        'Critical Illness': ['Early Critical Illness', 'Long Term Care']
    }
    
    # Create affinity scores
    if 'Plan_Types' in df.columns:
        for current_product, next_products in product_sequences.items():
            for next_product in next_products:
                feature_name = f'Affinity_{current_product.replace(" ", "_")}_to_{next_product.replace(" ", "_")}'
                df[feature_name] = df['Plan_Types'].apply(
                    lambda x: 1 if isinstance(x, list) and any(current_product in str(p) for p in x) else 0
                )
    
    # Product diversity score
    if 'Plan_Types' in df.columns:
        df['Product_Diversity_Score'] = df['Plan_Types'].apply(
            lambda x: len(set(x)) if isinstance(x, list) else 0
        )
    
    # Insurance evolution stage
    def get_insurance_evolution_stage(plan_types):
        if not isinstance(plan_types, list) or len(plan_types) == 0:
            return 0  # No insurance
        
        plan_str = ' '.join(str(p) for p in plan_types)
        
        if 'Investment-Linked' in plan_str or 'Annuity' in plan_str:
            return 4  # Advanced
        elif 'Whole Life' in plan_str or 'Endowment' in plan_str:
            return 3  # Intermediate
        elif 'Critical Illness' in plan_str or 'Disability' in plan_str:
            return 2  # Developing
        else:
            return 1  # Basic
    
    if 'Plan_Types' in df.columns:
        df['Insurance_Evolution_Stage'] = df['Plan_Types'].apply(get_insurance_evolution_stage)
    
    print("   ✅ Created portfolio sequence features")
    return df

def create_original_derived_features(df):
    """Keep original derived features for compatibility"""
    print("\n🔧 Creating original derived features...")
    
    # Income features
    if 'IncomeRange' in df.columns:
        income_mapping = {
            'No Income': 0,
            'Below S$30,000': 15000,
            'S$30,000 - S$49,999': 40000,
            'S$50,000 - S$99,999': 75000,
            'S$100,000 and above': 150000
        }
        df['Income_Numeric'] = df['IncomeRange'].map(income_mapping).fillna(0)
        
        income_category_mapping = {
            'No Income': 'Low', 'Below S$30,000': 'Low',
            'S$30,000 - S$49,999': 'Medium', 'S$50,000 - S$99,999': 'Medium',
            'S$100,000 and above': 'High'
        }
        df['Income_Category'] = df['IncomeRange'].map(income_category_mapping).fillna('Unknown')
    
    # Asset aggregations
    liquid_assets = ['SavingsAccounts', 'FixedDepositsAccount']
    investment_assets = ['StocksPortofolio', 'BondPortofolio', 'UTFEquityAsset', 'ETFs']
    cpf_assets = ['CPFOABalance', 'CPFSABalance', 'CPFMABalance']
    
    df['Total_Liquid_Assets'] = df[liquid_assets].fillna(0).sum(axis=1)
    df['Total_Investments'] = df[investment_assets].fillna(0).sum(axis=1)
    df['Total_CPF'] = df[cpf_assets].fillna(0).sum(axis=1)
    
    # Net worth calculation
    wealth_components = ['Total_Liquid_Assets', 'Total_Investments', 'Total_CPF', 'InvestmentProperties']
    df['Estimated_Net_Worth'] = df[[c for c in wealth_components if c in df.columns]].fillna(0).sum(axis=1)
    
    # Investment ratio
    total_financial = df['Total_Liquid_Assets'] + df['Total_Investments']
    df['Investment_Ratio'] = np.where(
        total_financial > 0,
        df['Total_Investments'] / total_financial,
        0
    )
    
    # Insurance features
    df['Has_Insurance'] = (df['Total_Policies'].fillna(0) > 0).astype(int)
    
    if 'Income_Numeric' in df.columns:
        df['Life_Coverage_Multiple'] = np.where(
            df['Income_Numeric'] > 0,
            df['Total_Life_Coverage'].fillna(0) / df['Income_Numeric'],
            0
        )
        
        df['Premium_to_Income_Ratio'] = np.where(
            df['Income_Numeric'] > 0,
            df['Total_Annual_Premium'].fillna(0) / df['Income_Numeric'],
            0
        )
    
    # Coverage indicators
    df['Has_Life_Coverage'] = (df['Total_Life_Coverage'].fillna(0) > 0).astype(int)
    df['Has_CI_Coverage'] = (df['Total_CI_Coverage'].fillna(0) > 0).astype(int)
    df['Has_Hospital_Coverage'] = (df['Total_Hospital_Income'].fillna(0) > 0).astype(int)
    df['Has_LTC_Coverage'] = (df['Total_LTC_Coverage'].fillna(0) > 0).astype(int)
    
    # Coverage gaps
    df['Life_Coverage_Gap'] = ((df['Has_Insurance'] == 1) & (df['Has_Life_Coverage'] == 0)).astype(int)
    df['CI_Coverage_Gap'] = ((df['Has_Insurance'] == 1) & (df['Has_CI_Coverage'] == 0)).astype(int)
    
    # Insurance sophistication
    def calculate_insurance_sophistication(row):
        policies = row.get('Total_Policies', 0)
        if policies == 0:
            return 'No_Insurance'
        
        coverage_types = (row.get('Has_Life_Coverage', 0) + row.get('Has_CI_Coverage', 0) + 
                         row.get('Has_Hospital_Coverage', 0) + row.get('Has_LTC_Coverage', 0))
        
        if coverage_types >= 3:
            return 'Comprehensive'
        elif coverage_types >= 2:
            return 'Moderate'
        else:
            return 'Basic'
    
    df['Insurance_Sophistication'] = df.apply(calculate_insurance_sophistication, axis=1)
    
    # Age groups
    if 'ClientAge' in df.columns:
        df['Age_Group'] = pd.cut(
            df['ClientAge'], 
            bins=[0, 25, 35, 45, 55, 65, 100], 
            labels=['Under_25', '25-35', '35-45', '45-55', '55-65', 'Over_65'],
            include_lowest=True
        )
    
    # Life stage
    if 'Age_Group' in df.columns and 'MaritalStatus' in df.columns:
        def determine_life_stage(row):
            age_group = row['Age_Group']
            marital_status = row['MaritalStatus']
            
            if pd.isna(age_group) or pd.isna(marital_status):
                return 'Unknown'
            
            age_str = str(age_group)
            marital_str = str(marital_status).lower()
            
            if age_str in ['Under_25', '25-35']:
                return 'Young_Single' if 'single' in marital_str else 'Young_Family'
            elif age_str in ['35-45', '45-55']:
                return 'Mid_Career_Single' if 'single' in marital_str else 'Mid_Career_Family'
            else:
                return 'Pre_Retirement'
        
        df['Life_Stage'] = df.apply(determine_life_stage, axis=1)
    
    # Financial sophistication
    if 'Education' in df.columns and 'Investment_Ratio' in df.columns:
        def calculate_financial_sophistication(row):
            score = 0
            education = str(row.get('Education', '')).lower()
            if 'university' in education or 'degree' in education:
                score += 2
            elif 'diploma' in education:
                score += 1
            
            inv_ratio = row.get('Investment_Ratio', 0)
            if inv_ratio > 0.3:
                score += 2
            elif inv_ratio > 0.1:
                score += 1
            
            return 'High' if score >= 3 else 'Medium' if score >= 1 else 'Low'
        
        df['Financial_Sophistication'] = df.apply(calculate_financial_sophistication, axis=1)
    
    print("   ✅ Created original derived features")
    return df

# Execute the enhanced unification
print("\n🚀 STARTING ENHANCED DATA UNIFICATION...")
enhanced_unified_dataset = create_enhanced_unified_dataset(datasets)

# Save the enhanced dataset
enhanced_unified_dataset.to_excel('enhanced_unified_client_dataset.xlsx', index=False)
print(f"\n💾 Saved: enhanced_unified_client_dataset.xlsx")

# Display enhancement summary
print("\n📊 ENHANCEMENT SUMMARY:")
new_features = [
    'Client_Tenure_Years', 'Days_Since_Last_FNA', 'FNA_Frequency',
    'Years_With_Insurance', 'Engagement_Score', 'Financial_Maturity_Index',
    'Digital_Adoption_Score', 'Protection_Gap_Score', 'Underinsured',
    'Retirement_Planning_Need', 'Wealth_Accumulation_Opp', 'CI_Vulnerability',
    'Family_Protection_Priority', 'Premium_Affordability_Buffer',
    'Age_Income_Interaction', 'Insurance_Evolution_Stage', 'Product_Diversity_Score'
]

print(f"✨ New Enhanced Features Added: {len(new_features)}")
for i, feat in enumerate(new_features, 1):
    if feat in enhanced_unified_dataset.columns:
        non_null = enhanced_unified_dataset[feat].notna().sum()
        coverage = (non_null / len(enhanced_unified_dataset)) * 100
        print(f"   {i:2d}. {feat}: {coverage:.1f}% coverage")

print(f"\n🎉 ENHANCED PHASE 1 COMPLETE!")
print(f"✅ Total features: {len(enhanced_unified_dataset.columns)}")
print(f"✅ Enhanced features provide better signals for ML modeling")
print(f"✅ Ready for Phase 2: Target Variable Engineering")

=== ENHANCED UNIFIED DATASET CREATION ===
=== LOADING DATASETS ===
Loading client... ✓ (45,688 rows, 49 columns)
Loading emfc2personalinformation... ✓ (52,305 rows, 37 columns)
Loading emfc2... ✓ (51,769 rows, 8 columns)
Loading EMFC2Assets... ✓ (50,500 rows, 39 columns)
Loading emfc2portofolioinsurance... ✓ (27,437 rows, 25 columns)
Loading emfc2fna... ✓ (51,772 rows, 31 columns)
Loading emfc2productsolution... ✓ (43,501 rows, 25 columns)

🚀 STARTING ENHANCED DATA UNIFICATION...

🚀 Starting enhanced dataset creation...

⏰ Creating temporal features...
   ✅ Created temporal features

🎯 Creating behavioral features...
   ✅ Created behavioral features

💼 Creating business-driven features...
   ✅ Created business-driven features

🔗 Creating interaction features...
   ✅ Created interaction features

📊 Creating portfolio sequence features...
   ✅ Created portfolio sequence features

🔧 Creating original derived features...
   ✅ Created original derived features

✅ ENHANCED UNIFICATION COMPLE

In [6]:
"""
=== ENHANCED PHASE 2: TARGET VARIABLE ENGINEERING WITH VALIDATION ===
Enhanced version with better data quality checks and business logic validation
"""

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

print("=== ENHANCED PHASE 2: TARGET VARIABLE ENGINEERING ===")
print("Goal: Create high-quality ML training data with validated targets")

# Step 1: Load datasets
print("\n=== STEP 1: LOAD ENHANCED DATASETS ===")

def load_encrypted_excel(file_path: str, password: str) -> pd.DataFrame:
    import msoffcrypto
    import io
    with open(file_path, 'rb') as f:
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        decrypted = io.BytesIO()
        office_file.decrypt(decrypted)
        decrypted.seek(0)
        return pd.read_excel(decrypted)

# Load enhanced client dataset
clients_df = pd.read_excel('enhanced_unified_client_dataset.xlsx')
print(f"✅ Loaded enhanced client dataset: {len(clients_df):,} unique clients")
print(f"   Total features: {len(clients_df.columns)}")

# Load product tables
product_tables = {
    'emfc2productsolution': '@OFn7oA5!Joe',
    'ProductMainPlan': ')XQ4ZDssowrA', 
    'ProductType': '#9zCw?^-xTO?',
    'ProductCategory': '#F)cdAEOVJ@4',
    'productsubcategory': 'y-^t$N9>%S%C',
    'emfc2fna': 'dQq9T%pC^?22'  # For additional validation
}

product_datasets = {}
for table_name, password in product_tables.items():
    try:
        product_datasets[table_name] = load_encrypted_excel(f"{table_name}.xlsx", password)
        print(f"✅ Loaded {table_name}: {len(product_datasets[table_name]):,} records")
    except Exception as e:
        print(f"❌ Failed to load {table_name}: {e}")

# Step 2: Enhanced Product Hierarchy with Validation
print("\n=== STEP 2: BUILD VALIDATED PRODUCT HIERARCHY ===")

# Build and validate product hierarchy
main_plans = product_datasets['ProductMainPlan']
subcategories = product_datasets['productsubcategory']
categories = product_datasets['ProductCategory']
product_types = product_datasets['ProductType']

# Validate data quality
print("\n🔍 Validating product data quality...")
print(f"   Main plans with null SubCategoryId: {main_plans['ProductSubCategoryId'].isnull().sum()}")
print(f"   Subcategories with null CategoryId: {subcategories['ProductCategoryId'].isnull().sum()}")
print(f"   Categories with null TypeId: {categories['ProductTypeId'].isnull().sum()}")

# Build complete hierarchy with validation
complete_product_hierarchy = (
    main_plans
    .merge(subcategories[['ProductSubCategoryId', 'SubCategoryName', 'ProductCategoryId']], 
           on='ProductSubCategoryId', how='left')
    .merge(categories[['ProductCategoryId', 'CategoryName', 'ProductTypeId']], 
           on='ProductCategoryId', how='left')
    .merge(product_types[['ProductTypeId', 'TypeName', 'InvestmentType']], 
           on='ProductTypeId', how='left')
)

# Check hierarchy completeness
hierarchy_complete = complete_product_hierarchy[['SubCategoryName', 'CategoryName', 'TypeName']].notna().all(axis=1)
print(f"✅ Products with complete hierarchy: {hierarchy_complete.sum():,} / {len(complete_product_hierarchy):,}")

# Step 3: Analyze and Filter Recommendations
print("\n=== STEP 3: ANALYZE RECOMMENDATIONS WITH BUSINESS LOGIC ===")

raw_recommendations = product_datasets['emfc2productsolution']
fna_data = product_datasets['emfc2fna']

# Apply business filters
print("\n🔧 Applying business logic filters...")
recommendations = raw_recommendations[
    (raw_recommendations['Status'] == 'Approved') &
    (raw_recommendations['RecommendedProductRejected'] != True)
].copy()

print(f"📊 RECOMMENDATION FILTERING:")
print(f"   Total recommendations: {len(raw_recommendations):,}")
print(f"   Approved: {len(recommendations):,}")
print(f"   Rejection rate: {(1 - len(recommendations)/len(raw_recommendations))*100:.1f}%")

# Add temporal context from FNA
if 'EMFC2FNAId' in recommendations.columns and 'EMFC2FNAId' in fna_data.columns:
    recommendations = recommendations.merge(
        fna_data[['EMFC2FNAId', 'SubmissionTime', 'FNAStatus']], 
        on='EMFC2FNAId', 
        how='left'
    )
    print(f"   Recommendations with FNA context: {recommendations['SubmissionTime'].notna().sum():,}")

# Join with product hierarchy
recommendations_with_products = recommendations.merge(
    complete_product_hierarchy[['ProductId', 'SubCategoryName', 'CategoryName', 'TypeName', 'InvestmentType']], 
    on='ProductId', 
    how='left'
)

# Analyze subcategory distribution
subcat_distribution = recommendations_with_products['SubCategoryName'].value_counts()
print(f"\n🎯 SUBCATEGORY DISTRIBUTION (TOP 20):")
for i, (subcat, count) in enumerate(subcat_distribution.head(20).items(), 1):
    pct = (count / len(recommendations_with_products)) * 100
    print(f"   {i:2d}. {subcat}: {count:,} ({pct:.1f}%)")

# Step 4: Enhanced Client-Recommendation Join
print("\n=== STEP 4: ENHANCED CLIENT-RECOMMENDATION JOIN ===")

# Try multiple join strategies
join_results = {}

# Strategy 1: PersonalInformationId
if 'PersonalInformationId' in clients_df.columns:
    join1 = clients_df.merge(recommendations_with_products, on='PersonalInformationId', how='inner')
    join_results['PersonalInformationId'] = len(join1)

# Strategy 2: EMFC2FNAId
if 'EMFC2FNAId' in clients_df.columns:
    join2 = clients_df.merge(recommendations_with_products, on='EMFC2FNAId', how='inner')
    join_results['EMFC2FNAId'] = len(join2)

# Strategy 3: EMFC2Id (if available)
if 'EMFC2Id' in clients_df.columns and 'EMFC2Id' in recommendations_with_products.columns:
    join3 = clients_df.merge(recommendations_with_products, on='EMFC2Id', how='inner')
    join_results['EMFC2Id'] = len(join3)

print("\n🔗 JOIN RESULTS:")
best_join_key = max(join_results, key=join_results.get)
for key, count in join_results.items():
    print(f"   {key}: {count:,} records {'✅ BEST' if key == best_join_key else ''}")

# Use best join method
ml_training_data = clients_df.merge(
    recommendations_with_products, 
    on=best_join_key, 
    how='inner'
)

print(f"\n✅ Final training data: {len(ml_training_data):,} records")
print(f"   Unique clients: {ml_training_data['ClientId'].nunique():,}")
print(f"   Coverage: {ml_training_data['ClientId'].nunique() / len(clients_df) * 100:.1f}% of all clients")

# Step 5: Data Quality Validation
print("\n=== STEP 5: DATA QUALITY VALIDATION ===")

# Check for data leakage
print("\n🔍 Checking for data leakage...")
if 'DateCreated' in ml_training_data.columns and 'SubmissionTime' in ml_training_data.columns:
    ml_training_data['DateCreated'] = pd.to_datetime(ml_training_data['DateCreated'])
    ml_training_data['SubmissionTime'] = pd.to_datetime(ml_training_data['SubmissionTime'])
    
    # Ensure recommendation comes after client data
    valid_temporal = ml_training_data['DateCreated'] <= ml_training_data['SubmissionTime']
    print(f"   Temporally valid records: {valid_temporal.sum():,} / {len(ml_training_data):,}")
    
    if not valid_temporal.all():
        print(f"   ⚠️  Removing {(~valid_temporal).sum()} records with temporal issues")
        ml_training_data = ml_training_data[valid_temporal]

# Remove invalid subcategories
ml_training_data = ml_training_data[ml_training_data['SubCategoryName'].notna()]
print(f"   Records with valid subcategories: {len(ml_training_data):,}")

# Step 6: Prepare Target Variable with Business Logic
print("\n=== STEP 6: PREPARE TARGET VARIABLE ===")

def clean_subcategory_name(subcategory):
    """Clean and standardize subcategory names"""
    if pd.isna(subcategory):
        return 'Unknown'
    
    subcategory = str(subcategory).strip()
    subcategory = (subcategory.replace(' ', '_')
                             .replace('-', '_')
                             .replace('(', '')
                             .replace(')', '')
                             .replace('/', '_')
                             .replace('&', 'and')
                             .replace(',', ''))
    return subcategory

ml_training_data['Target_SubCategory'] = ml_training_data['SubCategoryName'].apply(clean_subcategory_name)

# Analyze class distribution
target_dist = ml_training_data['Target_SubCategory'].value_counts()
print(f"\n📊 TARGET DISTRIBUTION ANALYSIS:")
print(f"   Total subcategories: {target_dist.nunique()}")
print(f"   Imbalance ratio: {target_dist.max() / target_dist.min():.0f}:1")

# Step 7: Feature Quality Assessment
print("\n=== STEP 7: ENHANCED FEATURE QUALITY ASSESSMENT ===")

# Define comprehensive feature sets
feature_categories = {
    'Demographics': ['ClientAge', 'ClientGender', 'Nationality', 'MaritalStatus'],
    'Socioeconomic': ['IncomeRange', 'Income_Numeric', 'Education', 'EmploymentStatus'],
    'Financial': ['Total_Liquid_Assets', 'Total_Investments', 'Estimated_Net_Worth', 'Investment_Ratio'],
    'Insurance': ['Has_Insurance', 'Life_Coverage_Multiple', 'Insurance_Sophistication', 'Insurance_Evolution_Stage'],
    'Behavioral': ['Engagement_Score', 'Financial_Maturity_Index', 'Digital_Adoption_Score'],
    'Temporal': ['Client_Tenure_Years', 'Days_Since_Last_FNA', 'Years_With_Insurance'],
    'Business': ['Underinsured', 'Retirement_Planning_Need', 'Protection_Gap_Score']
}

print("\n📊 FEATURE QUALITY BY CATEGORY:")
feature_quality_summary = {}

for category, features in feature_categories.items():
    available = [f for f in features if f in ml_training_data.columns]
    missing = [f for f in features if f not in ml_training_data.columns]
    
    if available:
        avg_completeness = (ml_training_data[available].notna().sum() / len(ml_training_data)).mean()
        feature_quality_summary[category] = {
            'available': len(available),
            'missing': len(missing),
            'avg_completeness': avg_completeness
        }
        
        print(f"\n   {category}:")
        print(f"      Available: {len(available)}/{len(features)} features")
        print(f"      Avg completeness: {avg_completeness*100:.1f}%")
        if missing:
            print(f"      Missing: {missing}")

# Step 8: Handle Class Imbalance
print("\n=== STEP 8: HANDLE CLASS IMBALANCE ===")

# Set thresholds
min_samples_threshold = 50
max_samples_per_class = 5000  # Increased for better representation

# Filter small classes
small_classes = target_dist[target_dist < min_samples_threshold]
if len(small_classes) > 0:
    print(f"\n📊 Removing {len(small_classes)} small subcategories (<{min_samples_threshold} samples)")
    ml_training_clean = ml_training_data[
        ~ml_training_data['Target_SubCategory'].isin(small_classes.index)
    ].copy()
else:
    ml_training_clean = ml_training_data.copy()

print(f"   Records after filtering: {len(ml_training_clean):,}")
print(f"   Remaining subcategories: {ml_training_clean['Target_SubCategory'].nunique()}")

# Create balanced dataset
print(f"\n🔧 Creating balanced dataset (max {max_samples_per_class} per class)...")
balanced_samples = []

for subcategory in ml_training_clean['Target_SubCategory'].value_counts().index:
    subcat_data = ml_training_clean[ml_training_clean['Target_SubCategory'] == subcategory]
    
    if len(subcat_data) > max_samples_per_class:
        # Stratified sampling to maintain temporal distribution
        sampled = subcat_data.sample(n=max_samples_per_class, random_state=42)
    else:
        sampled = subcat_data.copy()
    
    balanced_samples.append(sampled)

balanced_training_data = pd.concat(balanced_samples, ignore_index=True)

# Step 9: Create Temporal Validation Split
print("\n=== STEP 9: CREATE TEMPORAL VALIDATION SPLIT ===")

if 'DateCreated' in ml_training_clean.columns:
    # Sort by date
    ml_training_clean = ml_training_clean.sort_values('DateCreated')
    
    # Use last 20% as temporal holdout
    split_idx = int(len(ml_training_clean) * 0.8)
    temporal_train = ml_training_clean.iloc[:split_idx]
    temporal_test = ml_training_clean.iloc[split_idx:]
    
    print(f"📅 Temporal split created:")
    print(f"   Train: {len(temporal_train):,} records (up to {temporal_train['DateCreated'].max().date()})")
    print(f"   Test: {len(temporal_test):,} records (from {temporal_test['DateCreated'].min().date()})")

# Step 10: Save Enhanced Datasets
print("\n=== STEP 10: SAVE ENHANCED ML DATASETS ===")

# Define final feature set
all_features = []
for features in feature_categories.values():
    all_features.extend(features)

# Add additional important features
additional_features = [
    'Age_Income_Interaction', 'Product_Diversity_Score', 'Premium_Affordability_Buffer',
    'Family_Protection_Priority', 'CI_Vulnerability', 'Wealth_Accumulation_Opp',
    'Target_SubCategory', 'DateCreated', 'ClientId'
]

final_features = list(set(all_features + additional_features))
available_final_features = [f for f in final_features if f in ml_training_clean.columns]

# Save datasets
datasets_to_save = {
    'ML_TRAINING_ENHANCED_FULL_V3.xlsx': ml_training_clean[available_final_features],
    'ML_TRAINING_ENHANCED_BALANCED_V3.xlsx': balanced_training_data[available_final_features],
}

if 'DateCreated' in ml_training_clean.columns:
    datasets_to_save['ML_TRAINING_TEMPORAL_TRAIN_V3.xlsx'] = temporal_train[available_final_features]
    datasets_to_save['ML_TRAINING_TEMPORAL_TEST_V3.xlsx'] = temporal_test[available_final_features]

for filename, dataset in datasets_to_save.items():
    dataset.to_excel(filename, index=False)
    print(f"💾 Saved: {filename} ({len(dataset):,} records)")

# Step 11: Generate Enhanced Summary Report
print("\n=== STEP 11: ENHANCED SUMMARY REPORT ===")

# Calculate enhanced metrics
enhancement_metrics = {
    'Original_Features': len(clients_df.columns),
    'Enhanced_Features': len(available_final_features),
    'New_Behavioral_Features': sum(1 for f in ['Engagement_Score', 'Financial_Maturity_Index', 'Digital_Adoption_Score'] if f in available_final_features),
    'New_Temporal_Features': sum(1 for f in ['Client_Tenure_Years', 'Days_Since_Last_FNA', 'Years_With_Insurance'] if f in available_final_features),
    'New_Business_Features': sum(1 for f in ['Underinsured', 'Retirement_Planning_Need', 'Protection_Gap_Score'] if f in available_final_features),
    'Feature_Completeness': f"{(ml_training_clean[available_final_features].notna().sum().sum() / (len(ml_training_clean) * len(available_final_features)) * 100):.1f}%"
}

with open('ENHANCED_PHASE_2_REPORT.txt', 'w') as f:
    f.write("=== ENHANCED PHASE 2: TARGET VARIABLE ENGINEERING REPORT ===\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    
    f.write("ENHANCEMENT METRICS:\n")
    for key, value in enhancement_metrics.items():
        f.write(f"  {key}: {value}\n")
    
    f.write("\nFEATURE QUALITY BY CATEGORY:\n")
    for category, metrics in feature_quality_summary.items():
        f.write(f"  {category}: {metrics['available']} features, {metrics['avg_completeness']*100:.1f}% complete\n")
    
    f.write(f"\nTARGET DISTRIBUTION (Top 10):\n")
    for i, (subcat, count) in enumerate(target_dist.head(10).items(), 1):
        f.write(f"  {i}. {subcat}: {count:,} ({count/len(ml_training_clean)*100:.1f}%)\n")

print(f"💾 Saved: ENHANCED_PHASE_2_REPORT.txt")

print(f"\n🎉 ENHANCED PHASE 2 COMPLETE!")
print(f"✅ Created high-quality ML training data with {len(available_final_features)} features")
print(f"✅ Enhanced features capture behavioral, temporal, and business signals")
print(f"✅ Multiple validation datasets created for robust model evaluation")
print(f"✅ Ready for Phase 3: Advanced Model Training")

=== ENHANCED PHASE 2: TARGET VARIABLE ENGINEERING ===
Goal: Create high-quality ML training data with validated targets

=== STEP 1: LOAD ENHANCED DATASETS ===
✅ Loaded enhanced client dataset: 65,010 unique clients
   Total features: 130
✅ Loaded emfc2productsolution: 43,501 records
✅ Loaded ProductMainPlan: 1,532 records
✅ Loaded ProductType: 4 records
✅ Loaded ProductCategory: 10 records
✅ Loaded productsubcategory: 39 records
✅ Loaded emfc2fna: 51,772 records

=== STEP 2: BUILD VALIDATED PRODUCT HIERARCHY ===

🔍 Validating product data quality...
   Main plans with null SubCategoryId: 2
   Subcategories with null CategoryId: 0
   Categories with null TypeId: 0
✅ Products with complete hierarchy: 1,485 / 1,532

=== STEP 3: ANALYZE RECOMMENDATIONS WITH BUSINESS LOGIC ===

🔧 Applying business logic filters...
📊 RECOMMENDATION FILTERING:
   Total recommendations: 43,501
   Approved: 43,500
   Rejection rate: 0.0%
   Recommendations with FNA context: 43,311

🎯 SUBCATEGORY DISTRIBUTION (

In [7]:
"""
=== ENHANCED PHASE 3: ADVANCED ML MODEL TRAINING ===
Enhanced version with ensemble methods, better feature selection, and business metrics
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("=== ENHANCED PHASE 3: ADVANCED ML MODEL TRAINING ===")
print("Goal: Build high-performance ensemble model with business metrics")

# Step 1: Load Enhanced Training Data
print("\n=== STEP 1: LOAD ENHANCED TRAINING DATA ===")

# Load datasets
try:
    full_data = pd.read_excel('ML_TRAINING_ENHANCED_FULL_V3.xlsx')
    balanced_data = pd.read_excel('ML_TRAINING_ENHANCED_BALANCED_V3.xlsx')
    
    # Try to load temporal splits if available
    try:
        temporal_train = pd.read_excel('ML_TRAINING_TEMPORAL_TRAIN_V3.xlsx')
        temporal_test = pd.read_excel('ML_TRAINING_TEMPORAL_TEST_V3.xlsx')
        has_temporal = True
    except:
        has_temporal = False
        print("   ℹ️  Temporal splits not available")
    
except Exception as e:
    print(f"   ⚠️  Using fallback to original datasets: {e}")
    full_data = pd.read_excel('ML_TRAINING_SUBCATEGORIES_FULL_V2.xlsx')
    balanced_data = pd.read_excel('ML_TRAINING_SUBCATEGORIES_BALANCED_V2.xlsx')
    has_temporal = False

print(f"✅ Full dataset: {len(full_data):,} records, {full_data['Target_SubCategory'].nunique()} classes")
print(f"✅ Balanced dataset: {len(balanced_data):,} records, {balanced_data['Target_SubCategory'].nunique()} classes")
if has_temporal:
    print(f"✅ Temporal train: {len(temporal_train):,} records")
    print(f"✅ Temporal test: {len(temporal_test):,} records")

# Step 2: Enhanced Feature Engineering
print("\n=== STEP 2: ENHANCED FEATURE PREPARATION ===")

def prepare_enhanced_features(df):
    """Prepare features with enhanced encoding and scaling"""
    
    print(f"🔧 Preparing features for {len(df):,} records...")
    
    # Separate feature types
    categorical_features = []
    numerical_features = []
    binary_features = []
    
    # Identify feature types
    for col in df.columns:
        if col in ['Target_SubCategory', 'ClientId', 'DateCreated']:
            continue
            
        if df[col].dtype == 'object':
            categorical_features.append(col)
        elif df[col].nunique() == 2:
            binary_features.append(col)
        else:
            numerical_features.append(col)
    
    print(f"   📊 Features identified:")
    print(f"      Numerical: {len(numerical_features)}")
    print(f"      Categorical: {len(categorical_features)}")
    print(f"      Binary: {len(binary_features)}")
    
    X = pd.DataFrame(index=df.index)
    encoders = {}
    
    # Process numerical features with outlier handling
    for feature in numerical_features:
        if feature in df.columns:
            # Handle outliers using IQR method
            Q1 = df[feature].quantile(0.25)
            Q3 = df[feature].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Clip outliers
            X[feature] = df[feature].clip(lower=lower_bound, upper=upper_bound)
            
            # Fill missing with median
            median_val = X[feature].median()
            X[feature] = X[feature].fillna(median_val)
    
    # Process binary features
    for feature in binary_features:
        if feature in df.columns:
            X[feature] = df[feature].fillna(0).astype(int)
    
    # Process categorical features with target encoding for high cardinality
    for feature in categorical_features:
        if feature in df.columns:
            if df[feature].nunique() > 20:
                # Use target encoding for high cardinality
                print(f"      Using target encoding for {feature} ({df[feature].nunique()} unique values)")
                # Simple mean encoding (in production, use cross-validation)
                target_mean = df.groupby(feature)['Target_SubCategory'].apply(
                    lambda x: x.value_counts().index[0] if len(x) > 0 else 'Unknown'
                )
                X[feature + '_encoded'] = df[feature].map(target_mean).fillna('Unknown')
                
                # Convert to numeric
                le = LabelEncoder()
                X[feature + '_encoded'] = le.fit_transform(X[feature + '_encoded'].astype(str))
                encoders[feature] = {'type': 'target', 'mapping': target_mean, 'le': le}
            else:
                # Use label encoding for low cardinality
                le = LabelEncoder()
                X[feature] = le.fit_transform(df[feature].fillna('Unknown').astype(str))
                encoders[feature] = {'type': 'label', 'le': le}
    
    # Encode target variable
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(df['Target_SubCategory'])
    
    print(f"   ✅ Feature matrix: {X.shape}")
    
    return X, y, encoders, target_encoder

# Prepare features for all datasets
X_full, y_full, encoders_full, target_encoder_full = prepare_enhanced_features(full_data)
X_balanced, y_balanced, encoders_balanced, target_encoder_balanced = prepare_enhanced_features(balanced_data)

if has_temporal:
    X_temporal_train, y_temporal_train, _, _ = prepare_enhanced_features(temporal_train)
    X_temporal_test, y_temporal_test, _, _ = prepare_enhanced_features(temporal_test)

# Step 3: Advanced Feature Selection
print("\n=== STEP 3: ADVANCED FEATURE SELECTION ===")

def select_diverse_features(X, y, n_features=30, correlation_threshold=0.85):
    """Select features using mutual information and correlation filtering"""
    
    print(f"🔍 Selecting top {n_features} diverse features...")
    
    # Calculate mutual information
    mi_scores = mutual_info_classif(X, y, random_state=42)
    mi_df = pd.DataFrame({
        'feature': X.columns,
        'mi_score': mi_scores
    }).sort_values('mi_score', ascending=False)
    
    # Calculate correlation matrix
    corr_matrix = X.corr().abs()
    
    # Select features iteratively to avoid high correlation
    selected_features = []
    for _, row in mi_df.iterrows():
        feature = row['feature']
        
        # Check correlation with already selected features
        if selected_features:
            max_corr = corr_matrix.loc[feature, selected_features].max()
            if max_corr < correlation_threshold:
                selected_features.append(feature)
        else:
            selected_features.append(feature)
        
        if len(selected_features) >= n_features:
            break
    
    print(f"   ✅ Selected {len(selected_features)} features with MI scores and low correlation")
    
    # Show top features
    print(f"\n   🏆 TOP 10 SELECTED FEATURES:")
    for i, feat in enumerate(selected_features[:10], 1):
        mi_score = mi_df[mi_df['feature'] == feat]['mi_score'].values[0]
        print(f"      {i:2d}. {feat}: MI={mi_score:.4f}")
    
    return selected_features

# Select features using balanced dataset
selected_features = select_diverse_features(X_balanced, y_balanced, n_features=40)

# Step 4: Create Train-Test Splits
print("\n=== STEP 4: CREATE TRAIN-TEST SPLITS ===")

if has_temporal:
    print("📅 Using temporal splits...")
    X_train = X_temporal_train[selected_features]
    X_test = X_temporal_test[selected_features]
    y_train = y_temporal_train
    y_test = y_temporal_test
else:
    print("📊 Creating stratified random splits...")
    # Use balanced dataset for development
    X_train, X_test, y_train, y_test = train_test_split(
        X_balanced[selected_features], y_balanced,
        test_size=0.2,
        random_state=42,
        stratify=y_balanced
    )

# Also prepare full dataset split for final training
X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(
    X_full[selected_features], y_full,
    test_size=0.2,
    random_state=42,
    stratify=y_full
)

print(f"✅ Training set: {len(X_train):,} samples")
print(f"✅ Test set: {len(X_test):,} samples")

# Step 5: Build Ensemble Model
print("\n=== STEP 5: BUILD ADVANCED ENSEMBLE MODEL ===")

# Scale features for some models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models with optimized parameters
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=20,
        min_samples_leaf=10,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )),
    ('xgb', XGBClassifier(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        random_state=42,
        n_jobs=-1
    )),
    ('lgbm', LGBMClassifier(
        n_estimators=200,
        max_depth=15,
        learning_rate=0.1,
        num_leaves=31,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
]

# Create stacking ensemble
print("\n🔧 Training stacking ensemble...")
meta_learner = LogisticRegression(
    multi_class='multinomial',
    max_iter=1000,
    class_weight='balanced',
    random_state=42
)

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,  # 5-fold cross-validation for meta-learner
    stack_method='predict_proba',
    n_jobs=-1
)

# Train ensemble
stacking_model.fit(X_train, y_train)
print("✅ Stacking ensemble trained")

# Step 6: Model Evaluation with Business Metrics
print("\n=== STEP 6: COMPREHENSIVE MODEL EVALUATION ===")

def evaluate_model_comprehensive(model, X_test, y_test, target_encoder, model_name="Model"):
    """Comprehensive evaluation including business metrics"""
    
    print(f"\n📊 Evaluating {model_name}...")
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    # Basic metrics
    accuracy = accuracy_score(y_test, y_pred)
    
    # Top-k accuracy
    def top_k_accuracy(y_true, y_pred_proba, k):
        top_k_preds = np.argsort(y_pred_proba, axis=1)[:, -k:]
        return np.mean([y_true[i] in top_k_preds[i] for i in range(len(y_true))])
    
    top1_acc = accuracy
    top3_acc = top_k_accuracy(y_test, y_pred_proba, k=3)
    top5_acc = top_k_accuracy(y_test, y_pred_proba, k=5)
    
    # Business metrics
    # 1. Coverage metric: What percentage of clients get at least one correct recommendation in top-3
    coverage = top3_acc
    
    # 2. Confidence calibration: How well-calibrated are the probabilities
    top_confidences = np.max(y_pred_proba, axis=1)
    avg_confidence = np.mean(top_confidences)
    
    # 3. Diversity metric: How diverse are the recommendations
    top3_preds = np.argsort(y_pred_proba, axis=1)[:, -3:]
    unique_recommendations = len(np.unique(top3_preds))
    diversity_score = unique_recommendations / (3 * len(y_test))
    
    results = {
        'accuracy': accuracy,
        'top1_accuracy': top1_acc,
        'top3_accuracy': top3_acc,
        'top5_accuracy': top5_acc,
        'coverage': coverage,
        'avg_confidence': avg_confidence,
        'diversity_score': diversity_score
    }
    
    print(f"   ✅ Accuracy Metrics:")
    print(f"      Top-1: {top1_acc:.4f} ({top1_acc*100:.1f}%)")
    print(f"      Top-3: {top3_acc:.4f} ({top3_acc*100:.1f}%)")
    print(f"      Top-5: {top5_acc:.4f} ({top5_acc*100:.1f}%)")
    
    print(f"   📈 Business Metrics:")
    print(f"      Coverage (Top-3): {coverage:.4f} ({coverage*100:.1f}%)")
    print(f"      Avg Confidence: {avg_confidence:.4f}")
    print(f"      Recommendation Diversity: {diversity_score:.4f}")
    
    return results, y_pred_proba

# Evaluate stacking ensemble
ensemble_results, ensemble_proba = evaluate_model_comprehensive(
    stacking_model, X_test, y_test, target_encoder_balanced, "Stacking Ensemble"
)

# Step 7: Compare with Individual Models
print("\n=== STEP 7: MODEL COMPARISON ===")

model_comparison = {'Stacking Ensemble': ensemble_results}

# Train and evaluate individual models for comparison
for name, model in base_models:
    model.fit(X_train, y_train)
    results, _ = evaluate_model_comprehensive(model, X_test, y_test, target_encoder_balanced, name.upper())
    model_comparison[name.upper()] = results

# Create comparison DataFrame
comparison_df = pd.DataFrame(model_comparison).T
print("\n📊 MODEL PERFORMANCE COMPARISON:")
print(comparison_df.round(4))

# Select best model
best_model_name = comparison_df['top3_accuracy'].idxmax()
print(f"\n🏆 Best model: {best_model_name} with {comparison_df.loc[best_model_name, 'top3_accuracy']*100:.1f}% Top-3 accuracy")

# Step 8: Train Final Model on Full Dataset
print("\n=== STEP 8: TRAIN FINAL MODEL ON FULL DATASET ===")

print("🔧 Training final ensemble on full dataset...")
final_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

# Scale full dataset
X_full_train_scaled = scaler.fit_transform(X_full_train)
X_full_test_scaled = scaler.transform(X_full_test)

# Train on full dataset
final_model.fit(X_full_train, y_full_train)

# Evaluate on full test set
final_results, final_proba = evaluate_model_comprehensive(
    final_model, X_full_test, y_full_test, target_encoder_full, "Final Ensemble (Full Data)"
)

# Step 9: Probability Calibration
print("\n=== STEP 9: PROBABILITY CALIBRATION ===")

print("🎯 Calibrating probabilities for reliable confidence scores...")
calibrated_model = CalibratedClassifierCV(
    final_model, 
    method='sigmoid', 
    cv=3
)
calibrated_model.fit(X_full_train, y_full_train)

# Evaluate calibrated model
cal_results, cal_proba = evaluate_model_comprehensive(
    calibrated_model, X_full_test, y_full_test, target_encoder_full, "Calibrated Ensemble"
)

# Step 10: Feature Importance Analysis
print("\n=== STEP 10: FEATURE IMPORTANCE ANALYSIS ===")

# Get feature importance from Random Forest in the ensemble
rf_model = dict(base_models)['rf']
rf_model.fit(X_full_train, y_full_train)

feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n🏆 TOP 15 MOST IMPORTANT FEATURES:")
for i, (_, row) in enumerate(feature_importance.head(15).iterrows(), 1):
    print(f"   {i:2d}. {row['Feature']:<30}: {row['Importance']:.4f}")

# Visualize feature importance
plt.figure(figsize=(10, 8))
top_features_plot = feature_importance.head(20)
plt.barh(range(len(top_features_plot)), top_features_plot['Importance'])
plt.yticks(range(len(top_features_plot)), top_features_plot['Feature'])
plt.xlabel('Importance Score')
plt.title('Top 20 Feature Importances')
plt.tight_layout()
plt.savefig('feature_importance_enhanced.png', dpi=300, bbox_inches='tight')
plt.close()

# Step 11: Create Enhanced Prediction Function
print("\n=== STEP 11: CREATE ENHANCED PREDICTION FUNCTION ===")

def predict_with_business_logic(client_features, model, target_encoder, selected_features, 
                               scaler=None, k=3, min_confidence=0.1):
    """
    Enhanced prediction function with business logic
    
    Returns:
        List of tuples: [(subcategory, confidence, reasoning), ...]
    """
    
    # Ensure features are in correct order
    X_pred = client_features[selected_features]
    
    # Scale if scaler provided
    if scaler:
        X_pred = scaler.transform(X_pred)
    
    # Get prediction probabilities
    pred_proba = model.predict_proba(X_pred)[0]
    
    # Get top-k predictions with minimum confidence threshold
    valid_indices = np.where(pred_proba >= min_confidence)[0]
    if len(valid_indices) < k:
        # If not enough predictions meet threshold, take top k anyway
        top_k_indices = np.argsort(pred_proba)[-k:][::-1]
    else:
        # Take top k from valid predictions
        valid_proba = pred_proba[valid_indices]
        valid_top_k = np.argsort(valid_proba)[-k:][::-1]
        top_k_indices = valid_indices[valid_top_k]
    
    # Get subcategories and confidences
    top_k_subcategories = target_encoder.inverse_transform(top_k_indices)
    top_k_confidences = pred_proba[top_k_indices]
    
    # Add business reasoning
    predictions_with_reasoning = []
    for subcat, conf in zip(top_k_subcategories, top_k_confidences):
        # Add simple reasoning based on subcategory
        if 'Term' in subcat:
            reasoning = "Basic protection need identified"
        elif 'Investment' in subcat:
            reasoning = "Wealth accumulation opportunity"
        elif 'Shield' in subcat or 'Hospital' in subcat:
            reasoning = "Healthcare coverage gap"
        elif 'Critical' in subcat:
            reasoning = "Critical illness protection recommended"
        elif 'Whole_Life' in subcat:
            reasoning = "Long-term protection and savings"
        elif 'Retirement' in subcat or 'Annuity' in subcat:
            reasoning = "Retirement planning need"
        else:
            reasoning = "Comprehensive financial planning"
        
        predictions_with_reasoning.append((subcat, conf, reasoning))
    
    return predictions_with_reasoning

# Test enhanced prediction function
print("\n🧪 Testing enhanced prediction function...")
test_sample = X_full_test.iloc[[0]]
predictions = predict_with_business_logic(
    test_sample, calibrated_model, target_encoder_full, 
    selected_features, scaler, k=3
)

print("   Sample prediction:")
for i, (subcat, conf, reasoning) in enumerate(predictions, 1):
    print(f"   {i}. {subcat}: {conf:.3f} ({conf*100:.1f}%) - {reasoning}")

# Step 12: Save Enhanced Model Package
print("\n=== STEP 12: SAVE ENHANCED MODEL PACKAGE ===")

# Create comprehensive model package
model_package = {
    'model': calibrated_model,
    'target_encoder': target_encoder_full,
    'feature_encoders': encoders_full,
    'selected_features': selected_features,
    'scaler': scaler,
    'model_performance': cal_results,
    'comparison_results': comparison_df.to_dict(),
    'feature_importance': feature_importance.to_dict(),
    'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_type': 'Calibrated Stacking Ensemble',
    'base_models': [name for name, _ in base_models]
}

# Save model package
joblib.dump(model_package, 'ENHANCED_SUBCATEGORY_MODEL_V3.pkl')
print("💾 Saved: ENHANCED_SUBCATEGORY_MODEL_V3.pkl")

# Save detailed performance report
with open('ENHANCED_MODEL_REPORT_V3.txt', 'w') as f:
    f.write("=== ENHANCED ML MODEL PERFORMANCE REPORT ===\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    
    f.write("MODEL ARCHITECTURE:\n")
    f.write(f"  Type: Calibrated Stacking Ensemble\n")
    f.write(f"  Base Models: {', '.join([name.upper() for name, _ in base_models])}\n")
    f.write(f"  Meta Learner: Multinomial Logistic Regression\n")
    f.write(f"  Calibration: Platt Scaling (Sigmoid)\n\n")
    
    f.write("PERFORMANCE METRICS:\n")
    for metric, value in cal_results.items():
        f.write(f"  {metric}: {value:.4f} ({value*100:.1f}%)\n")
    
    f.write("\nMODEL COMPARISON:\n")
    f.write(comparison_df.round(4).to_string())
    
    f.write("\n\nTOP 15 FEATURES:\n")
    for i, (_, row) in enumerate(feature_importance.head(15).iterrows(), 1):
        f.write(f"  {i:2d}. {row['Feature']}: {row['Importance']:.4f}\n")
    
    f.write(f"\nTRAINING DETAILS:\n")
    f.write(f"  Training samples: {len(X_full_train):,}\n")
    f.write(f"  Test samples: {len(X_full_test):,}\n")
    f.write(f"  Number of features: {len(selected_features)}\n")
    f.write(f"  Number of classes: {len(target_encoder_full.classes_)}\n")

print("💾 Saved: ENHANCED_MODEL_REPORT_V3.txt")

# Save feature importance
feature_importance.to_excel('ENHANCED_FEATURE_IMPORTANCE_V3.xlsx', index=False)
print("💾 Saved: ENHANCED_FEATURE_IMPORTANCE_V3.xlsx")

# Step 13: Final Summary
print("\n=== ENHANCED PHASE 3 COMPLETE ===")
print(f"\n🎉 FINAL MODEL PERFORMANCE:")
print(f"✅ Top-1 Accuracy: {cal_results['top1_accuracy']*100:.1f}%")
print(f"✅ Top-3 Accuracy: {cal_results['top3_accuracy']*100:.1f}%")
print(f"✅ Top-5 Accuracy: {cal_results['top5_accuracy']*100:.1f}%")
print(f"✅ Business Coverage: {cal_results['coverage']*100:.1f}%")
print(f"✅ Model Type: Calibrated Stacking Ensemble")

print(f"\n🚀 KEY IMPROVEMENTS:")
print(f"   • Advanced ensemble architecture (3 base models + meta-learner)")
print(f"   • Enhanced feature engineering (behavioral, temporal, business)")
print(f"   • Probability calibration for reliable confidence scores")
print(f"   • Business logic integration in predictions")
print(f"   • Comprehensive evaluation metrics")

print(f"\n📁 DELIVERABLES:")
print(f"   • ENHANCED_SUBCATEGORY_MODEL_V3.pkl (production-ready model)")
print(f"   • ENHANCED_MODEL_REPORT_V3.txt (detailed performance)")
print(f"   • ENHANCED_FEATURE_IMPORTANCE_V3.xlsx (feature analysis)")
print(f"   • feature_importance_enhanced.png (visualization)")

print(f"\n💡 USAGE EXAMPLE:")
print("""
# Load model
import joblib
model_package = joblib.load('ENHANCED_SUBCATEGORY_MODEL_V3.pkl')

# Prepare client data
client_data = pd.DataFrame({...})  # Client features

# Make predictions with business logic
predictions = predict_with_business_logic(
    client_data,
    model_package['model'],
    model_package['target_encoder'],
    model_package['selected_features'],
    model_package['scaler'],
    k=3
)

# Display recommendations
for i, (subcategory, confidence, reasoning) in enumerate(predictions, 1):
    print(f"{i}. {subcategory}: {confidence:.1%} - {reasoning}")
""")

=== ENHANCED PHASE 3: ADVANCED ML MODEL TRAINING ===
Goal: Build high-performance ensemble model with business metrics

=== STEP 1: LOAD ENHANCED TRAINING DATA ===
✅ Full dataset: 41,178 records, 15 classes
✅ Balanced dataset: 27,365 records, 15 classes
✅ Temporal train: 32,942 records
✅ Temporal test: 8,236 records

=== STEP 2: ENHANCED FEATURE PREPARATION ===
🔧 Preparing features for 41,178 records...
   📊 Features identified:
      Numerical: 14
      Categorical: 7
      Binary: 3
      Using target encoding for Nationality (73 unique values)
   ✅ Feature matrix: (41178, 24)
🔧 Preparing features for 27,365 records...
   📊 Features identified:
      Numerical: 14
      Categorical: 7
      Binary: 3
      Using target encoding for Nationality (69 unique values)
   ✅ Feature matrix: (27365, 24)
🔧 Preparing features for 32,942 records...
   📊 Features identified:
      Numerical: 14
      Categorical: 7
      Binary: 3
      Using target encoding for Nationality (72 unique values)
   