In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from imblearn.pipeline import Pipeline as ImbPipeline  # Pipeline that can include SMOTE
from imblearn.over_sampling import SMOTE
import warnings

# Blanket-silence library warnings to keep the training log readable.
# NOTE(review): this also hides deprecation warnings raised by pandas/sklearn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

print("Starting Fraud Model Training Pipeline...")

# 1. Load Data
try:
    df = pd.read_csv('indian_multi_insurance_fraud_dataset.csv')
except FileNotFoundError:
    print("Error: indian_multi_insurance_fraud_dataset.csv not found.")
    # Abort with a non-zero status so callers (shell scripts, CI, schedulers)
    # can tell the run failed. SystemExit is raised directly instead of using
    # the interactive `exit()` helper, which is only injected by the `site`
    # module and exits with status 0 when called without arguments.
    raise SystemExit(1) from None

# 2. Global Feature Engineering
print("Performing global feature engineering...")

# Parse the raw date columns; unparseable values become NaT (errors='coerce').
date_columns = ['policy_start_date', 'claim_filing_date', 'incident_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Derived durations, in whole days, relative to the claim filing date.
df['policy_duration_days'] = (df['claim_filing_date'] - df['policy_start_date']).dt.days
df['incident_to_claim_days'] = (df['claim_filing_date'] - df['incident_date']).dt.days

# Rows with an invalid date yield NaN durations; fill them with the column median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form is deprecated
# in pandas 2.x and does not modify `df` once Copy-on-Write is enabled.
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split performed later in this script.
for duration_col in ('policy_duration_days', 'incident_to_claim_days'):
    df[duration_col] = df[duration_col].fillna(df[duration_col].median())


# 3. Feature definitions
# The free-text columns `accident_location` and `treatment_details` are left
# out on purpose in V1: they are high-cardinality text and would need dedicated
# NLP processing.

TARGET = 'fraud_reported'

# Features that exist for every insurance type.
common_features = [
    'insured_age',
    'insured_sex',
    'insured_occupation',
    'policy_state',
    'policy_annual_premium',
    'claim_amount',
    'sum_insured',
    'claim_amount_to_sum_insured_ratio',
    'previous_claims_count',
    'policy_renewal_status',
    'premium_payment_delays',
    'coverage_changes_before_claim',
    'policy_duration_days',
    'incident_to_claim_days',
]

# Additional (non free-text) features, keyed by insurance type.
insurance_specific_features = dict(
    health=['claim_duration_days', 'hospital_name'],  # treatment_details excluded
    life=['nominee_relationship'],
    automobile=['auto_make', 'auto_model', 'auto_year', 'third_party_involved'],  # accident_location excluded
    property=['property_type'],
    crop=['crop_type', 'weather_condition'],
    travel=[],
    personal_accident=[],
)

# Preprocessing groups for the common features: the categorical ones are
# listed explicitly, every other common feature is treated as numeric.
CATEGORICAL_COMMON = [
    'insured_sex',
    'insured_occupation',
    'policy_state',
    'policy_renewal_status',
    'premium_payment_delays',
    'coverage_changes_before_claim',
]

NUMERIC_COMMON = [name for name in common_features if name not in CATEGORICAL_COMMON]

# Preprocessing groups for the type-specific features: the numeric ones are
# listed explicitly, the remaining type-specific features are categorical.
NUMERIC_SPECIFIC = dict(
    health=['claim_duration_days'],
    life=[],
    automobile=['auto_year'],
    property=[],
    crop=[],
    travel=[],
    personal_accident=[],
)

CATEGORICAL_SPECIFIC = {
    kind: [name for name in names if name not in NUMERIC_SPECIFIC[kind]]
    for kind, names in insurance_specific_features.items()
}

# 4. Model Training Loop
# One specialised pipeline is trained, evaluated and saved for every insurance
# type present in the data.

artifacts = {}          # insurance type -> filename of the saved pipeline
model_performance = {}  # insurance type -> metrics and feature list


def _build_fraud_pipeline(numeric_features, categorical_features):
    """Return an unfitted preprocessing + SMOTE + random-forest pipeline.

    Args:
        numeric_features: Column names that are median-imputed and standardised.
        categorical_features: Column names that are imputed with the most
            frequent value and one-hot encoded.

    Returns:
        An imblearn pipeline with the steps ``preprocessor``, ``smote`` and
        ``model``.
    """
    # Numerical features: impute missing values with the median, then scale.
    numeric_pipeline = ImbPipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical features: impute with the most frequent value, then one-hot
    # encode. handle_unknown='ignore' keeps prediction from failing on a
    # category that was not present during fitting.
    categorical_pipeline = ImbPipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Route each column group to its own sub-pipeline.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_features),
            ('cat', categorical_pipeline, categorical_features)
        ],
        remainder='passthrough'  # Pass through any columns we missed (should be none)
    )

    # ImbPipeline is used (rather than a plain sklearn Pipeline) so the SMOTE
    # resampling step can be part of the pipeline.
    return ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('model', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
    ])


# Rows without an insurance type cannot be assigned to a model, so NaN is
# dropped from the set of types instead of crashing on `.upper()` below.
for insurance_type in df['insurance_type'].dropna().unique():
    type_label = str(insurance_type).upper()
    print(f"\n{'='*50}\nTraining model for: {type_label}\n{'='*50}")

    # 1. Filter data for the specific insurance type
    type_df = df[df['insurance_type'] == insurance_type].copy()

    # 2. Define features for this specific model (unknown types fall back to
    #    the common features only).
    specific_cats = CATEGORICAL_SPECIFIC.get(insurance_type, [])
    specific_nums = NUMERIC_SPECIFIC.get(insurance_type, [])

    all_numeric_features = NUMERIC_COMMON + specific_nums
    all_categorical_features = CATEGORICAL_COMMON + specific_cats

    all_features = all_numeric_features + all_categorical_features

    X = type_df[all_features]
    y = type_df[TARGET]

    # A subset with a single target class cannot be resampled by SMOTE or
    # scored with ROC-AUC; skip it rather than abort the remaining models.
    if y.nunique() < 2:
        print(f"Skipping {insurance_type}: target '{TARGET}' has fewer than two classes.")
        continue

    # 3. Build the full pipeline (preprocessing + SMOTE + model)
    full_pipeline = _build_fraud_pipeline(all_numeric_features, all_categorical_features)

    # 4. Split data (stratified on the target) and train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    print(f"Training {insurance_type} model with {len(all_features)} features on {len(X_train)} samples...")
    full_pipeline.fit(X_train, y_train)

    # 5. Evaluate on the held-out split
    y_pred = full_pipeline.predict(X_test)
    y_proba = full_pipeline.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)

    print(f"\n--- {type_label} Model Performance ---")
    print(f"Test ROC-AUC: {roc_auc:.4f}")
    print(f"Test F1-Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # 6. Save the entire pipeline object (model + preprocessors)
    artifact_filename = f"{insurance_type}_fraud_pipeline.joblib"
    joblib.dump(full_pipeline, artifact_filename)
    print(f"✓ Saved pipeline to {artifact_filename}")

    # Store results
    artifacts[insurance_type] = artifact_filename
    model_performance[insurance_type] = {'roc_auc': roc_auc, 'f1_score': f1, 'features_used': all_features}


# Closing banner once every per-type pipeline has been processed.
print(f"\n{'='*50}\nTRAINING COMPLETE\n{'='*50}")
print("All model pipelines have been trained and saved.")

# One summary row per trained model: name, ROC-AUC and F1 on the test split.
print("\nModel Performance Summary:")
for insurance_name, scores in model_performance.items():
    auc_value = scores['roc_auc']
    f1_value = scores['f1_score']
    print(f"{insurance_name:<20} | ROC-AUC: {auc_value:.4f} | F1: {f1_value:.4f}")

Starting Fraud Model Training Pipeline...
Performing global feature engineering...

Training model for: PROPERTY
Training property model with 15 features on 1947 samples...

--- PROPERTY Model Performance ---
Test ROC-AUC: 0.7416
Test F1-Score: 0.2745

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.92      0.86       387
           1       0.40      0.21      0.27       100

    accuracy                           0.77       487
   macro avg       0.61      0.56      0.57       487
weighted avg       0.73      0.77      0.74       487

Confusion Matrix:
[[355  32]
 [ 79  21]]
✓ Saved pipeline to property_fraud_pipeline.joblib

Training model for: LIFE
Training life model with 15 features on 1924 samples...

--- LIFE Model Performance ---
Test ROC-AUC: 0.8662
Test F1-Score: 0.6390

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.94      0.88       337
           1   