In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor

from preprocess import preprocess_data

def define_segments(df):
    """Build the rule-based customer-segment masks for ``df``.

    Each segment is a boolean mask over the rows of ``df``. Segments are not
    mutually exclusive: a row may satisfy several rules, or none at all.
    Quantile thresholds are computed from the ``df`` passed in.

    Args:
        df: DataFrame holding the columns referenced by the rules below.
            The ``*_is_missing`` flag columns are optional.

    Returns:
        dict mapping segment name to its boolean mask, in definition order.
    """
    # Reliability starts at 1.0 and loses 0.1 for every missing-value flag
    # that is set on a row. The subtraction is applied one flag at a time;
    # when no flag column exists the score stays the plain float 1.0.
    reliability = 1.0
    for flag in ('Occupation_is_missing',
                 'Previous Claims_is_missing',
                 'Credit Score_is_missing'):
        if flag not in df.columns:
            continue
        reliability = reliability - df[flag] * 0.1

    income = df['Annual Income']
    credit = df['Credit Score']
    health = df['Health Score']
    policy = df['Policy Type']
    age = df['Age']

    # Owners of houses/condos with above-threshold income and credit on a
    # non-basic policy; only highly reliable records qualify.
    high_value_property = (
        df['Property Type'].isin(['House', 'Condo'])
        & (income >= income.quantile(0.55))
        & ((policy == 'Premium') | (policy == 'Standard'))
        & (credit > credit.quantile(0.45))
        & (reliability >= 0.8)
    )

    # Above-median credit and health, more than one year insured.
    low_risk_premium = (
        (credit > credit.quantile(0.5))
        & (health > health.quantile(0.5))
        & (df['Insurance Duration'] > 1)
        & (income > income.quantile(0.4))
        & (reliability >= 0.7)
    )

    # Younger customers or frequent exercisers with good health and income.
    young_or_active = (
        (age <= age.quantile(0.4))
        | df['Exercise Frequency'].isin(['Daily', 'Weekly'])
    )
    healthy_professional = (
        young_or_active
        & (income > income.quantile(0.45))
        & (health > health.quantile(0.5))
        & (reliability >= 0.7)
    )

    # Married customers with dependents living outside urban areas.
    family_premium = (
        (df['Number of Dependents'] >= 1)
        & df['Location'].isin(['Suburban', 'Rural'])
        & (df['Marital Status'] == 'Married')
        & (income > income.quantile(0.4))
        & (reliability >= 0.8)
    )

    # Basic-policy holders showing at least one risk indicator; this rule
    # applies no reliability filter.
    any_risk_indicator = (
        (income <= income.quantile(0.35))
        | (df['Previous Claims'] >= 2)
        | (credit < credit.quantile(0.35))
    )
    basic_coverage = any_risk_indicator & (policy == 'Basic')

    return {
        'High_Value_Property': high_value_property,
        'Low_Risk_Premium': low_risk_premium,
        'Healthy_Professional': healthy_professional,
        'Family_Premium': family_premium,
        'Basic_Coverage': basic_coverage,
    }

def define_model():
    """Define robust model configurations optimized for high-missing-data scenarios.

    Every entry receives its OWN, freshly constructed estimator. Handing one
    estimator object to several segments is unsafe: fitting an sklearn
    ``Pipeline`` fits its final step in place, so segments sharing an
    instance would all end up holding the model trained on whichever
    segment was fitted last.

    Returns:
        dict mapping a segment name (plus the fallback key ``'default'``) to
        ``{'model': <unfitted estimator>, 'description': <str>}``.
    """

    def build_rf_robust():
        # Conservative RandomForest for general use
        return RandomForestRegressor(
            n_estimators=100,
            max_depth=6,                  # Reduced depth
            min_samples_leaf=50,          # Increased to prevent overfitting
            min_samples_split=100,        # Added to ensure robust splits
            max_features='sqrt',
            bootstrap=True,
            oob_score=True,               # Enable out-of-bag scoring
            n_jobs=-1,
            random_state=42,
        )

    def build_xgb_conservative():
        # XGBoost with strong regularization
        return XGBRegressor(
            n_estimators=100,
            max_depth=4,                  # Very shallow trees
            learning_rate=0.01,           # Slower learning rate
            subsample=0.7,                # Reduced sample size
            colsample_bytree=0.7,         # Feature subsampling
            min_child_weight=10,          # Increased to prevent overfitting
            reg_alpha=1,                  # L1 regularization
            reg_lambda=2,                 # L2 regularization
            random_state=42,
        )

    def build_gbm_simple():
        # Gradient Boosting with focus on robustness
        return GradientBoostingRegressor(
            n_estimators=80,
            max_depth=3,                  # Very shallow trees
            learning_rate=0.01,           # Slower learning rate
            subsample=0.7,                # Subsample for robustness
            min_samples_leaf=50,          # Conservative leaf size
            random_state=42,
        )

    # Segment-specific configurations: (estimator factory, description).
    assignments = {
        'High_Value_Property': (build_rf_robust, 'Robust RF for high-value properties'),
        'Low_Risk_Premium': (build_gbm_simple, 'Simple GBM for low-risk segment'),
        'Healthy_Professional': (build_rf_robust, 'Robust RF for professional segment'),
        'Family_Premium': (build_xgb_conservative, 'Conservative XGBoost for family segment'),
        'Senior_Premium': (build_gbm_simple, 'Simple GBM for senior segment'),
        'Basic_Coverage': (build_rf_robust, 'Robust RF for basic coverage'),
        'default': (build_rf_robust, 'Default robust RF model'),
    }

    # Calling the factory here is what gives each segment a distinct instance.
    return {
        segment: {'model': build(), 'description': description}
        for segment, (build, description) in assignments.items()
    }

def segment_data(df):
    """Compute all segment masks for ``df`` and print their distribution.

    Adds a ``'Default_Segment'`` mask covering every row that no rule-based
    segment claimed, then prints per-segment sizes.

    Note that the printed "Total assigned" is the sum of all segment sizes,
    including ``Default_Segment``; rows that belong to several segments are
    counted once per segment, so it can exceed the number of records.

    Args:
        df: DataFrame accepted by ``define_segments``.

    Returns:
        dict mapping segment name to boolean mask. The rule-based entries
        come from ``define_segments``; ``'Default_Segment'`` is a NumPy
        boolean array.
    """
    segments = define_segments(df)

    # Positional flag array: True once any rule-based segment claims the row.
    covered = np.zeros(len(df), dtype=bool)
    for rule_mask in segments.values():
        covered |= rule_mask

    # Whatever is left over forms the default segment.
    segments['Default_Segment'] = ~covered

    # Print distribution for debugging
    n_rows = len(df)
    print("\nSegment Distribution:")
    membership_total = 0
    for label in segments:
        size = segments[label].sum()
        membership_total += size
        share = (size / n_rows) * 100
        print(f"{label}: {size:,} ({share:.1f}%)")

    # Additional debug info
    print(f"\nTotal records: {n_rows:,}")
    print(f"Total assigned: {membership_total:,}")
    print(f"Records per segment on average: {membership_total/len(segments):,.1f}")

    return segments

class InsuranceSegmentModel:
    """Train and query one premium regressor per customer segment.

    The constructor computes the segment masks (``segment_data``) and the
    per-segment estimator configuration (``define_model``) for the given
    DataFrame. ``train_all_segments`` then fits one pipeline per segment and
    stores the results in ``segment_results``.
    """

    def __init__(self, df):
        """Initialize the model with a DataFrame.

        Args:
            df: Preprocessed data. ``'Premium Amount'`` is treated as the
                target; every other column is a candidate feature.
        """
        self.df = df
        self.processed_df = None
        self.segment_results = {}
        self.cv_results = {}

        # Create standard category mappings at initialization so that every
        # segment pipeline one-hot encodes into the same column layout.
        self.categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
        if 'Premium Amount' in self.categorical_features:
            self.categorical_features.remove('Premium Amount')

        self.category_mappings = {
            col: sorted(df[col].unique()) for col in self.categorical_features
        }

        # Segment data
        self.segments = segment_data(df)

        # Define model
        self.segment_configs = define_model()

    def create_segment_pipeline(self, segment_name):
        """Create an unfitted preprocessing + regression pipeline for a segment.

        Args:
            segment_name: Segment key; names without a dedicated entry in
                ``segment_configs`` use the ``'default'`` configuration.

        Returns:
            sklearn ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
        """
        # Local import: ``clone`` is only needed here.
        from sklearn.base import clone

        config = self.segment_configs.get(segment_name, self.segment_configs['default'])

        # Define feature groups. Only int64/float64 columns count as numeric;
        # columns listed in neither group are not passed to the transformer
        # lists below.
        numeric_features = self.df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'Premium Amount' in numeric_features:
            numeric_features.remove('Premium Amount')

        # Only need encoder for categorical features
        categorical_transformer = OneHotEncoder(
            categories=[self.category_mappings[col] for col in self.categorical_features],
            sparse_output=False,
            handle_unknown='ignore'
        )

        # Create preprocessor - numeric features pass through unchanged
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', 'passthrough', numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            sparse_threshold=0
        )

        # Clone the configured estimator: Pipeline.fit fits its final step in
        # place, so reusing the configured object would let a later segment's
        # fit overwrite the model stored for an earlier segment that maps to
        # the same configuration entry.
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', clone(config['model']))
        ])

        return pipeline

    def evaluate_predictions(self, y_true, y_pred):
        """Calculate comprehensive evaluation metrics.

        Args:
            y_true: Observed target values.
            y_pred: Predicted values, positionally aligned with ``y_true``.

        Returns:
            dict with ``r2_score``, ``mae``, ``rmse``, ``mape`` (percent) and
            ``median_ae``. ``mape`` is averaged over rows whose true value is
            non-zero (a zero target would otherwise turn the metric into
            inf/NaN); it is NaN when every true value is zero.
        """
        true_values = np.asarray(y_true, dtype=float)
        predicted_values = np.asarray(y_pred, dtype=float)
        abs_error = np.abs(true_values - predicted_values)

        nonzero = true_values != 0
        if nonzero.any():
            mape = np.mean(abs_error[nonzero] / np.abs(true_values[nonzero])) * 100
        else:
            mape = np.nan

        return {
            'r2_score': r2_score(y_true, y_pred),
            'mae': mean_absolute_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'mape': mape,
            'median_ae': np.median(abs_error)
        }

    def train_segment_model(self, X_seg, y_seg, segment_name):
        """Train and evaluate a model for a specific customer segment.

        Args:
            X_seg: Feature rows belonging to the segment.
            y_seg: Matching target values.
            segment_name: Segment key used to pick the estimator.

        Returns:
            dict with the fitted pipeline (``'model'``, fitted on the 80%
            training split), train/test metrics, 5-fold cross-validation R2
            scores with their mean and std, and the held-out ``'test_data'``.
        """
        # Create pipeline
        pipeline = self.create_segment_pipeline(segment_name)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_seg, y_seg, test_size=0.2, random_state=42
        )

        # Train model
        pipeline.fit(X_train, y_train)

        # Make predictions
        train_predictions = pipeline.predict(X_train)
        test_predictions = pipeline.predict(X_test)

        # Calculate metrics
        train_metrics = self.evaluate_predictions(y_train, train_predictions)
        test_metrics = self.evaluate_predictions(y_test, test_predictions)

        # Perform cross-validation on the whole segment
        cv_scores = cross_val_score(
            pipeline, X_seg, y_seg,
            cv=5,
            scoring=make_scorer(r2_score),
            n_jobs=-1
        )

        results = {
            'model': pipeline,
            'train_metrics': train_metrics,
            'test_metrics': test_metrics,
            'cv_scores': cv_scores,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'test_data': (X_test, y_test)
        }

        return results

    def analyze_feature_importance(self, segment_name):
        """Analyze feature importance for a specific segment.

        Args:
            segment_name: Segment whose trained model should be inspected.

        Returns:
            DataFrame with columns ``feature`` and ``importance`` sorted by
            descending importance, or ``None`` when the segment has no
            trained model or its regressor exposes no feature importances.
        """
        if segment_name not in self.segment_results:
            return None

        results = self.segment_results[segment_name]
        model = results['model']
        preprocessor = model.named_steps['preprocessor']
        regressor = model.named_steps['regressor']

        # Get feature names after preprocessing
        feature_names = []

        # Get numeric feature names
        num_features = preprocessor.transformers_[0][2]
        feature_names.extend(num_features)

        # Get encoded categorical feature names
        cat_features = preprocessor.transformers_[1][2]
        if len(cat_features) > 0:
            encoder = preprocessor.named_transformers_['cat']
            if hasattr(encoder, 'get_feature_names_out'):
                encoded_features = encoder.get_feature_names_out(cat_features)
                feature_names.extend(encoded_features)

        # Get feature importances
        if hasattr(regressor, 'feature_importances_'):
            importances = regressor.feature_importances_
        else:
            # For ensemble wrappers such as a stacking regressor, average the
            # base estimators. ``estimators_`` holds the fitted estimators
            # themselves; (name, estimator) pairs are tolerated as well.
            base_importances = []
            for entry in getattr(regressor, 'estimators_', []):
                est = entry[1] if isinstance(entry, tuple) else entry
                if hasattr(est, 'feature_importances_'):
                    base_importances.append(est.feature_importances_)
            if not base_importances:
                return None
            importances = np.mean(base_importances, axis=0)

        return pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

    def train_all_segments(self, min_segment_size=100):
        """Train models for all segments and generate a performance summary.

        Args:
            min_segment_size: Segments with fewer rows than this are skipped.

        Returns:
            DataFrame indexed by segment name with one row per entry in
            ``segment_results``; empty (but with the summary columns) when
            no segment has been trained.
        """
        # Define feature and target columns
        feature_cols = [col for col in self.df.columns if col != 'Premium Amount']
        target_col = 'Premium Amount'

        # Use the original dataframe directly
        self.processed_df = self.df.copy()

        # Train models for each segment
        for name, mask in self.segments.items():
            print(f"\nProcessing {name} segment...")
            X_seg = self.processed_df.loc[mask, feature_cols]
            y_seg = self.processed_df.loc[mask, target_col]
            print(f"\nSegment Length {len(X_seg)}...")

            if len(X_seg) < min_segment_size:
                print(f"Skipping {name}: fewer than {min_segment_size} records.")
                continue

            results = self.train_segment_model(X_seg, y_seg, name)
            self.segment_results[name] = results

            print(f"Train R2: {results['train_metrics']['r2_score']:.4f}")
            print(f"Test R2: {results['test_metrics']['r2_score']:.4f}")
            print("\nPrediction Accuracy:")
            print(f"Mean Absolute Error: {results['test_metrics']['mae']:.2f}")
            print(f"Median Absolute Error: {results['test_metrics']['median_ae']:.2f}")
            print(f"Mean % Error: {results['test_metrics']['mape']:.2f}%")
            print(f"RMSE: {results['test_metrics']['rmse']:.2f}")
            print(f"CV Mean R2: {results['cv_mean']:.4f} (+/- {results['cv_std']*2:.4f})")

        # Create performance summary. The size is looked up from each
        # segment's own mask; reusing the training loop's ``mask`` variable
        # here would report the last segment's size on every row.
        summary_rows = {
            name: {
                'segment_size': int(np.asarray(self.segments[name]).sum()),
                'train_r2': results['train_metrics']['r2_score'],
                'test_r2': results['test_metrics']['r2_score'],
                'cv_mean_r2': results['cv_mean'],
                'cv_std_r2': results['cv_std'],
                'mae': results['test_metrics']['mae'],
                'mape': results['test_metrics']['mape']
            }
            for name, results in self.segment_results.items()
        }

        if not summary_rows:
            # Nothing was trained; sorting by 'test_r2' would raise KeyError.
            print("\nNo segment models were trained.")
            return pd.DataFrame(columns=[
                'segment_size', 'train_r2', 'test_r2',
                'cv_mean_r2', 'cv_std_r2', 'mae', 'mape'
            ])

        performance_df = pd.DataFrame.from_dict(summary_rows, orient='index')

        print("\nSegment Performance Summary:")
        print(performance_df.sort_values('test_r2', ascending=False))

        return performance_df

    def get_segment_predictions(self, segment_name, X_new):
        """Get predictions for new data using a trained segment model.

        Raises:
            ValueError: If no model has been trained for ``segment_name``.
        """
        if segment_name not in self.segment_results:
            raise ValueError(f"No trained model found for segment: {segment_name}")

        model = self.segment_results[segment_name]['model']
        return model.predict(X_new)

    def get_feature_importance(self, segment_name):
        """Get feature importance analysis for a specific segment."""
        return self.analyze_feature_importance(segment_name)

    def get_segment_metrics(self, segment_name):
        """Get detailed performance metrics for a specific segment.

        Raises:
            ValueError: If no results are stored for ``segment_name``.
        """
        if segment_name not in self.segment_results:
            raise ValueError(f"No results found for segment: {segment_name}")

        results = self.segment_results[segment_name]
        return {
            'train_metrics': results['train_metrics'],
            'test_metrics': results['test_metrics'],
            'cv_scores': results['cv_scores'],
            'cv_mean': results['cv_mean'],
            'cv_std': results['cv_std']
        }

# Training entry point: load the training CSV, preprocess it, and fit one
# model per segment. ``insurance_model`` is left as a module-level name so
# that the prediction cell further down can reuse the trained models.
if __name__ == "__main__":
    train = pd.read_csv('train.csv')
    # preprocess_data returns a 2-tuple; the second element is not needed here.
    processed_df, _ = preprocess_data(train)
    # Constructing the model computes the segment masks and prints their distribution.
    insurance_model = InsuranceSegmentModel(processed_df)
    # Fits every sufficiently large segment and returns the summary DataFrame.
    performance_summary = insurance_model.train_all_segments()



Segment Distribution:
High_Value_Property: 59,844 (5.0%)
Low_Risk_Premium: 153,743 (12.8%)
Healthy_Professional: 234,873 (19.6%)
Family_Premium: 128,426 (10.7%)
Basic_Coverage: 278,995 (23.2%)
Default_Segment: 591,853 (49.3%)

Total records: 1,200,000
Total assigned: 1,447,734
Records per segment on average: 241,289.0

Processing High_Value_Property segment...

Segment Length 59844...
Train R2: 0.0293
Test R2: 0.0226

Prediction Accuracy:
Mean Absolute Error: 712.71
Median Absolute Error: 593.72
Mean % Error: 337.67%
RMSE: 917.61
CV Mean R2: 0.0218 (+/- 0.0023)

Processing Low_Risk_Premium segment...

Segment Length 153743...
Train R2: 0.0198
Test R2: 0.0202

Prediction Accuracy:
Mean Absolute Error: 676.79
Median Absolute Error: 548.09
Mean % Error: 311.28%
RMSE: 876.70
CV Mean R2: 0.0195 (+/- 0.0017)

Processing Healthy_Professional segment...

Segment Length 234873...
Train R2: 0.0131
Test R2: 0.0099

Prediction Accuracy:
Mean Absolute Error: 687.66
Median Absolute Error: 573.00
Me

In [2]:
def segment_test_data(df):
    """Split ``df`` into one sub-DataFrame per segment.

    The masks come from ``segment_data`` (which also prints the segment
    distribution); each mask is applied to ``df`` to obtain the actual rows.
    A row appears in every segment whose mask selects it.

    Args:
        df: Preprocessed DataFrame to segment.

    Returns:
        dict mapping segment name to the rows of ``df`` in that segment.
    """
    masks = segment_data(df)
    return {segment_name: df[row_mask] for segment_name, row_mask in masks.items()}

def predict_and_export(test_df, model, output_file='predicted_premiums.csv'):
    """Predict premiums for ``test_df`` segment by segment and write a CSV.

    Rows that fall into several segments receive the mean of their
    per-segment predictions. A segment whose prediction raises (for example
    because no model was trained for it) is reported on stdout and skipped,
    so rows covered only by such segments are absent from the output.

    Args:
        test_df: Raw test data, passed through ``preprocess_data``.
        model: Object exposing ``get_segment_predictions(segment_name, X)``.
        output_file: Destination CSV path.
    """
    processed, ids = preprocess_data(test_df)

    # Generate segments for the test data using the standalone function
    segment_frames = segment_test_data(processed)

    id_prediction_pairs = []
    for segment_name, frame in segment_frames.items():
        # Nothing to predict for an empty segment.
        if frame.empty:
            continue

        # NOTE(review): assumes ``ids`` can be indexed by the processed
        # frame's index labels - confirm against preprocess_data.
        segment_ids = ids[frame.index]

        # Best effort per segment: report the failure and move on.
        try:
            segment_predictions = model.get_segment_predictions(segment_name, frame)
            id_prediction_pairs.extend(zip(segment_ids, segment_predictions))
        except Exception as e:
            print(f"Error processing segment {segment_name}: {e}")

    # One row per (id, segment) prediction
    raw_predictions = pd.DataFrame(id_prediction_pairs, columns=['id', 'Premium Amount'])

    # Average the premium amounts for IDs with multiple entries
    per_id_mean = raw_predictions.groupby('id')['Premium Amount'].mean()
    averaged = per_id_mean.reset_index()

    # Save the averaged results to CSV
    averaged.to_csv(output_file, index=False)
    print(f"Predictions exported to {output_file}.")

# Prediction entry point: score the test CSV with the already trained
# per-segment models and write the submission file.
if __name__ == "__main__":
    # Load the test dataset
    test = pd.read_csv('test.csv')

    # Relies on `insurance_model` having been created and trained by the
    # training cell earlier in this notebook; it is not defined in this cell.
    predict_and_export(test, insurance_model, 'predicted_premiums.csv')


Segment Distribution:
High_Value_Property: 39,416 (4.9%)
Low_Risk_Premium: 102,125 (12.8%)
Healthy_Professional: 154,566 (19.3%)
Family_Premium: 85,681 (10.7%)
Basic_Coverage: 186,554 (23.3%)
Default_Segment: 394,765 (49.3%)

Total records: 800,000
Total assigned: 963,107
Records per segment on average: 160,517.8
Predictions exported to predicted_premiums.csv.
