# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer

def preprocess_insurance_data(df, reference_date=None):
    """Clean a raw insurance DataFrame and split off its ``id`` column.

    The input frame is copied first, so the caller's frame is not modified.
    Steps, in order:

    1. Remove the ``id`` column (returned separately).
    2. Replace +/-infinity with NaN.
    3. Replace ``Policy Start Date`` with ``Days_Active``, the number of whole
       days between the policy start and ``reference_date``.
    4. Impute numeric columns with their median and object columns with
       their mode. The statistics are computed from ``df`` itself.

    NaN counts per column are printed before and after imputation.

    Args:
        df: Frame containing at least ``id`` and ``Policy Start Date``.
        reference_date: Date that ``Days_Active`` is measured against
            (anything ``pd.Timestamp`` accepts). Defaults to today at
            midnight. Pass a fixed date to get identical feature values when
            training and scoring happen on different days.

    Returns:
        Tuple ``(processed_df, ids)`` where ``ids`` is the original ``id``
        column, indexed like ``processed_df``.
    """
    # Work on a copy so the caller's frame is untouched
    df = df.copy()

    # Keep the identifiers for the caller, then drop them from the features
    ids = df['id'].copy()
    df.drop('id', axis=1, inplace=True)

    # Replace infinite values with NaN globally so they get imputed below
    df.replace([float('inf'), float('-inf')], np.nan, inplace=True)

    # Convert Policy Start Date to datetime
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])

    # Anchor date for Days_Active; normalize() sets the time to midnight
    if reference_date is None:
        current_date = pd.Timestamp.now().normalize()
    else:
        current_date = pd.Timestamp(reference_date).normalize()

    # Calculate days active, then drop the raw date column
    df['Days_Active'] = (current_date - df['Policy Start Date']).dt.days
    df.drop('Policy Start Date', axis=1, inplace=True)

    # Print NaN status before imputation
    print("\nNaN values before imputation:")
    print(df.isna().sum())

    # np.number covers every numeric width, not only int64/float64
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # For numeric columns: use median imputation
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())

    # For categorical columns: use mode imputation
    for col in categorical_cols:
        modes = df[col].mode()
        # An all-missing column has no mode; leave it as NaN instead of
        # failing on modes[0].
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Print NaN status after imputation
    print("\nNaN values after imputation:")
    print(df.isna().sum())

    return df, ids

def segment_data(df, reference_df=None):
    """Build boolean row masks for each customer segment.

    Segments are rule based and may overlap: a row can belong to several
    segments. Rows matching none of the rules are collected in
    ``Default_Segment``. A per-segment size report is printed.

    Args:
        df: Frame to segment. Must contain the columns referenced by the
            rules below (``Credit Score``, ``Health Score``, ``Age``, ...).
        reference_df: Optional frame the quantile thresholds are computed
            from. Defaults to ``df`` itself. Passing the training frame when
            segmenting new data keeps the cut-offs identical to the ones
            used at training time.

    Returns:
        Dict mapping segment name to a boolean ``pd.Series`` aligned to
        ``df.index``; ``Default_Segment`` is always the last key.
    """
    ref = df if reference_df is None else reference_df

    def q(column, prob):
        # Quantile threshold taken from the reference frame
        return ref[column].quantile(prob)

    credit = df['Credit Score']
    health = df['Health Score']
    duration = df['Insurance Duration']
    age = df['Age']
    income = df['Annual Income']
    location = df['Location']
    policy = df['Policy Type']

    segments = {
        'Low_Risk_Premium': (
            (credit > q('Credit Score', 0.5)) &
            (health > q('Health Score', 0.5)) &
            (duration > 1)
        ),
        'High_Risk_Premium': (
            (df['Previous Claims'] >= 2) &
            ((credit < q('Credit Score', 0.3)) |
             (health < q('Health Score', 0.3)))
        ),
        'Young_Urban_Professional': (
            (age <= q('Age', 0.4)) &
            (location == 'Urban') &
            (income > q('Annual Income', 0.4)) &
            (duration > 0)
        ),
        'Family_Suburban': (
            (df['Number of Dependents'] >= 1) &
            (location.isin(['Suburban', 'Rural'])) &
            (df['Marital Status'] == 'Married') &
            (duration > 0)
        ),
        'Senior_Stable': (
            (age >= q('Age', 0.6)) &
            (duration > q('Insurance Duration', 0.4)) &
            (credit > q('Credit Score', 0.4))
        ),
        'Budget_Basic': (
            (income <= q('Annual Income', 0.4)) &
            (policy.isin(['Basic', 'Standard'])) &
            (duration >= 0)
        ),
        'Premium_Healthy': (
            (policy.isin(['Premium', 'Standard'])) &
            (df['Exercise Frequency'].isin(['Daily', 'Weekly', 'Monthly'])) &
            (health > q('Health Score', 0.6)) &
            (income > q('Annual Income', 0.4))
        ),
        'High_Value_Property': (
            (df['Property Type'].isin(['House', 'Condo'])) &
            (income >= q('Annual Income', 0.6)) &
            (policy.isin(['Premium', 'Standard'])) &
            (credit > q('Credit Score', 0.5))
        )
    }

    # Union of all rule-based masks, kept as a Series so the default mask has
    # the same type and index alignment as every other mask.
    assigned_mask = pd.Series(False, index=df.index)
    for mask in segments.values():
        assigned_mask = assigned_mask | mask

    # Create a default segment for unassigned data
    segments['Default_Segment'] = ~assigned_mask

    # Print distribution for debugging
    total_records = len(df)
    print("\nSegment Distribution:")
    total_assigned = 0
    for name, mask in segments.items():
        segment_size = mask.sum()
        total_assigned += segment_size
        # Guard the empty-frame case instead of dividing by zero
        percentage = (segment_size / total_records) * 100 if total_records else 0.0
        print(f"{name}: {segment_size:,} ({percentage:.1f}%)")

    # Additional debug info. Segments overlap, so "Total assigned" is the sum
    # of segment sizes and can exceed the number of records.
    print(f"\nTotal records: {total_records:,}")
    print(f"Total assigned: {total_assigned:,}")
    print(f"Records per segment on average: {total_assigned/len(segments):,.1f}")

    return segments

class InsuranceSegmentModel:
    """Train and query one regression pipeline per customer segment.

    The constructor computes segment masks with ``segment_data`` and stores a
    table of estimator templates keyed by segment name; segments without a
    dedicated entry use the ``'default'`` template. ``train_all_segments``
    fits one preprocessing + regressor pipeline per sufficiently large
    segment and stores the results in ``segment_results``.
    """

    # Segments with fewer rows than this are skipped by train_all_segments.
    MIN_SEGMENT_SIZE = 100

    def __init__(self, df):
        """Initialize the model with a DataFrame.

        Args:
            df: Preprocessed frame containing the feature columns and the
                ``Premium Amount`` target. It is stored by reference.
        """
        self.df = df
        self.processed_df = None
        self.segment_results = {}
        self.cv_results = {}

        self.segments = segment_data(df)

        # Estimator templates. They are cloned per segment in
        # create_segment_pipeline and never fitted directly.
        self.segment_configs = {
            'High_Value_Property': {
                'model': StackingRegressor(
                    estimators=[
                        ('rf', RandomForestRegressor(
                            n_estimators=80,  # Reduced complexity
                            max_depth=6,       # Lower depth to prevent overfitting
                            min_samples_leaf=20,
                            random_state=42
                        )),
                        ('gbm', GradientBoostingRegressor(
                            n_estimators=80,  # Reduced complexity
                            learning_rate=0.08,  # Slightly lower to improve generalization
                            max_depth=4,  # Lower depth to prevent overfitting
                            random_state=42
                        ))
                    ],
                    final_estimator=LassoCV(cv=5, random_state=42),  # Increased CV folds
                    cv=5,  # Increased CV folds
                    n_jobs=-1
                ),
            },
            'Budget_Basic': {
                'model': RandomForestRegressor(  # Adjusted parameters for possibly better generalization
                    n_estimators=200,
                    max_depth=10,  # Slightly more depth to capture more complex patterns
                    min_samples_leaf=25,  # Allowing more fine-grained leaf nodes
                    n_jobs=-1,
                    random_state=42
                ),
            },
            'High_Risk_Premium': {
                'model': GradientBoostingRegressor(  # Adjust parameters to improve fitting
                    n_estimators=120,  # More estimators for better learning
                    learning_rate=0.08,  # Slightly lower to improve stability
                    max_depth=5,  # Increased depth to capture more complex relationships
                    random_state=42
                ),
            },
            'default': {
                'model': RandomForestRegressor(
                    n_estimators=120,  # Slightly more estimators
                    max_depth=10,  # Increased depth to potentially improve model capture
                    min_samples_leaf=25,  # Smaller leaf size for better granularity
                    n_jobs=-1,
                    random_state=42
                ),
            }
        }

    def create_segment_pipeline(self, segment_name):
        """Create an unfitted preprocessing + regressor pipeline for a segment.

        Numeric columns are median-imputed and standardized; object/category
        columns are filled with ``'missing'`` and one-hot encoded. The
        regressor is a fresh clone of the segment's template (or of the
        ``'default'`` template when the segment has no dedicated entry).

        Args:
            segment_name: Name used to look up the estimator template.

        Returns:
            An unfitted ``Pipeline`` with steps ``preprocessor`` and
            ``regressor``.
        """
        # Local import: clone is only needed here and the file's import
        # block does not provide it.
        from sklearn.base import clone

        config = self.segment_configs.get(segment_name, self.segment_configs['default'])

        # Define feature groups based on available columns
        numeric_features = [col for col in self.df.select_dtypes(include=['int64', 'float64']).columns
                            if col != 'Premium Amount']

        categorical_features = [col for col in self.df.select_dtypes(include=['object', 'category']).columns
                                if col != 'Premium Amount']

        print(f"\nFeatures for {segment_name}:")
        print("Numeric features:", numeric_features)
        print("Categorical features:", categorical_features)

        # Create transformers
        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        # Create preprocessor
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            sparse_threshold=0
        )

        # Clone the template so segments sharing a config (notably 'default')
        # do not share, and overwrite, one fitted estimator.
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', clone(config['model']))
        ])

        # Report the transformed feature names from a throw-away clone of the
        # preprocessor only: a pipeline ending in a regressor cannot provide
        # get_feature_names_out, and the regressor must not be fitted here.
        feature_names = clone(preprocessor).fit(self.df).get_feature_names_out()
        print("\nTransformed feature names:", feature_names)
        print("Total features:", len(feature_names))

        return pipeline

    def evaluate_predictions(self, y_true, y_pred):
        """Calculate comprehensive evaluation metrics.

        Args:
            y_true: Observed target values.
            y_pred: Predicted target values, same length as ``y_true``.

        Returns:
            Dict with ``r2_score``, ``mae``, ``rmse``, ``mape`` (percent) and
            ``median_ae``. ``mape`` is computed over rows whose true value is
            non-zero and is NaN when there are none, so a zero target cannot
            turn it into infinity.
        """
        true_values = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        abs_error = np.abs(true_values - predicted)

        nonzero = true_values != 0
        if nonzero.any():
            mape = np.mean(abs_error[nonzero] / np.abs(true_values[nonzero])) * 100
        else:
            mape = np.nan

        return {
            'r2_score': r2_score(y_true, y_pred),
            'mae': mean_absolute_error(y_true, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'mape': mape,
            'median_ae': np.median(abs_error)
        }

    def train_segment_model(self, X_seg, y_seg, segment_name):
        """Train and evaluate a model for a specific customer segment.

        The segment is split 80/20; the pipeline is fitted on the training
        part and scored on both parts. A separate 5-fold cross-validation
        (R2) is run on the whole segment.

        Args:
            X_seg: Feature rows of the segment.
            y_seg: Target values of the segment.
            segment_name: Segment name, used to pick the estimator template.

        Returns:
            Dict with the fitted ``model``, ``train_metrics``,
            ``test_metrics``, ``cv_scores``, ``cv_mean``, ``cv_std`` and the
            held-out ``test_data`` tuple ``(X_test, y_test)``.
        """
        # Create pipeline
        pipeline = self.create_segment_pipeline(segment_name)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_seg, y_seg, test_size=0.2, random_state=42
        )

        # Train model
        pipeline.fit(X_train, y_train)

        # Make predictions
        train_predictions = pipeline.predict(X_train)
        test_predictions = pipeline.predict(X_test)

        # Calculate metrics
        train_metrics = self.evaluate_predictions(y_train, train_predictions)
        test_metrics = self.evaluate_predictions(y_test, test_predictions)

        # Cross-validation fits clones, so the pipeline fitted above is
        # left unchanged.
        cv_scores = cross_val_score(
            pipeline, X_seg, y_seg,
            cv=5,
            scoring=make_scorer(r2_score),
            n_jobs=-1
        )

        results = {
            'model': pipeline,
            'train_metrics': train_metrics,
            'test_metrics': test_metrics,
            'cv_scores': cv_scores,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'test_data': (X_test, y_test)
        }

        return results

    def analyze_feature_importance(self, segment_name):
        """Analyze feature importance for a specific segment.

        Args:
            segment_name: Name of a trained segment.

        Returns:
            DataFrame with columns ``feature`` and ``importance`` sorted by
            descending importance, or ``None`` when the segment has no
            trained model or its regressor exposes no importances.
        """
        if segment_name not in self.segment_results:
            return None

        results = self.segment_results[segment_name]
        model = results['model']
        preprocessor = model.named_steps['preprocessor']
        regressor = model.named_steps['regressor']

        # Feature names after preprocessing: numeric columns first, then the
        # one-hot encoded categorical columns.
        feature_names = []

        num_features = preprocessor.transformers_[0][2]
        feature_names.extend(num_features)

        cat_features = preprocessor.transformers_[1][2]
        if len(cat_features) > 0:
            encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
            if hasattr(encoder, 'get_feature_names_out'):
                encoded_features = encoder.get_feature_names_out(cat_features)
                feature_names.extend(encoded_features)

        # Get feature importances
        if hasattr(regressor, 'feature_importances_'):
            importances = regressor.feature_importances_
        else:
            # For a stacking regressor, average the base estimators.
            # estimators_ holds the fitted estimators themselves, not
            # (name, estimator) pairs.
            base_importances = [
                est.feature_importances_
                for est in getattr(regressor, 'estimators_', [])
                if hasattr(est, 'feature_importances_')
            ]
            if not base_importances:
                return None
            importances = np.mean(base_importances, axis=0)

        return pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

    def train_all_segments(self):
        """Train models for all segments and generate a performance summary.

        Segments with fewer than ``MIN_SEGMENT_SIZE`` rows are skipped.
        Per-segment metrics and the final summary are printed.

        Returns:
            DataFrame indexed by segment name with ``segment_size``,
            ``train_r2``, ``test_r2``, ``cv_mean_r2``, ``cv_std_r2``, ``mae``
            and ``mape``. It is empty when no segment was trained.
        """
        # Define feature and target columns
        target_col = 'Premium Amount'
        feature_cols = [col for col in self.df.columns if col != target_col]

        # Use the original dataframe directly
        self.processed_df = self.df.copy()

        # Train models for each segment
        for name, mask in self.segments.items():
            print(f"\nProcessing {name} segment...")
            X_seg = self.processed_df.loc[mask, feature_cols]
            y_seg = self.processed_df.loc[mask, target_col]
            print(f"\nSegment Length {len(X_seg)}...")

            if len(X_seg) < self.MIN_SEGMENT_SIZE:
                continue

            results = self.train_segment_model(X_seg, y_seg, name)
            self.segment_results[name] = results

            print(f"Train R2: {results['train_metrics']['r2_score']:.4f}")
            print(f"Test R2: {results['test_metrics']['r2_score']:.4f}")
            print("\nPrediction Accuracy:")
            print(f"Mean Absolute Error: {results['test_metrics']['mae']:.2f}")
            print(f"Median Absolute Error: {results['test_metrics']['median_ae']:.2f}")
            print(f"Mean % Error: {results['test_metrics']['mape']:.2f}%")
            print(f"RMSE: {results['test_metrics']['rmse']:.2f}")
            print(f"CV Mean R2: {results['cv_mean']:.4f} (+/- {results['cv_std']*2:.4f})")

        # Create performance summary. The size is read from each segment's
        # own mask, not from whatever mask the training loop ended on.
        summary = {
            name: {
                'segment_size': int(np.asarray(self.segments[name]).sum()),
                'train_r2': results['train_metrics']['r2_score'],
                'test_r2': results['test_metrics']['r2_score'],
                'cv_mean_r2': results['cv_mean'],
                'cv_std_r2': results['cv_std'],
                'mae': results['test_metrics']['mae'],
                'mape': results['test_metrics']['mape']
            }
            for name, results in self.segment_results.items()
        }
        performance_df = pd.DataFrame.from_dict(summary, orient='index')

        print("\nSegment Performance Summary:")
        if performance_df.empty:
            # Nothing was trained; sorting by 'test_r2' would raise KeyError
            print("No segment had enough rows to train a model.")
            return performance_df

        print(performance_df.sort_values('test_r2', ascending=False))

        return performance_df

    def get_segment_predictions(self, segment_name, X_new):
        """Get predictions for new data using a trained segment model.

        Args:
            segment_name: Name of a trained segment.
            X_new: Feature rows to score.

        Returns:
            The output of the segment pipeline's ``predict``.

        Raises:
            ValueError: If no model was trained for ``segment_name``.
        """
        if segment_name not in self.segment_results:
            raise ValueError(f"No trained model found for segment: {segment_name}")

        model = self.segment_results[segment_name]['model']
        return model.predict(X_new)

    def get_feature_importance(self, segment_name):
        """Get feature importance analysis for a specific segment.

        Thin alias for ``analyze_feature_importance``.
        """
        return self.analyze_feature_importance(segment_name)

    def get_segment_metrics(self, segment_name):
        """Get detailed performance metrics for a specific segment.

        Args:
            segment_name: Name of a trained segment.

        Returns:
            Dict with ``train_metrics``, ``test_metrics``, ``cv_scores``,
            ``cv_mean`` and ``cv_std``.

        Raises:
            ValueError: If no results exist for ``segment_name``.
        """
        if segment_name not in self.segment_results:
            raise ValueError(f"No results found for segment: {segment_name}")

        results = self.segment_results[segment_name]
        return {
            'train_metrics': results['train_metrics'],
            'test_metrics': results['test_metrics'],
            'cv_scores': results['cv_scores'],
            'cv_mean': results['cv_mean'],
            'cv_std': results['cv_std']
        }

if __name__ == "__main__":
    # Training entry point: load the training CSV, preprocess it, build the
    # per-segment model and train every segment.
    # These names are module-level; `insurance_model` is used again by the
    # prediction entry point further down.
    train = pd.read_csv('train.csv')
    # The id column returned as the second value is not needed for training
    processed_df, _ = preprocess_insurance_data(train)
    insurance_model = InsuranceSegmentModel(processed_df)
    performance_summary = insurance_model.train_all_segments()


# In [None]:
def segment_test_data(df):
    """Split ``df`` into one sub-frame per segment.

    The boolean masks produced by ``segment_data`` are applied to ``df``, so
    each value of the returned dict holds the actual rows of that segment
    (rows may appear in several segments).
    """
    masks_by_segment = segment_data(df)
    return {
        segment_name: df[row_mask]
        for segment_name, row_mask in masks_by_segment.items()
    }

def predict_and_export(test_df, model, output_file='predicted_premiums.csv'):
    """Predict premiums for ``test_df`` per segment and write them to CSV.

    The test frame is preprocessed and segmented, each non-empty segment is
    scored with the matching trained segment model, and predictions for ids
    that fall into several segments are averaged. Segments whose prediction
    fails (for example because no model was trained for them) are reported
    and skipped.

    Args:
        test_df: Raw test frame, including the ``id`` column.
        model: Object exposing ``get_segment_predictions(segment_name, X)``.
        output_file: Path of the CSV file to write.

    Returns:
        DataFrame with columns ``id`` and ``Premium Amount`` - the same
        content that was written to ``output_file``.
    """
    test_df_processed, test_ids = preprocess_insurance_data(test_df)

    # Generate segments for the test data using the standalone function
    segments = segment_test_data(test_df_processed)

    segment_frames = []
    for segment_name, test_segment in segments.items():
        if test_segment.empty:
            continue

        print(f"Segment {segment_name} processing with data:")
        print(test_segment.head())  # Check the first few rows to ensure data is present and correct

        # Best effort per segment: one failing segment must not abort the
        # whole export, but the failure is always reported.
        try:
            predicted_values = model.get_segment_predictions(segment_name, test_segment)
        except Exception as e:
            print(f"Error processing segment {segment_name}: {e}")
            continue

        segment_frames.append(pd.DataFrame({
            'id': test_ids.loc[test_segment.index].to_numpy(),
            'Premium Amount': predicted_values,
        }))

    if segment_frames:
        predictions_df = pd.concat(segment_frames, ignore_index=True)
    else:
        # Typed empty frame so the groupby/mean below stays well defined
        predictions_df = pd.DataFrame({
            'id': pd.Series(dtype=test_ids.dtype),
            'Premium Amount': pd.Series(dtype=float),
        })

    # Average the premium amounts for IDs with multiple entries
    predictions_df = predictions_df.groupby('id')['Premium Amount'].mean().reset_index()

    # Make dropped rows visible: ids whose segments all failed or had no
    # trained model are absent from the output.
    missing = ~test_ids.isin(predictions_df['id'])
    if missing.any():
        print(f"Warning: {int(missing.sum()):,} of {len(test_ids):,} ids received no prediction "
              f"and are missing from {output_file}.")

    # Save the averaged results to CSV
    predictions_df.to_csv(output_file, index=False)
    print(f"Predictions exported to {output_file}.")

    return predictions_df

if __name__ == "__main__":
    # Prediction entry point: score the test CSV and write the predictions.
    # Load the test dataset
    test = pd.read_csv('test.csv')
    # `insurance_model` is not defined in this block; it must already exist
    # at module level (it is created by the training entry point above).
    predict_and_export(test, insurance_model, 'predicted_premiums.csv')