In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

class InsuranceModeling:
    """Train, evaluate and explain regression models on an insurance dataset.

    Two prediction tasks are supported, each trained with linear regression,
    a random forest and XGBoost:

    * ``severity`` -- predicts ``TotalClaims`` on rows where it is > 0.
    * ``premium``  -- predicts ``TotalPremium`` on all rows.

    Attributes:
        data: DataFrame loaded from the CSV passed to ``__init__``.
        models: Fitted pipelines keyed ``"<task>_<model>"``, e.g.
            ``"severity_random_forest"`` or ``"premium_xgboost"``.
        results: Per-model metric dicts (``RMSE``, ``R2``, ``MAE``) using
            the same keys as ``models``.
    """

    def __init__(self, data_path):
        """Load the dataset from ``data_path`` and reset model/result stores."""
        self.data = pd.read_csv(data_path)
        self.models = {}
        self.results = {}

    def prepare_severity_data(self):
        """Return ``(X, y)`` for claim severity prediction.

        Only rows with ``TotalClaims > 0`` are kept. As a side effect the
        feature list is stored on ``self.severity_features``.
        """
        # Filter for policies with claims
        severity_data = self.data[self.data['TotalClaims'] > 0].copy()

        self.severity_features = [
            'Age', 'Gender', 'VehicleAge', 'VehicleValue', 'CoverageType',
            'Province', 'ZipCode', 'PreviousClaims', 'CreditScore'
        ]

        X = severity_data[self.severity_features]
        y = severity_data['TotalClaims']

        return X, y

    def prepare_premium_data(self):
        """Return ``(X, y)`` for premium prediction, using every row."""
        premium_features = [
            'Age', 'Gender', 'VehicleAge', 'VehicleValue', 'CoverageType',
            'Province', 'ZipCode', 'PreviousClaims', 'CreditScore',
            'DrivingExperience', 'AnnualMileage', 'VehicleType'
        ]

        X = self.data[premium_features]
        y = self.data['TotalPremium']

        return X, y

    @staticmethod
    def _split_feature_types(X):
        """Return ``(numeric_columns, categorical_columns)`` of ``X``.

        Shared by preprocessing and feature-name reconstruction so both
        always agree on which column goes to which transformer.
        """
        numeric_features = X.select_dtypes(include='number').columns
        categorical_features = X.select_dtypes(
            include=['object', 'category']
        ).columns
        return numeric_features, categorical_features

    def create_preprocessing_pipeline(self, X):
        """Create an unfitted preprocessing transformer based on dtypes of ``X``.

        Numeric columns are median-imputed and standardised; object/category
        columns are imputed with the constant ``'missing'`` and one-hot
        encoded (unknown categories are ignored at transform time).

        Returns:
            A new ``ColumnTransformer`` that always produces dense output.
        """
        numeric_features, categorical_features = self._split_feature_types(X)

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # The encoder's dense-output keyword differs between scikit-learn
        # releases (``sparse`` vs ``sparse_output``), so leave it at its
        # default and let the ColumnTransformer densify instead.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            sparse_threshold=0,  # always return a dense array
        )

        return preprocessor

    def _train_and_evaluate(self, X, y, model_type, estimators):
        """Fit one pipeline per estimator and evaluate them on a 30% hold-out.

        Args:
            X: Feature DataFrame.
            y: Target values.
            model_type: Task prefix (``'severity'`` or ``'premium'``) used to
                build the keys in ``self.models`` / ``self.results``.
            estimators: Mapping of short model name to unfitted regressor.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        for name, estimator in estimators.items():
            # Each pipeline gets its own preprocessor so that fitting one
            # pipeline never refits a transformer owned by another.
            pipeline = Pipeline(steps=[
                ('preprocessor', self.create_preprocessing_pipeline(X_train)),
                ('regressor', estimator)
            ])
            pipeline.fit(X_train, y_train)
            self.models[f'{model_type}_{name}'] = pipeline

        self.evaluate_models(X_test, y_test, model_type)

    def train_severity_models(self):
        """Train and evaluate the claim severity models.

        Fitted pipelines are stored under ``severity_<model>`` keys so that
        ``evaluate_models`` and ``analyze_feature_importance`` can tell them
        apart from the premium models.
        """
        X, y = self.prepare_severity_data()
        estimators = {
            'linear_regression': LinearRegression(),
            'random_forest': RandomForestRegressor(
                n_estimators=100, random_state=42, n_jobs=-1
            ),
            'xgboost': xgb.XGBRegressor(
                n_estimators=100, learning_rate=0.1, random_state=42
            ),
        }
        self._train_and_evaluate(X, y, 'severity', estimators)

    def train_premium_models(self):
        """Train and evaluate the premium models (keys ``premium_<model>``)."""
        X, y = self.prepare_premium_data()
        estimators = {
            'linear_regression': LinearRegression(),
            'random_forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'xgboost': xgb.XGBRegressor(
                n_estimators=100, learning_rate=0.1, random_state=42
            ),
        }
        self._train_and_evaluate(X, y, 'premium', estimators)

    def evaluate_models(self, X_test, y_test, model_type):
        """Compute RMSE, R2 and MAE for every stored model of ``model_type``.

        Models are selected by the ``"<model_type>_"`` key prefix; metrics
        are written to ``self.results`` under the model's key.
        """
        prefix = f'{model_type}_'
        for name, model in self.models.items():
            if not name.startswith(prefix):
                continue

            y_pred = model.predict(X_test)
            self.results[name] = {
                'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
                'R2': r2_score(y_test, y_pred),
                'MAE': mean_absolute_error(y_test, y_pred)
            }

    def analyze_feature_importance(self, model_type=None):
        """Explain the lowest-RMSE model with SHAP and save a summary plot.

        Args:
            model_type: Optional task prefix (``'severity'`` or
                ``'premium'``) restricting which evaluated models are
                considered. With ``None`` every entry in ``self.results``
                competes; note that RMSE values of different targets are
                then compared directly.

        Returns:
            DataFrame with columns ``feature`` and ``importance`` (mean
            absolute SHAP value) for the 10 most important features, or
            ``None`` when no model has been evaluated or the best model is
            not a random forest / XGBoost model.

        Side effects:
            Writes ``reports/shap_summary_<model key>.png``, creating the
            ``reports`` directory if needed.
        """
        import os

        candidates = [
            name for name in self.results
            if model_type is None or name.startswith(f'{model_type}_')
        ]
        if not candidates:
            return None

        best_model_name = min(
            candidates, key=lambda name: self.results[name]['RMSE']
        )

        # Only tree models are explained; bail out before doing any work.
        if 'random_forest' not in best_model_name and 'xgboost' not in best_model_name:
            return None

        best_model = self.models[best_model_name]

        if best_model_name.startswith('severity_'):
            X, _ = self.prepare_severity_data()
        else:
            X, _ = self.prepare_premium_data()

        preprocessor = best_model.named_steps['preprocessor']
        X_processed = preprocessor.transform(X)

        # Rebuild output column names in ColumnTransformer order: the 'num'
        # block first, then the one-hot expanded 'cat' block.
        numeric_features, categorical_features = self._split_feature_types(X)
        categorical_feature_names = []
        if len(categorical_features) > 0:
            cat_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
            categorical_feature_names = cat_encoder.get_feature_names_out(
                categorical_features
            )
        all_feature_names = list(numeric_features) + list(categorical_feature_names)

        model = best_model.named_steps['regressor']
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_processed)

        # Plot summary
        os.makedirs('reports', exist_ok=True)
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_processed, feature_names=all_feature_names,
                          show=False, max_display=10)
        plt.title(f'SHAP Summary Plot - {best_model_name}')
        plt.tight_layout()
        plt.savefig(f'reports/shap_summary_{best_model_name}.png', dpi=300)
        plt.close()

        shap_df = pd.DataFrame({
            'feature': all_feature_names,
            'importance': np.abs(shap_values).mean(axis=0)
        })
        return shap_df.nlargest(10, 'importance')

    def generate_model_report(self):
        """Return a Markdown report of model metrics and feature importance.

        The feature-importance section is only filled in when
        ``analyze_feature_importance`` returns a result.
        """
        parts = [
            "# Model Evaluation Report\n\n",
            "## Model Performance Comparison\n\n",
            "| Model | RMSE | R\u00b2 | MAE |\n",
            "|-------|------|----|-----|\n",
        ]

        for name, metrics in self.results.items():
            parts.append(
                f"| {name} | ${metrics['RMSE']:,.2f} | {metrics['R2']:.4f} "
                f"| ${metrics['MAE']:,.2f} |\n"
            )

        parts.append("\n## Feature Importance Analysis\n\n")
        top_features = self.analyze_feature_importance()

        if top_features is not None:
            parts.append("### Top 10 Most Influential Features\n\n")
            for rank, (_, row) in enumerate(top_features.iterrows(), start=1):
                parts.append(
                    f"{rank}. **{row['feature']}**: "
                    f"Importance = {row['importance']:.4f}\n"
                )

            # NOTE: the statements below are fixed text; they are not
            # derived from the SHAP values computed above.
            parts.append("\n### Business Implications:\n")
            parts.append("- **VehicleValue**: Higher vehicle values strongly predict larger claims\n")
            parts.append("- **Age**: Younger drivers show higher risk profiles\n")
            parts.append("- **PreviousClaims**: History of claims is a strong predictor of future claims\n")
            parts.append("- **CreditScore**: Lower credit scores correlate with higher risk\n")
            parts.append("- **VehicleAge**: Older vehicles have higher claim probabilities\n")

        return "".join(parts)