In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
import setup_path
from src.preprocess import preprocess_data
from src.segment_data import segment_data
from src.model import define_model


# Load the raw training data; the path is resolved relative to the notebook's working directory.
train = pd.read_csv('train.csv')
# preprocess_data returns a pair; only its first element is used in this notebook.
# NOTE(review): the discarded second element is presumably auxiliary preprocessing
# output -- confirm against src.preprocess.
processed_df, _ = preprocess_data(train)

In [None]:
# --- Helper Function Definitions ---

def initialize_metadata(df):
    """Collect the categorical feature columns of ``df`` and their observed levels.

    Parameters:
        df (pd.DataFrame): Frame whose ``object`` / ``category`` columns are
            treated as categorical features.

    Returns:
        tuple: ``(categorical_features, category_mappings)`` where
            ``categorical_features`` is the list of categorical column names
            (the target ``'Premium Amount'`` is excluded if it is one of them)
            and ``category_mappings`` maps each of those columns to the sorted
            list of its distinct non-missing values.
    """
    categorical_features = [
        col
        for col in df.select_dtypes(include=['object', 'category']).columns
        if col != 'Premium Amount'
    ]

    # Missing values are dropped before sorting: NaN/None cannot be ordered
    # against strings, so sorting the raw unique values raises TypeError as
    # soon as a categorical column contains a gap.
    category_mappings = {
        col: sorted(df[col].dropna().unique())
        for col in categorical_features
    }

    return categorical_features, category_mappings


def create_segment_pipeline(segment_name, config, categorical_features, category_mappings, numeric_features):
    """Build an unfitted preprocessing + regression pipeline for one segment.

    Parameters:
        segment_name (str): Name of the segment; accepted for interface
            symmetry with the other helpers but not used here.
        config (dict): Segment configuration; ``config['model']`` is the
            regressor prototype for this segment.
        categorical_features (list): Columns to one-hot encode.
        category_mappings (dict): Column name -> list of category levels, used
            to fix the encoder's output columns.
        numeric_features (list): Columns passed through unchanged.

    Returns:
        sklearn.pipeline.Pipeline: Steps ``'preprocessor'`` (ColumnTransformer)
            and ``'regressor'`` (a fresh clone of ``config['model']``).
    """
    # Imported locally so this helper brings its own dependency into scope.
    from sklearn.base import clone

    encoder_levels = [category_mappings[col] for col in categorical_features]

    categorical_transformer = OneHotEncoder(
        categories=encoder_levels,
        sparse_output=False,
        handle_unknown='ignore'
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        sparse_threshold=0
    )

    # Pipeline does not copy its steps. Without clone(), every segment that
    # resolves to the same config (e.g. the shared 'default' entry) would wrap
    # the very same estimator object, and fitting a later segment would
    # silently overwrite the model stored for an earlier one.
    regressor = clone(config['model'])

    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])


def evaluate_predictions(y_true, y_pred):
    """Compute regression metrics for one set of predictions.

    Parameters:
        y_true (array-like): Observed target values.
        y_pred (array-like): Predicted target values, in the same order.

    Returns:
        dict: Keys ``'r2_score'``, ``'mae'``, ``'rmse'``, ``'mape'`` (percent)
            and ``'median_ae'``. ``'mape'`` is averaged over the rows whose
            true value is non-zero and is NaN when there are none.
    """
    # Work on plain arrays so the arithmetic is positional and can never be
    # re-aligned on a pandas index.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(actual - predicted)

    # A zero target would make the percentage error infinite (or NaN) and
    # poison the mean, so those rows are left out of MAPE only.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(abs_error[nonzero] / np.abs(actual[nonzero])) * 100
    else:
        mape = np.nan

    return {
        'r2_score': r2_score(actual, predicted),
        'mae': mean_absolute_error(actual, predicted),
        'rmse': np.sqrt(mean_squared_error(actual, predicted)),
        'mape': mape,
        'median_ae': np.median(abs_error)
    }


def train_segment_model(X_seg, y_seg, segment_name, config, categorical_features, category_mappings):
    """Fit and evaluate the pipeline for a single segment.

    The segment is split 80/20 (``random_state=42``); the pipeline is fitted
    on the training part and scored on both parts. A 5-fold cross-validated
    R2 is additionally computed on the whole segment.

    Parameters:
        X_seg (pd.DataFrame): Feature rows of the segment.
        y_seg (pd.Series): Target values of the segment.
        segment_name (str): Segment name, forwarded to the pipeline factory.
        config (dict): Segment configuration, forwarded to the pipeline factory.
        categorical_features (list): Columns to one-hot encode.
        category_mappings (dict): Column name -> list of category levels.

    Returns:
        dict: ``'model'`` (fitted pipeline), ``'train_metrics'``,
            ``'test_metrics'``, ``'cv_scores'``, ``'cv_mean'``, ``'cv_std'``
            and ``'test_data'`` as an ``(X_test, y_test)`` pair.
    """
    # Only int64/float64 columns count as numeric features; the target is
    # excluded in case it is present among them.
    numeric_features = [
        col
        for col in X_seg.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if col != 'Premium Amount'
    ]

    pipeline = create_segment_pipeline(
        segment_name, config, categorical_features, category_mappings, numeric_features
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X_seg, y_seg, test_size=0.2, random_state=42
    )
    pipeline.fit(X_train, y_train)

    fitted_on_train = pipeline.predict(X_train)
    held_out = pipeline.predict(X_test)

    results = {
        'model': pipeline,
        'train_metrics': evaluate_predictions(y_train, fitted_on_train),
        'test_metrics': evaluate_predictions(y_test, held_out),
    }

    # Cross-validation runs over the complete segment, not just the training split.
    fold_scores = cross_val_score(
        pipeline, X_seg, y_seg, cv=5,
        scoring=make_scorer(r2_score), n_jobs=-1
    )

    results['cv_scores'] = fold_scores
    results['cv_mean'] = fold_scores.mean()
    results['cv_std'] = fold_scores.std()
    results['test_data'] = (X_test, y_test)
    return results


def analyze_feature_importance(model, categorical_features):
    """Return the feature importances of a fitted segment pipeline.

    Parameters:
        model: Fitted pipeline exposing ``named_steps['preprocessor']`` and
            ``named_steps['regressor']``. The preprocessor's first two
            transformers are read as the numeric and categorical ones.
        categorical_features (list): Unused; the categorical columns are read
            from the fitted preprocessor. Kept for interface compatibility.

    Returns:
        pd.DataFrame: Columns ``'feature'`` and ``'importance'``, sorted by
            importance in descending order.

    Raises:
        ValueError: If neither the regressor nor any of its sub-estimators
            exposes ``feature_importances_``.
    """
    preprocessor = model.named_steps['preprocessor']
    regressor = model.named_steps['regressor']

    # Numeric columns are passed through, so their names are kept as-is.
    feature_names = list(preprocessor.transformers_[0][2])

    cat_features = preprocessor.transformers_[1][2]
    if len(cat_features) > 0:
        encoder = preprocessor.named_transformers_['cat']
        if hasattr(encoder, 'get_feature_names_out'):
            feature_names.extend(encoder.get_feature_names_out(cat_features))

    if hasattr(regressor, 'feature_importances_'):
        importances = regressor.feature_importances_
    else:
        # Ensemble fallback: average the importances of the sub-estimators.
        # Fitted scikit-learn ensembles store bare estimators in
        # ``estimators_``; (name, estimator) pairs are accepted as well.
        member_importances = []
        for member in getattr(regressor, 'estimators_', []):
            estimator = member[1] if isinstance(member, tuple) else member
            if hasattr(estimator, 'feature_importances_'):
                member_importances.append(estimator.feature_importances_)

        if not member_importances:
            raise ValueError(
                "Regressor exposes no feature_importances_, directly or "
                "through its estimators_"
            )
        importances = np.mean(member_importances, axis=0)

    return pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)


def train_all_segments(df, segments, segment_configs, min_segment_size=100):
    """Train and evaluate one model per segment and summarise the results.

    Parameters:
        df (pd.DataFrame): Processed training data including the
            ``'Premium Amount'`` target column.
        segments (dict): Segment name -> row mask used to index ``df``.
        segment_configs (dict): Segment name -> model configuration. The
            ``'default'`` entry is used for segments without their own entry.
        min_segment_size (int): Segments with fewer rows than this are skipped.
            Defaults to 100.

    Returns:
        tuple: ``(segment_results, performance_df)`` where ``segment_results``
            maps each trained segment to the dict returned by
            ``train_segment_model`` and ``performance_df`` has one row per
            trained segment (empty when no segment was large enough).
    """
    categorical_features, category_mappings = initialize_metadata(df)
    target_col = 'Premium Amount'
    feature_cols = [col for col in df.columns if col != target_col]

    segment_results = {}
    segment_sizes = {}

    for name, mask in segments.items():

        print(f"\nProcessing {name} segment...")

        X_seg = df[feature_cols][mask]
        y_seg = df[target_col][mask]

        print(f"Segment Length {len(X_seg)}...")

        if len(X_seg) < min_segment_size:
            # Make the skip visible instead of dropping the segment silently.
            print(f"Skipping {name} segment: fewer than {min_segment_size} rows.")
            continue

        # Look up 'default' only when it is needed, so a config dict that
        # covers every segment does not have to define a 'default' entry.
        if name in segment_configs:
            config = segment_configs[name]
        else:
            config = segment_configs['default']

        result = train_segment_model(
            X_seg, y_seg, name, config, categorical_features, category_mappings
        )
        segment_results[name] = result
        segment_sizes[name] = len(X_seg)

        test_metrics = result['test_metrics']
        print(f"Train R2: {result['train_metrics']['r2_score']:.4f}")
        print(f"Test R2: {test_metrics['r2_score']:.4f}")
        print(f"MAE: {test_metrics['mae']:.2f}")
        print(f"Median AE: {test_metrics['median_ae']:.2f}")
        print(f"MAPE: {test_metrics['mape']:.2f}%")
        print(f"RMSE: {test_metrics['rmse']:.2f}")
        print(f"CV R2: {result['cv_mean']:.4f} (+/- {result['cv_std'] * 2:.4f})")

    performance_df = pd.DataFrame.from_dict({
        name: {
            'segment_size': segment_sizes[name],
            'train_r2': res['train_metrics']['r2_score'],
            'test_r2': res['test_metrics']['r2_score'],
            'cv_mean_r2': res['cv_mean'],
            'cv_std_r2': res['cv_std'],
            'mae': res['test_metrics']['mae'],
            'mape': res['test_metrics']['mape']
        }
        for name, res in segment_results.items()
    }, orient='index')

    print("\nSegment Performance Summary:")
    if performance_df.empty:
        # With no trained segment the frame has no 'test_r2' column and
        # sorting by it would raise KeyError.
        print("No segment met the minimum size; nothing was trained.")
    else:
        print(performance_df.sort_values('test_r2', ascending=False))

    return segment_results, performance_df


def get_segment_predictions(segment_results, segment_name, X_new):
    """Predict with the trained model of one segment.

    Parameters:
        segment_results (dict): Segment name -> result dict holding the fitted
            estimator under ``'model'``.
        segment_name (str): Segment whose model should be used.
        X_new: Feature data handed to the model's ``predict``.

    Returns:
        Whatever the segment model's ``predict`` returns for ``X_new``.

    Raises:
        ValueError: If ``segment_name`` has no entry in ``segment_results``.
    """
    if segment_name in segment_results:
        fitted = segment_results[segment_name]['model']
        return fitted.predict(X_new)

    raise ValueError(f"No model found for segment: {segment_name}")


def get_feature_importance(segment_results, segment_name, categorical_features):
    """Return the feature-importance table for one trained segment.

    Parameters:
        segment_results (dict): Segment name -> result dict holding the fitted
            pipeline under ``'model'``.
        segment_name (str): Segment to inspect.
        categorical_features (list): Forwarded to ``analyze_feature_importance``.

    Returns:
        The value returned by ``analyze_feature_importance`` for that segment's model.

    Raises:
        ValueError: If ``segment_name`` has no entry in ``segment_results``.
    """
    if segment_name in segment_results:
        fitted = segment_results[segment_name]['model']
        return analyze_feature_importance(fitted, categorical_features)

    raise ValueError(f"No model found for segment: {segment_name}")


def get_segment_metrics(segment_results, segment_name):
    """Return the evaluation metrics stored for one segment.

    Parameters:
        segment_results (dict): Segment name -> result dict.
        segment_name (str): Segment to look up.

    Returns:
        dict: The ``'train_metrics'``, ``'test_metrics'``, ``'cv_scores'``,
            ``'cv_mean'`` and ``'cv_std'`` entries of that segment's result.

    Raises:
        ValueError: If ``segment_name`` has no entry in ``segment_results``.
    """
    if segment_name not in segment_results:
        raise ValueError(f"No results found for segment: {segment_name}")

    stored = segment_results[segment_name]
    wanted = ('train_metrics', 'test_metrics', 'cv_scores', 'cv_mean', 'cv_std')

    # Only the metric entries are exposed; the model and test data stay behind.
    return {key: stored[key] for key in wanted}


# Categorical column names and their level lists for the processed training frame.
# train_all_segments derives the same metadata internally; these globals are kept
# for later cells that need the categorical feature list.
categorical_features, category_mappings = initialize_metadata(processed_df)

# Segment masks and per-segment model configuration from the project modules.
# NOTE(review): segment_data is assumed to return {segment name: row mask}, since
# train_all_segments indexes the frame with each value -- confirm in src.segment_data.
segments = segment_data(processed_df)
segment_configs = define_model()

# Train and evaluate every sufficiently large segment; prints per-segment metrics
# and returns the fitted results plus a summary table.
segment_results, performance_df = train_all_segments(processed_df, segments, segment_configs)

In [None]:
def segment_test_data(df, segment_function):
    """Split ``df`` into per-segment frames.

    Parameters:
        df (pd.DataFrame): Data to split.
        segment_function (callable): Called with ``df``; must return a mapping
            of segment name -> row mask.

    Returns:
        dict: Segment name -> the rows of ``df`` selected by that segment's mask.
    """
    partitions = {}
    for key, mask in segment_function(df).items():
        partitions[key] = df[mask]
    return partitions


def predict_and_export(test_df, test_ids, segment_results, segment_function, categorical_features, output_file='predicted_premiums.csv'):
    """
    Score each test segment with its trained model and write the predictions to CSV.

    Segments without a trained model are reported and skipped. An error while
    scoring a segment is printed and the remaining segments are still processed.
    When an id receives more than one prediction, the exported value is their mean.

    Parameters:
        test_df (pd.DataFrame): The processed test data
        test_ids (pd.Series): IDs, looked up by the index labels of ``test_df``
        segment_results (dict): Segment name -> result dict with the fitted
            estimator under ``'model'``
        segment_function (callable): Function that segments the data
        categorical_features (list): Not used by this function
        output_file (str): Path of the CSV file to write
    """
    id_prediction_pairs = []

    # Split the test data into per-segment frames
    per_segment = segment_test_data(test_df, segment_function)

    for segment_name, segment_df in per_segment.items():
        if segment_df.empty:
            continue

        try:
            entry = segment_results.get(segment_name)
            if entry is None:
                print(f"No trained model for segment: {segment_name}")
                continue

            ids_for_rows = test_ids.loc[segment_df.index]
            preds_for_rows = entry['model'].predict(segment_df)
            id_prediction_pairs.extend(zip(ids_for_rows, preds_for_rows))

        except Exception as e:
            print(f"Error in segment {segment_name}: {e}")

    predictions_df = pd.DataFrame(id_prediction_pairs, columns=['id', 'Premium Amount'])

    # Collapse repeated ids into a single averaged prediction
    predictions_df = predictions_df.groupby('id', as_index=False)['Premium Amount'].mean()

    predictions_df.to_csv(output_file, index=False)
    print(f"Predictions exported to {output_file}.")
