In [1]:
import pandas as pd
import numpy as np
import os
import setup_path
from src.preprocess import preprocess_data
from src.segment_data import segment_data
from src.model import define_model
from src.training import train_all_segments


# Load the training data. The data directory is resolved relative to the
# kernel's current working directory (one level up, then into ``data``).
script_directory = os.getcwd()
data_directory = os.path.join(script_directory, '..', 'data')

train = pd.read_csv(os.path.join(data_directory, "train.csv"))
# preprocess_data returns a pair; the second element is not needed for training.
processed_df, _ = preprocess_data(train)


In [2]:
# Build the segment definitions for the preprocessed training frame.
# NOTE: the printed "Total assigned" is larger than "Total records", so a
# record can fall into more than one segment.
segments = segment_data(processed_df)
# Presumably the per-segment model configuration; its contents are defined in
# src.model and are not visible from this notebook.
segment_configs = define_model()

# Train on every segment. segment_results is reused by the prediction cell
# further down; performance_df is not used again in the visible cells.
segment_results, performance_df = train_all_segments(processed_df, segments, segment_configs)


Segment Distribution:
High_Value_Property: 59,844 (5.0%)
Low_Risk_Premium: 153,743 (12.8%)
Healthy_Professional: 234,873 (19.6%)
Family_Premium: 128,403 (10.7%)
Basic_Coverage: 278,963 (23.2%)
Default_Segment: 591,878 (49.3%)

Total records: 1,200,000
Total assigned: 1,447,704
Records per segment on average: 241,284.0

Processing High_Value_Property segment...
Segment Length 59844...
Train R2: 0.0299
Test R2: 0.0212
MAE: 713.12
Median AE: 592.45
MAPE: 338.04%
RMSE: 918.23
CV R2: 0.0217 (+/- 0.0036)

Processing Low_Risk_Premium segment...
Segment Length 153743...
Train R2: 0.0192
Test R2: 0.0201
MAE: 677.10
Median AE: 548.52
MAPE: 311.74%
RMSE: 876.73
CV R2: 0.0191 (+/- 0.0015)

Processing Healthy_Professional segment...
Segment Length 234873...
Train R2: 0.0131
Test R2: 0.0096
MAE: 687.60
Median AE: 573.20
MAPE: 302.91%
RMSE: 887.50
CV R2: 0.0102 (+/- 0.0011)

Processing Family_Premium segment...
Segment Length 128403...
Train R2: 0.0134
Test R2: 0.0122
MAE: 670.47
Median AE: 553.99
M

In [3]:
def segment_test_data(df, segment_function):
    """
    Split ``df`` into one sub-frame per segment.

    ``segment_function`` is called once with ``df``. Every value of the mapping
    it returns is used to index ``df``, and the selected rows are stored under
    the same key.

    Parameters:
        df (pd.DataFrame): The data to split
        segment_function (callable): Returns a mapping of segment name to a
            selector that can be used as ``df[selector]``

    Returns:
        dict: Segment name mapped to the selected part of ``df``
    """
    selectors = segment_function(df)

    frames_by_segment = {}
    for segment_name, selector in selectors.items():
        frames_by_segment[segment_name] = df[selector]

    return frames_by_segment


def predict_and_export(test_df, test_ids, segment_results, segment_function, output_file='predicted_premiums.csv'):
    """
    Predict premiums for the test data with the per-segment models and export them.

    The test frame is split with ``segment_function``. Every non-empty segment
    that has an entry in ``segment_results`` is scored with that entry's
    ``'model'`` object. A record that belongs to several segments receives
    several predictions, which are averaged per ID before the CSV is written.

    Parameters:
        test_df (pd.DataFrame): The processed test data
        test_ids (pd.Series): Corresponding IDs for tracking; looked up by the
            index labels of ``test_df``
        segment_results (dict): Training results by segment name; each value
            must expose the fitted estimator under the ``'model'`` key
        segment_function (callable): Function to segment the data
        output_file (str): File name to export predictions

    Returns:
        None. The predictions are written to ``output_file`` with the columns
        ``id`` and ``Premium Amount``.
    """
    predictions = []

    # Segment test data
    segments = segment_test_data(test_df, segment_function)

    for segment_name, segment_df in segments.items():

        if segment_df.empty:
            continue

        try:
            segment_result = segment_results.get(segment_name)
            if segment_result is None:
                print(f"No trained model for segment: {segment_name}")
                continue

            segment_ids = test_ids.loc[segment_df.index]
            segment_preds = segment_result['model'].predict(segment_df)

            predictions.extend(zip(segment_ids, segment_preds))

        except Exception as e:
            # Deliberately best effort: a failing segment is reported and the
            # remaining segments are still scored.
            print(f"Error in segment {segment_name}: {e}")

    predictions_df = pd.DataFrame(predictions, columns=['id', 'Premium Amount'])

    # Average over duplicate IDs (records that fall into more than one segment).
    # Skipped when nothing was predicted, so the empty object-dtype frame is
    # exported as a header-only file instead of going through mean().
    if not predictions_df.empty:
        predictions_df = predictions_df.groupby('id', as_index=False)['Premium Amount'].mean()

    # Records whose segments had no model, or whose prediction failed, are not
    # part of the export; report them instead of dropping them silently.
    missing_count = int((~pd.Series(test_ids).isin(predictions_df['id'])).sum())
    if missing_count:
        print(f"Warning: {missing_count} test records received no prediction and are missing from {output_file}.")

    predictions_df.to_csv(output_file, index=False)
    print(f"Predictions exported to {output_file}.")


# Load test data
test_raw = pd.read_csv(os.path.join(data_directory, "test.csv"))

# Apply the same preprocessing as for the training data; here the second
# return value is kept and passed on as the IDs of the test records.
test_processed, test_ids = preprocess_data(test_raw)

# Run predictions
predict_and_export(
    test_df=test_processed,
    test_ids=test_ids,
    segment_results=segment_results,
    segment_function=segment_data,  # Same segmentation function used for training
    output_file='predicted_premiums.csv'
)



Segment Distribution:
High_Value_Property: 39,416 (4.9%)
Low_Risk_Premium: 102,125 (12.8%)
Healthy_Professional: 154,566 (19.3%)
Family_Premium: 85,709 (10.7%)
Basic_Coverage: 186,659 (23.3%)
Default_Segment: 394,798 (49.3%)

Total records: 800,000
Total assigned: 963,273
Records per segment on average: 160,545.5
Predictions exported to predicted_premiums.csv.
