In [None]:
import numpy as np
import pandas as pd 
import time
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import gc

In [None]:
# Display every column when a DataFrame is rendered (None removes the column limit)
pd.set_option('display.max_columns', None)

# Load and preprocess data

The dataset was loaded with optimized data types to reduce memory usage, and unnecessary columns, such as the 'id' column, were removed. This step ensures efficient data processing for further analysis, which is crucial for handling large datasets in real-world applications, where memory optimization is key for smooth operation.


In [None]:
start_time = time.time()

# Compact dtypes requested from read_csv: the low-cardinality fields are read
# as pandas categoricals, the numeric fields as narrow ints / float32.
dtypes = {
    **dict.fromkeys(
        [
            'Gender',
            'Driving_License',
            'Previously_Insured',
            'Vehicle_Age',
            'Vehicle_Damage',
            'Region_Code',
            'Policy_Sales_Channel',
        ],
        'category',
    ),
    'Age': 'int8',
    'Vintage': 'int16',
    'Annual_Premium': 'float32',
}

train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv', dtype=dtypes)
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv', dtype=dtypes)

# Strip stray whitespace from the header names of both frames
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

# The row identifier is not a feature: drop it from train, and move it out of
# test into `test_ids` so it can be attached to the submission later.
train = train.drop(columns=['id'])
test_ids = test.pop('id')

elapsed_time = time.time() - start_time

print(f"Time load: {elapsed_time:.2f} seconds")

# Feature engineering and preprocessing

Categorical variables, like `Vehicle_Age`, were mapped to integer codes (preserving their natural order where one exists) so that the model can consume them as numeric inputs.

The train and test sets were temporarily combined to apply uniform transformations across both datasets. 

Proper feature engineering improves model performance by providing structured, meaningful data for the model to learn from.

The feature engineering approach in this notebook is adapted from the work that Khang and I did together in our team's [EDA_Classification_Insurance](https://www.kaggle.com/code/khangtran94vn/eda-classification-insurance/notebook) notebook.

In [None]:
start_time = time.time()

# Give test a placeholder target so both frames share the same columns
test['Response'] = 0
test['Response'] = test['Response'].astype('int64')

# Stack train on top of test so every transformation sees both sets
cb = pd.concat([train, test]) #combined

# Ordinal encodings: each label is replaced by its integer rank
vehicle_age_order = {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}
vehicle_damage = {'No':0, 'Yes':1}
for col, mapping in (("Vehicle_Age", vehicle_age_order), ("Vehicle_Damage", vehicle_damage)):
    as_category = cb[col].astype('category')
    as_category = as_category.cat.set_categories(mapping.keys())
    cb[col] = as_category.cat.rename_categories(mapping).astype('int8')

# Label-encode every categorical column, keeping one fitted encoder per column
categorical_columns = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    cb[col] = encoder.fit_transform(cb[col])
    label_encoders[col] = encoder

# Convert datatypes to save memory
compact_dtypes = {
    'Gender': 'int8',
    'Vehicle_Age': 'int8',
    'Vehicle_Damage': 'int8',
    'Driving_License': 'int8',
    'Previously_Insured': 'int8',
    'Region_Code': 'int8',
    'Policy_Sales_Channel': 'int16',
}
for col, compact in compact_dtypes.items():
    cb[col] = cb[col].astype(compact)

# Interaction features: integer id of each (Previously_Insured, partner) value pair
for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
    joined = cb['Previously_Insured'].astype(str) + cb[partner].astype(str)
    cb[f'Insured_{partner}'] = pd.factorize(joined)[0]

# Split the combined frame back at the original train length
n_train = len(train)
train = cb.iloc[:n_train]
test = cb.iloc[n_train:].drop(columns=['Response'])

del cb
gc.collect()
train.head()

elapsed_time = time.time() - start_time

print(f"Time load: {elapsed_time:.2f} seconds")

# Modeling

### Cross-Validation
Class imbalance in the target variable was addressed using RandomUnderSampler to create a balanced view of both classes. 

This approach prevents model bias toward the majority class and improves generalization to the minority class, which is critical when dealing with imbalanced data often found in real-world scenarios.

### Model Evaluation Using ROC-AUC
Stratified K-Fold cross-validation with XGBoost was employed to ensure consistent class distribution across folds. 

Cross-validation trains the model on different subsets of the data and scores it on the held-out part each time, which exposes overfitting and provides a more reliable estimate of the model's performance on unseen data.

In [None]:
# Define training function with cross-validation and bagging
def train_cv(X, y, n_splits=10, n_bags=8):
    """Train bagged XGBoost classifiers with stratified K-fold cross-validation.

    For every fold, ``n_bags`` models are trained. Each bag draws its own random
    undersample of the fold's training part (sampler seed ``42 + bag``), fits a
    ``RobustScaler`` on that sample and trains an ``XGBClassifier`` on the scaled
    data. The bag probabilities for the held-out part are averaged into the
    fold prediction, which is also stored as the out-of-fold prediction.

    Progress, per-bag AUC, per-fold aggregated AUC, the out-of-fold AUC and the
    elapsed time are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Binary target aligned with ``X``; rows are selected with ``.iloc``.
    n_splits : int, default 10
        Number of stratified folds.
    n_bags : int, default 8
        Number of undersampled models trained per fold.

    Returns
    -------
    tuple of (list, list)
        The models and their fitted scalers from the fold whose aggregated
        AUC was the highest.

    Raises
    ------
    ValueError
        If ``n_bags`` is smaller than 1.
    """
    if n_bags < 1:
        raise ValueError("n_bags must be at least 1")

    # NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in XGBoost >= 2.0
    # in favour of tree_method='hist' with device='cuda' — confirm against the
    # installed version before changing.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.18346215360183396,
        'n_estimators': 2413,
        'max_depth': 6,
        'max_bin': 93101,
        'reg_alpha': 1.4244811420186621e-06,
        'reg_lambda': 1.1488485490280707,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'seed': 42,
        'verbosity': 0
    }

    start_time = time.time()

    # Start below any reachable AUC so the first fold always becomes the
    # incumbent; the returned lists are therefore always bound.
    best_auc = float('-inf')
    best_models = []
    best_scalers = []
    oof_preds = np.zeros(len(y))

    scores = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        print(f"Fold #{fold + 1}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        fold_predictions = np.zeros(len(y_test))

        fold_models = []
        fold_scalers = []

        for bag in range(n_bags):
            print(f"Bag #{bag + 1}")

            # Each bag gets a differently seeded undersample so the bags see
            # different subsets of the over-represented class.
            undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42+bag)
            X_sampled, y_sampled = undersampler.fit_resample(X_train, y_train)

            # Feature scaling: fitted on the sampled training rows only, then
            # applied unchanged to the held-out rows.
            scaler = RobustScaler()
            X_sampled_scaled = scaler.fit_transform(X_sampled)
            X_test_scaled = scaler.transform(X_test)

            model = xgb.XGBClassifier(**params)
            model.fit(X_sampled_scaled, y_sampled,
                      eval_set=[(X_test_scaled, y_test)],
                      verbose=False)

            y_pred = model.predict_proba(X_test_scaled)[:, 1]
            fold_predictions += y_pred / n_bags

            bag_auc = roc_auc_score(y_test, y_pred)

            print(f"ROC-AUC-Score={bag_auc:.7f}")

            scores.append(bag_auc)
            fold_models.append(model)
            fold_scalers.append(scaler)

            # Release the scaled copies before the next bag allocates its own
            del X_sampled_scaled, X_test_scaled
            gc.collect()

        oof_preds[test_index] = fold_predictions

        # AUC of the bag-averaged prediction for this fold
        fold_auc = roc_auc_score(y_test, fold_predictions)
        print(f"Aggregated AUC score for Fold #{fold + 1}: {fold_auc:.7f}")

        if fold_auc > best_auc:
            best_auc = fold_auc
            best_models = fold_models
            best_scalers = fold_scalers

    # The out-of-fold AUC is computed once and reported under both labels
    overall_auc = roc_auc_score(y, oof_preds)
    print(f"Overall AUC score: {overall_auc:.7f}")

    bag_scores = np.array(scores)
    print(f"ROC-AUC mean: {bag_scores.mean():.7f} (+- {bag_scores.std():.7f})")

    print(f"Total ROC-AUC score: {overall_auc}")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time load: {elapsed_time:.2f} seconds")
    return best_models, best_scalers

## Ensemble Model Prediction

The `ensemble_predict` function averages predictions from multiple models using their respective scalers. It processes the test data, scales it, and computes the mean prediction probabilities across the models. The final predictions are returned.

- **X and y split**: Separates features (X) and target (y) from the training dataset.
- **Best models and scalers**: Trains the models with cross-validation, finding the best models and scalers.

In [None]:
def ensemble_predict(test, models, scalers):
    """Average the positive-class probabilities of several scaled models.

    Each model is paired with the scaler at the same position: the scaler
    transforms ``test`` and the model's ``predict_proba`` column 1 is added
    to a running sum, which is finally divided by the number of models.

    Parameters
    ----------
    test : array-like
        Rows to score; passed to every scaler's ``transform``.
    models : sequence
        Fitted estimators exposing ``predict_proba``.
    scalers : sequence
        Fitted transformers exposing ``transform``, one per model.

    Returns
    -------
    numpy.ndarray
        One averaged probability per row of ``test``.

    Raises
    ------
    ValueError
        If ``models`` is empty, or if ``models`` and ``scalers`` differ in
        length (``zip`` would otherwise drop the surplus silently while the
        sum was still divided by ``len(models)``).
    """
    models = list(models)
    scalers = list(scalers)
    if not models:
        raise ValueError("ensemble_predict requires at least one model")
    if len(models) != len(scalers):
        raise ValueError(
            f"got {len(models)} models but {len(scalers)} scalers; they must be paired one-to-one"
        )

    preds = np.zeros(len(test))
    for model, scaler in zip(models, scalers):
        test_scaled = scaler.transform(test)
        preds += model.predict_proba(test_scaled)[:, 1]
    final_preds = preds / len(models)
    return final_preds

In [None]:
# Features: every training column except the target
X = train.drop(columns=['Response'])
# Target: the 'Response' column that train_cv learns to predict
y = train['Response']

In [None]:
# Run cross-validated training with the default fold/bag counts and keep the
# models and scalers it returns for predicting on the test set.
best_models, best_scalers = train_cv(X, y)

# Submission

In [None]:
# Score the test set with the models and scalers returned by train_cv
final_preds = ensemble_predict(test, best_models, best_scalers)

# Pair every test id with its averaged predicted probability
submission = pd.DataFrame({'id': test_ids})
submission['Response'] = final_preds

# save the submission file
submission.to_csv('submission.csv', index=False)
print('submission ready!')

In [None]:
# Preview the first ten rows of the submission as a sanity check
submission.head(10)