# Import libraries

In [1]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.simplefilter('ignore')

# Load train and test datasets

In [2]:
train = pl.read_csv('train.csv')
train.head()
test = pl.read_csv('test.csv')
test.head()

id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
i64,str,i64,i64,f64,i64,str,str,f64,f64,i64,i64
0,"""Male""",21,1,35.0,0,"""1-2 Year""","""Yes""",65101.0,124.0,187,0
1,"""Male""",43,1,28.0,0,"""> 2 Years""","""Yes""",58911.0,26.0,288,1
2,"""Female""",25,1,14.0,1,"""< 1 Year""","""No""",38043.0,152.0,254,0
3,"""Female""",35,1,1.0,0,"""1-2 Year""","""Yes""",2630.0,156.0,76,0
4,"""Female""",36,1,15.0,1,"""1-2 Year""","""No""",31951.0,152.0,294,0


# Add 'Response' column to test set with default value 0 for concatenation

In [4]:
test = test.with_columns(pl.lit(0).cast(pl.Int64).alias('Response'))

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Previously_Insured_Annual_Premium,Previously_Insured_Vehicle_Age,Previously_Insured_Vehicle_Damage,Previously_Insured_Vintage
0,0,0,21,1,35,0,1,1,65101,124,187,0,0,0,0,0
1,1,0,43,1,28,0,2,1,58911,26,288,1,1,1,0,1
2,2,1,25,1,14,1,0,0,38043,152,254,0,2,2,1,2
3,3,1,35,1,1,0,1,1,2630,156,76,0,3,0,0,3
4,4,1,36,1,15,1,1,0,31951,152,294,0,4,3,1,4


# Concatenate train and test datasets for preprocessing

In [None]:
df = pl.concat([train, test])

# Data preprocessing: Encode categorical variables and cast columns to appropriate types

In [None]:

df = df.with_columns([
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(int),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(int),
    pl.col('Policy_Sales_Channel').cast(int)
])

# Feature engineering: Create new features by combining existing ones and applying factorization

In [None]:

df = df.with_columns([
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Annual_Premium'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Annual_Premium'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Age'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vehicle_Age'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Damage'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vehicle_Damage'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vintage'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vintage')
])

# Split the processed dataframe back into train and test sets

In [None]:
train = df[:train.shape[0]].to_pandas()
test = df[train.shape[0]:].to_pandas()

train.head()

# Prepare features and target variable for training

In [5]:
X_train = train.drop(['id', 'Response'], axis=1)
y_train = train['Response']
X_test = test[X_train.columns]
submission = test[['id']]

0

# Clean up memory

In [None]:
del train, test, df
gc.collect()

# Define Stratified K-Fold cross-validation

In [None]:
cv = StratifiedKFold(4, shuffle=True, random_state=0)
cv_splits = cv.split(X_train, y_train)

# Initialize a list to store test predictions

In [None]:
test_preds = list()

# Define CatBoost parameters

In [None]:
params = {
    'nan_mode': 'Min',
    'gpu_ram_part': 0.85,
    'eval_metric': 'AUC',
    'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
    'FeatureFreq:CtrBorderCount=15:CtrBorderType=Median:Prior=0/1'],
    'iterations': 2500,
    'fold_permutation_block': 64,
    'leaf_estimation_method': 'Newton',
    'od_pval': 0,
    'random_score_type': 'NormalWithModelSizeDecrease',
    'counter_calc_method': 'SkipTest',
    'grow_policy': 'SymmetricTree',
    'penalties_coefficient': 1,
    'boosting_type': 'Plain',
    'ctr_history_unit': 'Sample',
    'feature_border_type': 'GreedyLogSum',
    'one_hot_max_size': 2,
    'devices': '-1',
    'eval_fraction': 0,
    'l2_leaf_reg': 0.5,
    'random_strength': 0,
    'od_type': 'Iter',
    'rsm': 1,
    'boost_from_average': False,
    'gpu_cat_features_storage': 'GpuRam',
    'max_ctr_complexity': 4,
    'model_size_reg': 0.5,
    'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
    'FeatureFreq:CtrBorderCount=15:CtrBorderType=MinEntropy:Prior=0/1'],
    'use_best_model': True,
    'od_wait': 200,
    'class_names': [0, 1],
    'random_seed': 42,
    'depth': 9,
    'ctr_target_border_count': 1,
    'has_time': False,
    'border_count': 128,
    'data_partition': 'FeatureParallel',
    'bagging_temperature': 1,
    'classes_count': 0,
    'auto_class_weights': 'None',
    'leaf_estimation_backtracking': 'AnyImprovement',
    'best_model_min_trees': 1,
    'min_data_in_leaf': 1,
    'loss_function': 'Logloss',
    'learning_rate': 0.075,
    'score_function': 'Cosine',
    'task_type': 'GPU',
    'leaf_estimation_iterations': 10,
    'bootstrap_type': 'Bayesian',
    'max_leaves': 512,
}

# Train CatBoost model using Stratified K-Fold cross-validation and clean up memory

In [6]:
for i, (train_idx, val_idx) in enumerate(cv_splits):
    model = CatBoostClassifier(**params, verbose=False)
    X_train_fold, X_val_fold = X_train.loc[train_idx], X_train.loc[val_idx]
    y_train_fold, y_val_fold = y_train.loc[train_idx], y_train.loc[val_idx]
    X_train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=X_val_fold.columns.values)
    X_test_pool = Pool(X_test[X_train.columns], cat_features=X_test.columns.values)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=250, early_stopping_rounds=200)
    test_pred = model.predict_proba(X_test_pool)[:, 1]
    test_preds.append(test_pred)
    
    del X_train_fold, X_val_fold, y_train_fold, y_val_fold
    del X_train_pool, X_valid_pool, X_test_pool
    del model, test_pred
    gc.collect()
    print(f'Fold {i+1} finished.')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8767587	best: 0.8767587 (0)	total: 2.11s	remaining: 1h 27m 56s
250:	test: 0.9171066	best: 0.9171066 (250)	total: 8m 9s	remaining: 1h 13m 1s
500:	test: 0.9218410	best: 0.9218410 (500)	total: 16m 11s	remaining: 1h 4m 37s
750:	test: 0.9242518	best: 0.9242518 (750)	total: 24m 12s	remaining: 56m 23s
1000:	test: 0.9257380	best: 0.9257700 (999)	total: 32m 20s	remaining: 48m 25s
1250:	test: 0.9262079	best: 0.9262220 (1244)	total: 40m 24s	remaining: 40m 20s
1500:	test: 0.9265363	best: 0.9265556 (1423)	total: 48m 23s	remaining: 32m 12s
1750:	test: 0.9271720	best: 0.9271740 (1742)	total: 56m 20s	remaining: 24m 5s
2000:	test: 0.9274070	best: 0.9274070 (2000)	total: 1h 4m 24s	remaining: 16m 3s
2250:	test: 0.9279224	best: 0.9279224 (2250)	total: 1h 12m 28s	remaining: 8m
2499:	test: 0.9283029	best: 0.9283187 (2491)	total: 1h 20m 22s	remaining: 0us
bestTest = 0.9283186793
bestIteration = 2491
Shrink model to first 2492 iterations.
Fold 1 finished.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8773128	best: 0.8773128 (0)	total: 2.17s	remaining: 1h 30m 29s
250:	test: 0.9174850	best: 0.9174850 (250)	total: 8m 13s	remaining: 1h 13m 38s
500:	test: 0.9226437	best: 0.9226437 (500)	total: 16m 17s	remaining: 1h 5m
750:	test: 0.9249237	best: 0.9249263 (748)	total: 24m 18s	remaining: 56m 36s
1000:	test: 0.9259652	best: 0.9259686 (942)	total: 32m 18s	remaining: 48m 22s
1250:	test: 0.9264453	best: 0.9264484 (1241)	total: 40m 22s	remaining: 40m 18s
1500:	test: 0.9271277	best: 0.9271347 (1499)	total: 48m 26s	remaining: 32m 14s
1750:	test: 0.9274547	best: 0.9274552 (1738)	total: 56m 30s	remaining: 24m 10s
2000:	test: 0.9277995	best: 0.9277996 (1975)	total: 1h 4m 31s	remaining: 16m 5s
2250:	test: 0.9281549	best: 0.9281549 (2250)	total: 1h 12m 28s	remaining: 8m
2499:	test: 0.9284413	best: 0.9284627 (2494)	total: 1h 20m 2s	remaining: 0us
bestTest = 0.9284626842
bestIteration = 2494
Shrink model to first 2495 iterations.
Fold 2 finished.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8771950	best: 0.8771950 (0)	total: 2.16s	remaining: 1h 30m 2s
250:	test: 0.9171047	best: 0.9171047 (250)	total: 8m 18s	remaining: 1h 14m 25s
500:	test: 0.9225286	best: 0.9225286 (500)	total: 16m 29s	remaining: 1h 5m 47s
750:	test: 0.9253376	best: 0.9253376 (750)	total: 24m 36s	remaining: 57m 19s
1000:	test: 0.9263925	best: 0.9263925 (1000)	total: 32m 47s	remaining: 49m 6s
1250:	test: 0.9269596	best: 0.9269642 (1245)	total: 40m 53s	remaining: 40m 49s
1500:	test: 0.9274687	best: 0.9274740 (1497)	total: 48m 51s	remaining: 32m 31s
1750:	test: 0.9280935	best: 0.9280935 (1750)	total: 56m 39s	remaining: 24m 14s
2000:	test: 0.9282690	best: 0.9282879 (1979)	total: 1h 4m 28s	remaining: 16m 4s
2250:	test: 0.9285079	best: 0.9285116 (2246)	total: 1h 12m 32s	remaining: 8m 1s
2499:	test: 0.9289747	best: 0.9289770 (2497)	total: 1h 19m 58s	remaining: 0us
bestTest = 0.928976953
bestIteration = 2497
Shrink model to first 2498 iterations.
Fold 3 finished.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8773500	best: 0.8773500 (0)	total: 2.11s	remaining: 1h 28m 3s
250:	test: 0.9166902	best: 0.9166902 (250)	total: 7m 30s	remaining: 1h 7m 18s
500:	test: 0.9229822	best: 0.9229861 (491)	total: 14m 52s	remaining: 59m 22s
750:	test: 0.9251425	best: 0.9251667 (741)	total: 22m 7s	remaining: 51m 30s
1000:	test: 0.9262666	best: 0.9262666 (1000)	total: 29m 26s	remaining: 44m 5s
1250:	test: 0.9265789	best: 0.9265928 (1122)	total: 36m 46s	remaining: 36m 42s
1500:	test: 0.9271724	best: 0.9271724 (1500)	total: 44m 2s	remaining: 29m 18s
1750:	test: 0.9273695	best: 0.9273763 (1747)	total: 51m 24s	remaining: 21m 59s
2000:	test: 0.9275915	best: 0.9275915 (2000)	total: 58m 41s	remaining: 14m 38s
2250:	test: 0.9280373	best: 0.9280373 (2250)	total: 1h 6m 10s	remaining: 7m 19s
2499:	test: 0.9284146	best: 0.9284238 (2437)	total: 1h 13m 4s	remaining: 0us
bestTest = 0.9284237623
bestIteration = 2437
Shrink model to first 2438 iterations.
Fold 4 finished.


# Average predictions across all folds and save submission file

In [7]:
submission['Response'] = np.mean(test_preds, axis=0)
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,id,Response
0,11504798,0.005421
1,11504799,0.678243
2,11504800,0.244888
3,11504801,7.9e-05
4,11504802,0.272695
5,11504803,5.8e-05
6,11504804,0.103942
7,11504805,0.003222
8,11504806,4e-05
9,11504807,0.000184
