In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder



import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e12/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e12/test.csv")

sample = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')

train.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True) 

In [3]:
def date(Df):

    Df['Policy Start Date'] = pd.to_datetime(Df['Policy Start Date'])
    Df['Year'] = Df['Policy Start Date'].dt.year
    Df['Day'] = Df['Policy Start Date'].dt.day
    Df['Month'] = Df['Policy Start Date'].dt.month
    Df['Month_name'] = Df['Policy Start Date'].dt.month_name()
    Df['Day_of_week'] = Df['Policy Start Date'].dt.day_name()
    Df['Week'] = Df['Policy Start Date'].dt.isocalendar().week
    Df['Year_sin'] = np.sin(2 * np.pi * Df['Year'])
    Df['Year_cos'] = np.cos(2 * np.pi * Df['Year'])
    Df['Month_sin'] = np.sin(2 * np.pi * Df['Month'] / 12) 
    Df['Month_cos'] = np.cos(2 * np.pi * Df['Month'] / 12)
    Df['Day_sin'] = np.sin(2 * np.pi * Df['Day'] / 31)  
    Df['Day_cos'] = np.cos(2 * np.pi * Df['Day'] / 31)
    Df['Group']=(Df['Year']-2020)*48+Df['Month']*4+Df['Day']//7
    
    Df.drop('Policy Start Date', axis=1, inplace=True)

    return Df

In [4]:
train = date(train)
test = date(test)

cat_cols = [col for col in train.columns if train[col].dtype == 'object']
feature_cols = list(test.columns)

In [5]:
class CategoricalEncoder:
    def __init__(self, train, test):
        self.train = train
        self.test = test

    def frequency_encode(self, cat_cols, feature_cols, drop_org=False):
        combined = pd.concat([self.train, self.test], axis=0, ignore_index=True)

        new_cat_cols = [] 
        for col in cat_cols:
            freq_encoding = combined[col].value_counts().to_dict()
            
            self.train[f"{col}_freq"] = self.train[col].map(freq_encoding).astype('float')
            self.test[f"{col}_freq"] = self.test[col].map(freq_encoding).astype('float')

            new_col_name = f"{col}_freq"
            new_cat_cols.append(new_col_name)
            feature_cols.append(new_col_name)
            if drop_org:
                feature_cols.remove(col)

        return self.train, self.test, new_cat_cols, feature_cols

In [6]:
encoder = CategoricalEncoder(train, test)
train, test, cat_cols, feature_cols = encoder.frequency_encode(cat_cols, feature_cols, drop_org=True)

train = train[feature_cols + ['Premium Amount']]
test = test[feature_cols]

In [7]:
train.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Year,Day,...,Occupation_freq,Location_freq,Policy Type_freq,Customer Feedback_freq,Smoking Status_freq,Exercise Frequency_freq,Property Type_freq,Month_name_freq,Day_of_week_freq,Premium Amount
0,19.0,10049.0,1.0,22.598761,2.0,17.0,372.0,5.0,2023,23,...,470636.0,663201.0,669475.0,625952.0,996268.0,510693.0,667500.0,162307.0,284861.0,2869.0
1,39.0,31678.0,3.0,15.569731,1.0,12.0,694.0,2.0,2023,12,...,,668067.0,665822.0,629122.0,1003732.0,498230.0,667500.0,164442.0,287191.0,1483.0
2,23.0,25602.0,3.0,47.177549,1.0,14.0,,3.0,2023,30,...,470636.0,668732.0,669475.0,614826.0,1003732.0,510693.0,667500.0,165556.0,284861.0,567.0
3,21.0,141855.0,2.0,10.938144,1.0,0.0,367.0,1.0,2024,12,...,,668067.0,664703.0,625952.0,1003732.0,491143.0,666022.0,164442.0,287424.0,765.0
4,21.0,39651.0,1.0,20.376094,0.0,8.0,598.0,4.0,2021,1,...,470636.0,668067.0,669475.0,625952.0,1003732.0,510693.0,667500.0,162307.0,287424.0,2022.0


In [8]:
print(train.isnull().sum())
print(test.isnull().sum())

Age                         18705
Annual Income               44949
Number of Dependents       109672
Health Score                74076
Previous Claims            364029
Vehicle Age                     6
Credit Score               137882
Insurance Duration              1
Year                            0
Day                             0
Month                           0
Week                            0
Year_sin                        0
Year_cos                        0
Month_sin                       0
Month_cos                       0
Day_sin                         0
Day_cos                         0
Group                           0
Gender_freq                     0
Marital Status_freq         18529
Education Level_freq            0
Occupation_freq            358075
Location_freq                   0
Policy Type_freq                0
Customer Feedback_freq      77824
Smoking Status_freq             0
Exercise Frequency_freq         0
Property Type_freq              0
Month_name_fre

In [9]:
train = train.fillna(-111)
test = test.fillna(-111)

In [10]:
print(train.isnull().sum())
print(test.isnull().sum())

Age                        0
Annual Income              0
Number of Dependents       0
Health Score               0
Previous Claims            0
Vehicle Age                0
Credit Score               0
Insurance Duration         0
Year                       0
Day                        0
Month                      0
Week                       0
Year_sin                   0
Year_cos                   0
Month_sin                  0
Month_cos                  0
Day_sin                    0
Day_cos                    0
Group                      0
Gender_freq                0
Marital Status_freq        0
Education Level_freq       0
Occupation_freq            0
Location_freq              0
Policy Type_freq           0
Customer Feedback_freq     0
Smoking Status_freq        0
Exercise Frequency_freq    0
Property Type_freq         0
Month_name_freq            0
Day_of_week_freq           0
Premium Amount             0
dtype: int64
Age                        0
Annual Income              0
N

In [11]:
X = train.drop('Premium Amount', axis=1)  
y = train['Premium Amount']

y_log = np.log1p(y)

In [12]:
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [15]:
def train_model():
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(X))
    models = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
        print(f"Fold {fold + 1}")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[valid_idx]

        model = CatBoostRegressor(
            iterations=3000,
            learning_rate=0.05,
            depth=6,
            eval_metric="RMSE",
            random_seed=42,
            verbose=200,
            task_type='CPU',
            l2_leaf_reg =  0.7,
        )
        
        model.fit(X_train,
                  y_train,
                  eval_set=(X_valid, y_valid), 
                  early_stopping_rounds=300,
                  # cat_features=cat_cols,
                 )
        models.append(model)
        oof[valid_idx] = np.maximum(0, model.predict(X_valid))
        fold_rmsle = rmsle(np.expm1(y_valid), np.expm1(oof[valid_idx]))
        print(f"Fold {fold + 1} RMSLE: {fold_rmsle}")
        
    return models, oof

In [16]:
models,oof = train_model()

Fold 1
0:	learn: 1.0926202	test: 1.0933956	best: 1.0933956 (0)	total: 178ms	remaining: 8m 54s
200:	learn: 1.0485380	test: 1.0500188	best: 1.0500188 (200)	total: 18.2s	remaining: 4m 13s
400:	learn: 1.0463214	test: 1.0485330	best: 1.0485330 (400)	total: 36.4s	remaining: 3m 55s
600:	learn: 1.0449229	test: 1.0481523	best: 1.0481523 (600)	total: 55.2s	remaining: 3m 40s
800:	learn: 1.0436774	test: 1.0479132	best: 1.0479063 (788)	total: 1m 13s	remaining: 3m 21s
1000:	learn: 1.0425603	test: 1.0478785	best: 1.0478713 (943)	total: 1m 31s	remaining: 3m 3s
1200:	learn: 1.0414412	test: 1.0478245	best: 1.0478221 (1187)	total: 1m 50s	remaining: 2m 45s
1400:	learn: 1.0403556	test: 1.0477725	best: 1.0477725 (1400)	total: 2m 8s	remaining: 2m 27s
1600:	learn: 1.0393160	test: 1.0477786	best: 1.0477640 (1416)	total: 2m 27s	remaining: 2m 8s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 1.047763986
bestIteration = 1416

Shrink model to first 1417 iterations.
Fold 1 RMSLE: 1.0477639869003

In [17]:
print(rmsle(y, np.expm1(oof)))

1.047020877791748


In [18]:
test_predictions = np.zeros(len(test))

for model in models:
    test_predictions += np.maximum(0, np.expm1(model.predict(test))) / len(models)


sample['Premium Amount'] = test_predictions
sample.to_csv('submission.csv', index = False)
sample.head()

Unnamed: 0,id,Premium Amount
0,1200000,800.096145
1,1200001,797.374358
2,1200002,788.860009
3,1200003,802.320247
4,1200004,747.522588
