# Libraries

In [None]:
%%capture
%pip install catboost

In [26]:
import pandas as pd
from catboost import CatBoostRegressor

from numpy.random import seed
# Seed NumPy's global random generator (numpy.random.seed) so any NumPy-based
# randomness in this notebook is repeatable. Note that the CatBoost model
# created further down is given its own random_state.
seed(7991)

# Read Data

In [None]:
# Load both competition files from the same relative data directory.
# The test file was previously read from the absolute path '/data/test.csv',
# which only resolves when a /data folder exists at the filesystem root and
# disagreed with the relative path used for the training file.
DATA_DIR = 'data'
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Preprocessing

In [28]:
def clean_column_names(df):
    """Normalise the column labels of ``df`` and return the same frame.

    Each label is stripped of surrounding whitespace, lower-cased, has its
    spaces replaced by underscores and has literal parentheses removed, e.g.
    ``' Premium Amount '`` -> ``'premium_amount'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are rewritten in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with renamed columns.
    """
    # regex=False makes every replacement a literal substring replacement.
    # Without it the behaviour depends on the pandas version's default, and a
    # lone '(' or ')' is not a valid regular expression.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
    return df

# Give the training and test frames the same normalised column labels.
df_train, df_test = [clean_column_names(frame) for frame in (df_train, df_test)]

# Data Cleaning

## Outlier Detection and Removal

In [29]:
# IQR rule on the target: a value is an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
# The quartiles are computed once and the boolean mask is reused for the
# removal, instead of recomputing each quantile several times and matching
# rows back by value.
premium = df_train['premium_amount']
q1 = premium.quantile(0.25)
q3 = premium.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

# Comparisons against NaN are False, so rows with a missing target are not
# flagged here (they are kept at this stage, exactly as before).
is_outlier = (premium < lower_fence) | (premium > upper_fence)
outliers = premium[is_outlier]

# Remove outliers
df_train = df_train[~is_outlier]

## Missing Values

In [30]:
# Keep only the training rows that have no missing value in any column.
df_train = df_train.dropna(axis=0, how='any')

# Convert categorical variables to category

In [31]:
# Categorical columns are detected on the training frame and the same list is
# applied to the test frame.
categorical_cols = df_train.select_dtypes(include=['object', 'category']).columns

# Positions passed to CatBoost must refer to the feature frame, i.e. the
# training columns WITHOUT the target, because that is what the model is
# fitted on.
feature_columns = df_train.columns.drop('premium_amount')

# Convert categorical columns to numeric representation
cat_features = []
for col in categorical_cols:
    # Replace missing values BEFORE casting to str; casting first would turn
    # NaN into the literal string 'nan' and make the fill a no-op.
    train_labels = df_train[col].astype(object).where(df_train[col].notna(), 'None').astype(str)
    test_labels = df_test[col].astype(object).where(df_test[col].notna(), 'None').astype(str)

    # Learn the label -> code mapping on the training data only ...
    train_codes, train_categories = train_labels.factorize()
    df_train[col] = train_codes
    # ... and reuse it for the test data so a given label gets the same code in
    # both frames. Labels never seen in training are encoded as -1.
    df_test[col] = pd.Categorical(test_labels, categories=train_categories).codes.astype('int64')

    cat_features.append(feature_columns.get_loc(col))

# Fitting

In [32]:
# Model training and prediction
# Model training and prediction
# Split the training frame into features and target explicitly.
X_train = df_train.drop('premium_amount', axis=1)
y_train = df_train['premium_amount']

# Score the test set on exactly the training feature columns, in the same
# order. This raises a KeyError if the test frame lacks a training column,
# rather than passing a differently shaped or ordered frame to the model.
X_test = df_test[X_train.columns]

model = CatBoostRegressor(cat_features=cat_features, random_state=42, verbose=False)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

# Prediction

In [33]:
# Build the two-column submission frame: the test 'id' column (which must be
# present in df_test) followed by the predicted 'Premium Amount'.
submission = df_test[['id']].assign(**{'Premium Amount': y_hat})

In [None]:
# Preview the first five rows of the submission as a sanity check.
submission.head(n=5)

In [37]:
# Write the submission file to the working directory, omitting the index.
submission.to_csv(path_or_buf='submission.csv', index=False)