In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import pyarrow as pa
import utils
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score


import pyarrow.csv as csv

# Data preparation
For details on the feature engineering, see the `utils.py` file.

In [2]:
# Build the train and test frames; the feature engineering itself lives in utils.py.
# NOTE(review): the captured output of this cell shows a pandas SettingWithCopyWarning
# raised by an in-place `train_df.drop(..., inplace=True)`, apparently inside utils —
# worth fixing there (e.g. by working on an explicit copy).
train_df, test_df = utils.get_train_test_df()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(const_columns_to_remove, axis=1, inplace=True)


In [29]:
# Disabled alternative: load the train/test frames from CSV files with pyarrow
# instead of calling utils.get_train_test_df().
# NOTE(review): presumably train.csv / test.csv were exported by an earlier run —
# confirm they still exist, or delete this cell.
# train_df = csv.read_csv("train.csv").to_pandas().set_index("id").drop(columns=['index'])
# test_df = csv.read_csv("test.csv").to_pandas().set_index("id")

In [30]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out a stratified 20% of the labelled rows for the final score.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns='target'),
    train_df['target'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['target'],
)
# Stage 2: split a stratified 10% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train,
)

In [34]:
# Keyword arguments unpacked into `model.fit(**fit_params)` further down.
fit_params = dict(
    X=X_train,
    y=y_train,
    eval_set=(X_val, y_val),  # validation split passed to the model during fitting
    verbose=100,  # training log is printed every 100 iterations
)

Class weights were originally `[1, 84]` (the inverse frequency of the positive class), but that setting performed poorly. I then used Optuna to tune the positive-class weight, which gave a value of 25.

In [71]:
# Base CatBoost settings shared by the tuning objective and the final model.
# (numpy is already imported as `np` in the notebook's first cell, so it is not
# re-imported here.)
best_params = {
    "used_ram_limit": "8gb",
    "random_seed": 42,
    # Weight of 25 for the positive class was found with Optuna; the
    # inverse-frequency weighting [1, 84] performed poorly.
    "class_weights": np.array([1, 25]),
    "use_best_model": True,
    "iterations": 2000,
    "l2_leaf_reg": 24,
    "random_strength": 3,
    "max_depth": 6,
    "early_stopping_rounds": 300,
}


# Tuning the model

In [72]:
import optuna

def objective(trial):
    """Optuna objective: validation F1 for a sampled ``colsample_bylevel``.

    Trains a CatBoostClassifier with ``best_params`` plus the sampled value,
    fitting with ``fit_params`` (logging silenced), and returns the F1 score of
    its predictions on ``X_val`` against ``y_val``.

    NOTE(review): ``fit_params`` passes the same (X_val, y_val) as ``eval_set``,
    so the returned score is measured on data the training run already monitored.
    """
    sampled = {
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0, log=True),
    }

    # Merge into fresh dicts so the notebook-level settings stay untouched.
    model_kwargs = {**best_params, **sampled}
    fit_kwargs = {**fit_params, "verbose": False}

    classifier = CatBoostClassifier(**model_kwargs)
    classifier.fit(**fit_kwargs)

    return f1_score(y_val, classifier.predict(X_val))

In [None]:
# Seed the sampler explicitly: with the default (unseeded) sampler every run
# proposes different trial values, so the search could not be reproduced after
# a kernel restart.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_trial.value)

[I 2024-04-20 17:17:12,223] A new study created in memory with name: no-name-3f1b3e1b-aba3-49fc-9e5b-42bca2b3c4c1
[I 2024-04-20 17:21:26,927] Trial 0 finished with value: 0.09627216793340572 and parameters: {'colsample_bylevel': 0.8502932391455729}. Best is trial 0 with value: 0.09627216793340572.
[I 2024-04-20 17:25:46,382] Trial 1 finished with value: 0.09520322226290737 and parameters: {'colsample_bylevel': 0.9535079716279686}. Best is trial 0 with value: 0.09627216793340572.
[I 2024-04-20 17:29:25,680] Trial 2 finished with value: 0.09996297667530545 and parameters: {'colsample_bylevel': 0.9993499988691655}. Best is trial 2 with value: 0.09996297667530545.


# Predicting and uploading

In [70]:
# `f1_score` is already imported in the notebook's first cell, so it is not
# re-imported here.

# Fit the final model with the chosen hyper-parameters; `fit_params` supplies
# the training data and the (X_val, y_val) eval_set.
model = CatBoostClassifier(**best_params)

model.fit(**fit_params)

# Score on the hold-out split, which is not part of `fit_params`.
y_pred = model.predict(X_test)

f1_score(y_test, y_pred)

{'used_ram_limit': '8gb', 'random_seed': 42, 'class_weights': array([ 1, 25]), 'use_best_model': True, 'iterations': 2000, 'l2_leaf_reg': 18.76, 'random_strength': 2.48, 'max_depth': 6, 'early_stopping_rounds': 300}
0:	learn: 0.6824314	test: 0.6824504	best: 0.6824504 (0)	total: 266ms	remaining: 8m 51s
100:	learn: 0.4860507	test: 0.4843743	best: 0.4843743 (100)	total: 16.5s	remaining: 5m 9s
200:	learn: 0.4697739	test: 0.4683842	best: 0.4683842 (200)	total: 33.9s	remaining: 5m 3s
300:	learn: 0.4626710	test: 0.4622902	best: 0.4622902 (300)	total: 50.9s	remaining: 4m 47s
400:	learn: 0.4571546	test: 0.4584277	best: 0.4584156 (399)	total: 1m 5s	remaining: 4m 22s
500:	learn: 0.4520463	test: 0.4549623	best: 0.4549559 (499)	total: 1m 16s	remaining: 3m 50s
600:	learn: 0.4458166	test: 0.4517730	best: 0.4517730 (600)	total: 1m 26s	remaining: 3m 21s
700:	learn: 0.4408390	test: 0.4499125	best: 0.4498969 (696)	total: 1m 37s	remaining: 2m 59s
800:	learn: 0.4364273	test: 0.4487998	best: 0.4487898 (799)

0.09378663540445487

In [59]:
# NOTE(review): `upload_from_test` computes its own predictions from the frame it
# is given, so this notebook-level `preds` looks redundant unless it is inspected
# elsewhere — confirm before removing.
preds = model.predict(test_df)

In [60]:
def upload_from_test(test_df, name, estimator=None):
    """Predict targets for ``test_df`` and write them to a CSV file.

    Args:
        test_df: Feature frame to score; its index values become the ``id`` column.
        name: Path of the CSV file to write.
        estimator: Optional fitted model exposing ``predict``. When omitted, the
            notebook-level ``model`` is used, which keeps existing two-argument
            calls working unchanged.

    The written file has exactly two columns, ``id`` and ``target``, and no
    index column.
    """
    # Resolve the model lazily so the global is only required when no estimator
    # is passed in explicitly.
    clf = model if estimator is None else estimator

    # Flatten so that an (n, 1) prediction array still forms a single column.
    preds = np.asarray(clf.predict(test_df)).ravel()

    submission = pd.DataFrame({"id": test_df.index.values, "target": preds})
    submission.to_csv(name, index=False)

In [61]:
# Write the predictions for `test_df` to final.csv.
upload_from_test(test_df, "final.csv")