In [1]:
import pandas as pd
import numpy as np
import polars as pl
import os
import catboost
import sklearn
from catboost import CatBoostRegressor, cv, Pool
from sklearn.model_selection import cross_val_score, KFold
from catboost import Pool


In [2]:
print(pd.__version__, np.__version__, pl.__version__, sklearn.__version__, catboost.__version__, sep="\n")

2.2.3
1.26.4
1.9.0
1.2.2
1.2.7


In [3]:
model = None  # var to store the trained model

In [21]:

print("Begin Training")

path_to_data = '~/projects/ait/data/'
# Load train data
print("Loading training data...")
train_data = pl.read_csv(f'{path_to_data}train.csv')
print("Training data loaded.")

# drop cols with missing values in target col
train_data = train_data.drop_nulls(subset=['utility_agent1'])
print("Removed rows with missing values in the target column.")
train_df = train_data.to_pandas()

# selecting numerical columns
numeric_cols = train_df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# fill missing values with mean in numerical columns
if numeric_cols:
    train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].mean())
print("Missing values filled with means in numerical columns.")

# Handle missing values in categorical columns
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    mode_series = train_df[col].mode().dropna()
    if not mode_series.empty:
        mode = mode_series.iloc[0]
    else:
        mode = 'missing'
    train_df[col] = train_df[col].fillna(mode)
    train_df[col] = train_df[col].astype(str)
print("Missing values in categorical columns handled.")

# Define the target variable and columns to drop
target = 'utility_agent1'
cols_to_drop = ['num_draws_agent1', 'num_losses_agent1', 'num_wins_agent1', target]

# separate attributes and target
X = train_df.drop(columns=cols_to_drop, axis=1)
y = train_df[target]
print("attributes and target separated.")

# ensure categorical columns are in X
categorical_cols = [col for col in categorical_cols if col in X.columns]

# Convert categorical columns to uniform data type. here : string
for col in categorical_cols:
    X[col] = X[col].astype(str)

# Prepare data for CatBoost
categorical_features_indices = [X.columns.get_loc(col) for col in categorical_cols]
print("Categorical feature indices obtained.")

# default parameters in a dictionary
params = {
    'iterations': 700,
    'learning_rate': 0.08780929167510496,
    'depth': 7,
    'l2_leaf_reg': 0.04063851295419513,
    'bagging_temperature': 0.8072638982547167,
    'random_strength': 0.7584336734894706,
    'border_count': 33,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'CPU',
    'verbose': 100,
    'allow_writing_files':False
}
print("catboost parameters set.")

# Prepare training data
train_pool = Pool(data=X, label=y, cat_features=categorical_features_indices)
print("Training pool prepared.")

# Initialize model object and train the model
print("Training the model...")
model = CatBoostRegressor(**params)
model.fit(train_pool)
print("Training has completed.")



Begin Training
Loading training data...
Training data loaded.
Removed rows with missing values in the target column.
Missing values filled with means in numerical columns.
Missing values in categorical columns handled.
attributes and target separated.
Categorical feature indices obtained.
catboost parameters set.
Training pool prepared.
Training the model...
0:	learn: 0.5992886	total: 363ms	remaining: 4m 13s
100:	learn: 0.3800618	total: 14.1s	remaining: 1m 23s
200:	learn: 0.3612583	total: 28s	remaining: 1m 9s
300:	learn: 0.3502140	total: 42s	remaining: 55.7s
400:	learn: 0.3423385	total: 56.1s	remaining: 41.9s
500:	learn: 0.3369388	total: 1m 10s	remaining: 28.2s
600:	learn: 0.3319372	total: 1m 27s	remaining: 14.3s
699:	learn: 0.3279447	total: 1m 42s	remaining: 0us
Training has completed.


In [22]:
model.save_model('cat_OF')

In [23]:
model = CatBoostRegressor()
model.load_model('cat_OF')

<catboost.core.CatBoostRegressor at 0x70f9dc3204f0>

In [24]:
test_df = pl.read_csv(f'{path_to_data}test.csv')
test_df = test_df.to_pandas()

categorical_cols = test_df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    test_df[col] = test_df[col].fillna('missing').astype(str)

cols_to_drop = ['num_draws_agent1', 'num_losses_agent1', 'num_wins_agent1']
X_test = test_df.drop(cols_to_drop, axis=1, errors='ignore')

categorical_cols = [col for col in categorical_cols if col in X_test.columns]
for col in categorical_cols:
    X_test[col] = X_test[col].astype(str)

predictions = model.predict(X_test)

table = pl.DataFrame({'Id': test_df['Id'], 'utility_agent1': 0})
result = table.with_columns(pl.Series('utility_agent1', predictions))

print(result)


shape: (3, 2)
┌────────┬────────────────┐
│ Id     ┆ utility_agent1 │
│ ---    ┆ ---            │
│ i64    ┆ f64            │
╞════════╪════════════════╡
│ 233234 ┆ 0.340349       │
│ 233235 ┆ -0.314         │
│ 233236 ┆ 0.133417       │
└────────┴────────────────┘


: 