In [35]:
import os
import polars as pl
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm
from lightgbm import LGBMRegressor


In [36]:
# Record the library versions this notebook was executed with, one per line.
print("\n".join(lib.__version__ for lib in (pl, np, pd, sklearn, lightgbm)))

1.9.0
1.26.4
2.2.3
1.2.2
4.2.0


In [37]:

# ---- Configuration ----
# Column of train.csv that the models learn to predict.
target = 'utility_agent1'
# Directory holding the competition CSV files. File names are appended directly
# to this string, so the trailing slash is required.
path_to_data = '~/projects/ait/data/'

# ---- Notebook-wide state (filled in by later cells) ----
Light_Models = None  # becomes the list of per-fold models
counter = 0  # NOTE(review): not referenced by any cell visible here
train_cols, categorical_features = [], []  # feature columns / categorical subset


In [38]:

print("Starting training...")

# Loading data
train = pd.read_csv(f'{path_to_data}train.csv')
# Keep the target aside before it is removed from the feature frame below.
y_train = train[target]

print('dropping columns...')
# Number of distinct non-null values per column, computed once and reused for
# both the constant-column filter and the two-valued-column filter.
unique_counts = train.nunique()

# Drop columns with unique values < 2 and unwanted columns
cols_to_drop = [col for col, n_unique in unique_counts.items() if n_unique < 2]
cols_to_drop.extend(['num_draws_agent1', 'num_losses_agent1', 'num_wins_agent1', target])
train = train.drop(columns=cols_to_drop)

print('selecting categorical columns...')
# Every object-typed column is categorical ...
cols_with_object_as_val = train.select_dtypes(include='object').columns.tolist()
# ... and so is every two-valued column stored as object, int64 or float64.
categorical_cols = [
    col for col in train.columns
    if unique_counts[col] == 2 and train[col].dtype in ['object', 'int64', 'float64']
]

# A two-valued object column satisfies both rules above. Build the union
# without duplicates (first-seen order kept): assigning with a duplicated
# column key makes pandas raise "Columns must be same length as key", and the
# list passed on as categorical features should name each column once.
categorical_features = list(dict.fromkeys(cols_with_object_as_val + categorical_cols))

# Convert selected columns to category type
train[categorical_features] = train[categorical_features].astype('category')
train_cols = train.columns

print('Preprocessing Done.')


Starting training...
dropping columns...
selecting categorical columns...
Preprocessing Done.


In [41]:

Light_Models = []
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
rmses = []

print('Begin training...')
# Cross-validation: fit one model per fold, persist it, and score it on the
# held-out part of that fold.
for fold, (train_idx, val_idx) in enumerate(kfold.split(train)):
    X_train, X_val = train.iloc[train_idx], train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = LGBMRegressor(random_state=42, verbose=-1)
    model.fit(X_train, y_train_fold, categorical_feature=categorical_features)
    Light_Models.append(model)
    # Saved as a text booster so a later cell can reload it from disk.
    model.booster_.save_model(f"light_OF_k{fold+1}.txt")

    val_preds = model.predict(X_val)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, while this form works on every release.
    rmse = np.sqrt(mean_squared_error(y_val_fold, val_preds))
    print(f"Fold {fold + 1} : {rmse}")
    rmses.append(rmse)


print("Mean RMSE:", np.mean(rmses))
print("Training completed.")



Begin training...
Fold 1 : 0.3471106563754205
Fold 2 : 0.3515074457429645
Fold 3 : 0.33857166648621295
Fold 4 : 0.332664227787554
Fold 5 : 0.33473322742569916
Mean RMSE: 0.3409174447635702
Training completed.


In [43]:
def mean_of_models(data, models):
    """Average the predictions of several models on the same data.

    Parameters
    ----------
    data
        Input passed unchanged to every model's ``predict`` method.
    models
        Iterable of objects exposing ``predict(data)``.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Mean of the per-model predictions taken across models (axis 0).

    Raises
    ------
    ValueError
        If ``models`` is empty; averaging nothing would otherwise return NaN
        with only a runtime warning.
    """
    # Materialise once so generators work and emptiness can be checked.
    models = list(models)
    if not models:
        raise ValueError("mean_of_models requires at least one model")
    return np.mean([model.predict(data) for model in models], axis=0)


In [46]:
# Reload the five per-fold boosters written during training, in fold order.
model_files = [f"light_OF_k{fold+1}.txt" for fold in range(5)]
Light_Models = [lightgbm.Booster(model_file=model_file) for model_file in model_files]
print("Models loaded.")

Models loaded.


In [47]:

test = pd.read_csv(f'{path_to_data}test.csv')

# Keep the identifiers before the frame is reduced to the training columns, so
# the output table does not depend on 'Id' being one of the model features.
test_ids = test['Id']

# Align with the training layout in one step: columns absent from the test
# file are created and filled with 0, extra columns are dropped, and the
# column order matches train data. This runs before the categorical cast so a
# missing categorical feature is filled instead of raising a KeyError.
test = test.reindex(columns=train_cols, fill_value=0)

# Cast the categorical features; duplicates are removed first because
# assigning with a repeated column key raises in pandas.
cat_cols = list(dict.fromkeys(categorical_features))
test[cat_cols] = test[cat_cols].astype('category')

# Average the fold models and assemble the (Id, prediction) table.
predictions = mean_of_models(test, Light_Models)
result = pl.DataFrame({'Id': test_ids}).with_columns(pl.Series('utility_agent1', predictions))
print(result)


shape: (3, 2)
┌────────┬────────────────┐
│ Id     ┆ utility_agent1 │
│ ---    ┆ ---            │
│ i64    ┆ f64            │
╞════════╪════════════════╡
│ 233234 ┆ 0.178242       │
│ 233235 ┆ -0.182833      │
│ 233236 ┆ 0.144762       │
└────────┴────────────────┘
