In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

# Load the training data from a fixed absolute path into a DataFrame.
train_df = pd.read_csv('/home/code/data/train.csv')

# Print the per-category row counts of the raw Sex column as a baseline,
# before any feature engineering is applied.
print("Original data:")
print(train_df['Sex'].value_counts())

# Feature engineering WITHOUT touching Sex column
def create_features(df):
    """Return a copy of ``df`` extended with derived numeric columns.

    For each of Age, Height, Weight, Duration, Heart_Rate and Body_Temp a
    ``<name>_log1p`` column holding ``np.log1p`` of that column is appended,
    followed by ``Weight_Duration`` (Weight multiplied by Duration).

    The input frame is left unmodified, and no other column (for example
    ``Sex``) is changed.
    """
    base_columns = ('Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp')

    # Collect the new columns first; dict insertion order fixes the order in
    # which they are appended to the result.
    derived = {f'{name}_log1p': np.log1p(df[name]) for name in base_columns}
    derived['Weight_Duration'] = df['Weight'] * df['Duration']

    # assign() works on a copy, so the caller's frame is never mutated.
    return df.assign(**derived)

# Derive the engineered feature table from the raw training frame.
train_feat = create_features(train_df)

# Sex is the only column declared as categorical; every column other than
# the row id and the Calories target is used as a model input.
cat_features = ['Sex']
feature_cols = [
    name for name in train_feat.columns
    if name != 'id' and name != 'Calories'
]

print(f"\nFeature columns: {feature_cols}")
print(f"Categorical features: {cat_features}")

# Print the Sex counts again so they can be compared with the raw data
print(f"\nSex values after feature engineering:")
print(train_feat['Sex'].value_counts())

# Bundle the selected feature columns, the Calories target and the
# categorical column list into a CatBoost Pool.
train_pool = Pool(
    data=train_feat[feature_cols],
    label=train_feat['Calories'],
    cat_features=cat_features,
)

# Fit a seeded 10-iteration regressor on the pool with logging silenced.
model = CatBoostRegressor(random_seed=42, iterations=10, verbose=False)
model.fit(train_pool, verbose=False)
print("\nModel trained successfully!")

Original data:
Sex
M    4859
F    3141
Name: count, dtype: int64

Feature columns: ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Age_log1p', 'Height_log1p', 'Weight_log1p', 'Duration_log1p', 'Heart_Rate_log1p', 'Body_Temp_log1p', 'Weight_Duration']
Categorical features: ['Sex']

Sex values after feature engineering:
Sex
M    4859
F    3141
Name: count, dtype: int64

Model trained successfully!
