## Train baseline model

Create a quick-and-dirty model without any data preparation, just to see what we get. We will use LightGBM for the baseline model: tree-based models free us from feature preparation and scaling. We will also use Optuna to optimize the hyperparameters.

In [2]:
import pandas as pd
import project.project_api as project_api
import utils.model_lgb as model_lgb

In [3]:
# Notebook-wide configuration.
TARGET_KEY: str = "target"  # label column, separated from the feature columns below
CLASS_NUM: int = 5          # forwarded as `num_class` when the final model is trained
RANDOM_SEED: int = 42       # forwarded as `seed` when the final model is trained

In [4]:
# Optional down-sampling for quick experiments: set to e.g. 30_000 to work on a
# random subset of each split, or leave as None to use every row.
N_SAMPLES = None

df_train_fe = pd.read_parquet("./data/train_fe")
df_test_fe = pd.read_parquet("./data/test_fe")

if N_SAMPLES is not None:
    # Seeded so a sampled run is reproducible; capped at the frame length because
    # DataFrame.sample raises when n exceeds the number of rows (replace=False).
    df_train_fe = df_train_fe.sample(
        n=min(N_SAMPLES, len(df_train_fe)), random_state=RANDOM_SEED
    )
    df_test_fe = df_test_fe.sample(
        n=min(N_SAMPLES, len(df_test_fe)), random_state=RANDOM_SEED
    )

# Column overview; max_cols is raised so every column is listed individually.
df_train_fe.info(max_cols=900)

<class 'pandas.core.frame.DataFrame'>
Index: 146953 entries, 1525928 to 132580172
Data columns (total 816 columns):
 #    Column                            Non-Null Count   Dtype  
---   ------                            --------------   -----  
 0    Ama_rchrgmnt_sum_max_mnt1         146914 non-null  float16
 1    content_clc_mea_mnt1              146914 non-null  float16
 2    content_cnt_max_mnt1              146914 non-null  float16
 3    voice_out_short_part_max_mnt1     146916 non-null  float16
 4    voice_mts_in_nrest_part_std_mnt1  146916 non-null  float16
 5    num_act_days_max_mnt1             146916 non-null  float16
 6    sms_roam_clc_min_mnt1             146914 non-null  float16
 7    voice_in_cmpttrs_avg_durmin_mnt1  146916 non-null  float16
 8    com_num_part_mea_mnt1             146914 non-null  float16
 9    pay_avg_mea_mnt1                  146916 non-null  float16
 10   voice_out_tar_dur_std_mnt1        146902 non-null  float16
 11   voice_out_tar_dur_min_mnt1       

In [5]:
# LightGBM needs zero-based class labels, hence the shift by one on both splits.
y_train = df_train_fe[TARGET_KEY].sub(1)
y_test = df_test_fe[TARGET_KEY].sub(1)

# Features are every column except the label column.
X_train = df_train_fe.drop(columns=[TARGET_KEY])
X_test = df_test_fe.drop(columns=[TARGET_KEY])

In [6]:
# Study names for each training-set size tried so far, together with the best
# result recorded for that run. Keeping them in one mapping avoids a chain of
# assignments where only the last one takes effect.
STUDY_NAMES = {
    # 10k random sample without stratification
    # Best hyperparameters:  {'eta': 0.2946716146643395, 'boosting_type': 'gbdt', 'lambda_l1': 9.397345510495862, 'lambda_l2': 1.0215006016609774e-07, 'num_leaves': 6, 'min_data_in_leaf': 30, 'feature_fraction': 0.6779327568245341, 'bagging_fraction': 0.8997515432347486, 'bagging_freq': 5, 'min_child_samples': 34}
    # Best score:  0.4394
    "10k": "baseline_lgb",
    # 30k random sample without stratification
    # Best hyperparameters:  {'eta': 0.09723214967428116, 'boosting_type': 'gbdt', 'lambda_l1': 1.9100406735143505, 'lambda_l2': 3.5369799631250935e-07, 'num_leaves': 22, 'min_data_in_leaf': 20, 'feature_fraction': 0.4009733972022196, 'bagging_fraction': 0.8462707615712436, 'bagging_freq': 3, 'min_child_samples': 96}
    # Best score:  0.45436666666666664
    "30k": "baseline_lgb_30k",
    # full dataset
    # Best hyperparameters:  {'eta': 0.19997109376050565, 'boosting_type': 'gbdt', 'lambda_l1': 5.735491139313952e-07, 'lambda_l2': 3.1791646476628225e-06, 'num_leaves': 20, 'min_data_in_leaf': 30, 'feature_fraction': 0.8166299199026185, 'bagging_fraction': 0.9867527250605056, 'bagging_freq': 7, 'min_child_samples': 27}
    # Best score:  0.46992574536754916
    "full": "baseline_lgb_full",
}

# Study used by the rest of the notebook; change the key to switch runs.
study_name = STUDY_NAMES["full"]

In [7]:
# Run the hyperparameter search for the selected study. The log below shows that
# an existing study with this name is reused rather than recreated.
# NOTE(review): the reported best score equals the test accuracy printed at the
# end of the notebook, so the search appears to be scored on the test split —
# consider tuning on a separate validation split instead.
study = project_api.train_lgb(
    study_name=study_name,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

[I 2024-06-16 14:21:48,249] Using an existing study with name 'baseline_lgb_full' instead of creating a new one.


Best hyperparameters:  {'eta': 0.19997109376050565, 'boosting_type': 'gbdt', 'lambda_l1': 5.735491139313952e-07, 'lambda_l2': 3.1791646476628225e-06, 'num_leaves': 20, 'min_data_in_leaf': 30, 'feature_fraction': 0.8166299199026185, 'bagging_fraction': 0.9867527250605056, 'bagging_freq': 7, 'min_child_samples': 27}
Best score:  0.46992574536754916


In [8]:
# Fit the final baseline with the best hyperparameters found by the study.
# The helper returns a pair: a prediction callable (applied to feature frames in
# the report cell) and a second object bound to `model_bnum_initial`.
# `name` is forwarded as-is; presumably a label for the saved run — TODO confirm.
predict_bnum_initial, model_bnum_initial = model_lgb.train_multiclass(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    params=study.best_params,
    num_class=CLASS_NUM, seed=RANDOM_SEED,
    name="2024_05_31_baseline_fe_acc_4699",
)

In [9]:
# Score the fitted model on both splits; the gap between the two accuracies
# indicates how much the baseline overfits the training data.
print("Train dataset:")
project_api.report(y_test=y_train, y_pred=predict_bnum_initial(X_train))

# Kept as the cell's last expression so its return value is displayed as well.
print("\n\nTest dataset:")
project_api.report(y_test=y_test, y_pred=predict_bnum_initial(X_test))

Train dataset:
Accuracy: 0.5604308860656129


Test dataset:
Accuracy: 0.46992574536754916


{'accuracy': 0.46992574536754916}