## Import the necessary libraries

In [14]:
# trying out label encoder and onehot encoder in lightgbm

from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

## Create synthetic dataset
Since we are interested in dealing with categorical variables, we will create a schema like `userid, categorical1, categorical2`.

In [15]:
# Load the toy dataset from a CSV file addressed relative to the working directory.
df = pd.read_csv('toy_dataset.csv')
# Preview the first rows to check the available columns and spot missing values.
df.head() 

Unnamed: 0,age,income,city,owns_pet,purchased1,purchased2,user_id
0,23.0,48000.0,Lima,yes,0,1,c3e70dd6-a461-452d-98ae-a5c288d47145
1,45.0,82000.0,Cusco,no,1,0,9cb556a1-2ed7-4665-82c4-3d3b2191886d
2,31.0,61000.0,Lima,yes,0,0,44dd4982-783f-4e1c-97f6-87a93382ecb3
3,,54000.0,Arequipa,no,1,0,798f4272-7943-499b-a89d-9b94051ba127
4,52.0,,Lima,yes,1,1,defd8b36-f92e-4b68-a2a3-c1e453956b8f


In [16]:
# Separate the prediction targets and the row identifier from the model inputs.
target_columns = ['purchased1', 'purchased2']
target = df[target_columns]
user_id = df['user_id']
# Whatever is neither a target nor the identifier stays in `df` as a feature column.
df = df.drop(columns=[*target_columns, 'user_id'])

In [17]:
# Partition the feature columns by dtype: numeric columns on one side,
# everything else (treated as categorical) on the other.
numeric_frame = df.select_dtypes(include=np.number)
non_numeric_frame = df.select_dtypes(exclude=np.number)

numerical_features = list(numeric_frame.columns)
categorical_features = list(non_numeric_frame.columns)

print('numerical features', numerical_features)
print('categorical features', categorical_features)

numerical features ['age', 'income']
categorical features ['city', 'owns_pet']


In [18]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: a single standardisation step.
numeric_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encoding; handle_unknown='ignore' keeps transform
# from raising on categories that were not seen during fit.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=categorical_steps)



## hyperparameters

In [19]:
# Keyword arguments for lightgbm.LGBMClassifier (one classifier is built per target).
# NOTE(review): 'min_child_samples' is the minimum number of rows per leaf; on a toy
# dataset with fewer rows than that, presumably no split is possible — confirm and
# lower it for small experiments.
hp = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'n_estimators': 30,
    'max_depth': 6,
    'min_child_samples': 20, # bump that to 500-1000 when we train for millions
    'subsample': 0.8,
    # Row subsampling is only applied when subsample_freq > 0; LightGBM's default
    # of 0 disables bagging, which would leave 'subsample' without any effect.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'max_bin': 64,
    'n_jobs': -1,
    'num_leaves': 64, # new
    'random_state': 42
}

## model definition

In [26]:
import pandas as pd
import time
import lightgbm as lgb
import numpy as np
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin

class LGBMModel(BaseEstimator, ClassifierMixin):
    """
    Train one independent LGBMClassifier per target column.

    Exposes .fit, .predict_proba and .predict, and is sklearn-compatible so
    MLflow's sklearn flavor can log it.

    Parameters
    ----------
    **lgbm_params
        Keyword arguments forwarded to ``lgb.LGBMClassifier``. The optional key
        ``early_stopping_rounds`` is consumed by this wrapper (default 10) and
        only used to configure the early-stopping callback when validation data
        is passed to ``fit``; it is not forwarded to the classifier.

    Attributes
    ----------
    companies : pandas.Index or None
        Target column names, set by ``fit``.
    models_ : dict
        Maps each target column name to its fitted classifier; created by ``fit``.
    """
    def __init__(self, **lgbm_params):
        self.lgbm_params = lgbm_params
        self.companies = None # to be defined during the call to `fit`

    def get_params(self, deep=True):
        """
        Return the LightGBM keyword arguments as this estimator's parameters.

        BaseEstimator discovers parameters from the ``__init__`` signature and
        ignores ``**kwargs``, so without this override ``clone`` and the
        parameter display would see an estimator with no parameters at all.
        """
        return dict(self.lgbm_params)

    def set_params(self, **params):
        """Update the stored LightGBM keyword arguments and return ``self``."""
        self.lgbm_params.update(params)
        return self

    @staticmethod
    def _as_frame(y):
        """Coerce a Series / array-like target into a DataFrame so it has ``.columns``."""
        if isinstance(y, pd.DataFrame):
            return y
        if isinstance(y, pd.Series):
            return y.to_frame()
        return pd.DataFrame(y)

    def _check_fitted(self):
        """Raise sklearn's NotFittedError when ``fit`` has not produced any model yet."""
        if not getattr(self, "models_", None):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This LGBMModel instance is not fitted yet; call `fit` first.")

    def fit(self, X_train: pd.DataFrame, y_train: pd.DataFrame, X_val: pd.DataFrame = None, y_val: pd.DataFrame = None):
        """
        Fit one classifier per column of ``y_train``.

        X_train: user features (anything ``LGBMClassifier.fit`` accepts)
        y_train: companies payment binary information, one column per target
        X_val, y_val: optional validation data. It is handed to LightGBM as the
            eval set unchanged, so it has to be in the same feature space as
            ``X_train``. Both must be given to enable AUC evaluation, early
            stopping and MLflow metric logging.

        Returns ``self``.
        """
        y_train = self._as_frame(y_train)
        has_validation = X_val is not None and y_val is not None
        if has_validation:
            y_val = self._as_frame(y_val)

        # `early_stopping_rounds` only drives the callback below; keep it out of the
        # classifier kwargs so early stopping is not also requested for fits that
        # have no validation data.
        classifier_params = dict(self.lgbm_params)
        stopping_rounds = classifier_params.pop("early_stopping_rounds", 10)

        self.companies = y_train.columns
        # Start from a clean slate so a refit never keeps models of previous targets.
        self.models_ = {}

        n_targets = len(self.companies)
        for j, target in enumerate(self.companies, start=1):
            # print to get track of progress, kinda difficult without this
            print(f"===== TRAINING LABEL ===== {j}/{n_targets}: {target}")
            start_time = time.perf_counter()
            model = lgb.LGBMClassifier(**classifier_params)
            # default training
            fit_kwargs = {
                "X": X_train,
                "y": y_train[target]
            }
            # if we pass eval training set
            if has_validation:
                fit_kwargs.update({
                    "eval_set": [(X_val, y_val[target])],
                    "eval_metric": "auc",
                    "callbacks": [
                        lgb.early_stopping(stopping_rounds=stopping_rounds),
                        lgb.log_evaluation(period=10)
                    ]
                })
            model.fit(**fit_kwargs)

            # Only try to access best_score_ if validation was used
            if has_validation and getattr(model, 'best_score_', None):
                try:
                    auc = model.best_score_['valid_0']['auc']
                    print(f"Validation AUC for {target}: {auc}")
                    # log metric
                    mlflow.log_metric(f'{target}_auc', auc)
                except KeyError:
                    print(f"AUC score not available for {target}")
            else:
                print(f"No validation data provided for {target}")

            # track how much time elapsed (monotonic clock, unaffected by system clock changes)
            duration = time.perf_counter() - start_time
            print(f"Training time for target {target}: {duration:.2f} seconds")
            # save model
            self.models_[target] = model
        return self

    def predict_proba(self, X):
        """
        Return a DataFrame with one column per target.

        Each column holds column 1 of that target classifier's ``predict_proba``
        output, i.e. the probability of the second class in its ``classes_``
        (label 1 for 0/1 targets). Raises NotFittedError before ``fit``.
        """
        self._check_fitted()
        index = X.index if isinstance(X, pd.DataFrame) else None
        return pd.DataFrame(
            {target: model.predict_proba(X)[:, 1] for target, model in self.models_.items()},
            index=index,
        )

    def predict(self, X):
        """
        Return a DataFrame with one column of predicted labels per target.

        Raises NotFittedError before ``fit``.
        """
        self._check_fitted()
        index = X.index if isinstance(X, pd.DataFrame) else None
        return pd.DataFrame(
            {target: model.predict(X) for target, model in self.models_.items()},
            index=index,
        )

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# Route each column group through its own transformer; columns that are not
# listed in either group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full pipeline: preprocessing followed by the per-target LightGBM wrapper.
pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LGBMModel(**hp)),
])

In [28]:
from sklearn import set_config
# Ask scikit-learn to render estimators as an HTML diagram, then show the
# pipeline by leaving it as the last expression of the cell.
set_config(display='diagram')
pipe

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [29]:
# Fit the preprocessing + per-target LightGBM pipeline on the whole frame.
# No X_val / y_val is forwarded, so LGBMModel.fit takes its "no validation" branch:
# no early stopping and no AUC is logged to MLflow.
# NOTE(review): the training log reports 8 data points and "number of used features: 0";
# presumably min_child_samples=20 in `hp` rules out every split on a set this small — confirm.
pipe.fit(df,target)

===== TRAINING LABEL ===== 1/2: purchased1
[LightGBM] [Info] Number of positive: 5, number of negative: 3
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625000 -> initscore=0.510826
[LightGBM] [Info] Start training from score 0.510826
No validation data provided for purchased1
Training time for target purchased1: 0.00 seconds
===== TRAINING LABEL ===== 2/2: purchased2
[LightGBM] [Info] Number of positive: 4, number of negative: 4
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
No validation data provided for purchased2
Training time for target purchased2: 0.00 seconds




0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'
