In [None]:
#import libs
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
import random
#Model
import lightgbm as lgb
#BO
from bayes_opt import BayesianOptimization
#for random ensemble
from sklearn.linear_model import LogisticRegression, LinearRegression
#evo alg
from deap import base, creator, tools, algorithms
#plot
from matplotlib import pyplot as plt
import seaborn as sns

# Common elements

In [None]:
#dataset with added class label for b1, b2, c
df_data = pl.from_pandas(pd.read_csv('../df_initial_comp3.csv'))

#two binary targets derived from the raw string label (both expressions read the original column):
#  true_class -> 1 only for 'Baja+2'
#  class      -> 1 for 'Baja+1' or 'Baja+2' (b1 and b2 combined)
df_data = df_data.with_columns(
    [
        pl.when(pl.col('class') == 'Baja+2').then(1).otherwise(0).alias('true_class'),
        pl.when(pl.col('class').is_in(['Baja+1', 'Baja+2'])).then(1).otherwise(0).alias('class'),
    ]
)

#convert date
df_data = df_data.with_columns(pl.col('foto_mes').str.to_datetime().alias('foto_mes'))

#sort: one contiguous, date-ascending run of rows per client
df_data = df_data.sort(['numero_de_cliente', 'foto_mes'])

#select all float colnames; these are the columns that get lagged below
lag_columns = df_data.select(pl.col(pl.Float64)).columns

#cast the float columns to int in one pass
#(the previous per-column call passed `alias=col` as a keyword, which added a stray column named 'alias')
#NOTE(review): Int64 is as wide as Float64 and the cast drops fractional parts - confirm this is intended
df_data = df_data.with_columns([pl.col(col).cast(pl.Int64) for col in lag_columns])

#create lag features
#rows are date-ascending within a client, so a positive shift brings in the value from i rows earlier;
#a negative shift would bring in future values and leak information into the features
#NOTE(review): the shift is row based - assumes every client has one row per month without gaps
df_data = df_data.with_columns(
    [
        pl.col(col).shift(i).over('numero_de_cliente').alias(f"{col}_lag_{i}")
        for col in lag_columns
        for i in [1, 3, 6]
    ]
)

In [None]:
#sanity check: shows whether the target 'true_class' ended up among the float columns selected for lagging
'true_class' in lag_columns

In [None]:
#sanity check: shows whether the target 'class' ended up among the float columns selected for lagging
'class' in lag_columns

In [None]:
#shared seed: passed as random_state to the models below and used to seed `random` before the evolutionary run
SEED = 42

In [None]:
#training, validation and test data
#date ranges: 6 months for training, the following month for validation, the one after for test
df_data = df_data.filter(pl.col('foto_mes') >= pl.date(2020, 12, 1))

#train data: 2020-12 to 2021-05 (6 months)
df_train = (
    df_data
    .filter(pl.col('foto_mes') <= pl.date(2021, 5, 1))
    .to_pandas()
    .copy()
)

#validation (2021-06) and test (2021-07) data
df_val = (
    df_data
    .filter(pl.col('foto_mes') == pl.date(2021, 6, 1))
    .to_pandas()
    .copy()
)
df_test = (
    df_data
    .filter(pl.col('foto_mes') == pl.date(2021, 7, 1))
    .to_pandas()
    .copy()
)

#the polars frame is no longer needed once the pandas splits exist
del df_data

In [None]:
#display the training frame
df_train

In [None]:
#param space: (lower, upper) bound for every tuned LightGBM hyper-parameter
space = dict(
    MAX_DEPTH=(3, 12),
    COLSAMPLE_BYTREE=(0.5, 1),
    N_ESTIMATORS=(50, 1000),
    NUM_LEAVES=(128, 1024),
    REG_ALPHA=(0, 100),
    REG_LAMBDA=(0, 20),
)

In [None]:
#target function to maximize and curves
def calculate_expected_profit(df_labels, df_predictions, step=100, target_col='prob_target',class_col='class'):
    """Profit curve and best cutoff when contacting clients in descending score order.

    Parameters
    ----------
    df_labels : DataFrame with 'numero_de_cliente' and the 0/1 column `class_col`.
    df_predictions : DataFrame with 'numero_de_cliente' and the score column `target_col`.
    step : number of clients added per curve point.
    target_col : name of the score column used for ranking.
    class_col : name of the label column counted as hits.

    Returns
    -------
    (max_step, max_value, curve)
        curve is a list of (i, profit) for i = 0 .. len // step, where profit is
        hits * 280000 - contacted * 7000 over the top i * step clients.
        max_value is the highest strictly positive profit on the curve and max_step its i;
        when no point is profitable they stay at 0 and -1.
    """
    df_comb = df_labels.merge(df_predictions, on='numero_de_cliente')
    df_comb = df_comb.sort_values(target_col, ascending=False)

    # running number of hits in the top-k rows, computed once instead of re-summing every prefix
    cum_hits = df_comb[class_col].cumsum().to_numpy()

    curve = []
    max_value = 0
    max_step = -1
    for i in range(len(df_comb) // step + 1):
        n_sent = i * step
        hits = cum_hits[n_sent - 1] if n_sent else 0
        profit = hits * 280000 - n_sent * 7000
        curve.append((i, profit))
        if max_value < profit:
            max_value = profit
            max_step = i
    return max_step, max_value, curve

In [None]:
#logging: one profit curve per objective evaluation
log = []
def bo_function(NUM_LEAVES, MAX_DEPTH, N_ESTIMATORS, COLSAMPLE_BYTREE, REG_ALPHA, REG_LAMBDA, step=100, df_train=df_train):
    """Objective for the Bayesian optimizer: best validation profit of one LightGBM model.

    The hyper-parameters arrive as floats; the integer ones are truncated with int().
    The model is fit on `df_train` (target: 'class'), scored on the global `df_val`,
    and the resulting profit curve is appended to the global `log`.

    Parameters
    ----------
    NUM_LEAVES, MAX_DEPTH, N_ESTIMATORS, COLSAMPLE_BYTREE, REG_ALPHA, REG_LAMBDA :
        LightGBM hyper-parameters proposed by the optimizer.
    step : granularity of the profit curve (clients added per point).
    df_train : training frame; defaults to the frame bound when this cell was run.

    Returns
    -------
    The maximum profit found on the validation curve.
    """
    # identifier, date and target columns are never used as features
    non_feature_cols = ['numero_de_cliente', 'foto_mes', 'class', 'true_class']

    #model
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        eval_metric='cross_entropy_lambda',
        num_leaves=int(NUM_LEAVES),
        max_depth=int(MAX_DEPTH),
        n_estimators=int(N_ESTIMATORS),
        colsample_bytree=COLSAMPLE_BYTREE,
        learning_rate=1,
        random_state=SEED,
        zero_as_missing=True,
        class_weight='balanced',
        objective='binary',
        verbosity=-1,
        reg_alpha=REG_ALPHA,
        reg_lambda=REG_LAMBDA,
    )

    #train
    model.fit(df_train.drop(columns=non_feature_cols), df_train['class'])

    #predict; .copy() so the new column is written to an independent frame, not to a slice of df_val
    df_pred = df_val[['numero_de_cliente']].copy()
    df_pred['prob_target'] = model.predict_proba(df_val.drop(columns=non_feature_cols))[:, 1]

    # honour the `step` argument (it used to be ignored in favour of a literal 100)
    _, val, curve = calculate_expected_profit(
        df_labels=df_val[['numero_de_cliente', 'class']],
        df_predictions=df_pred,
        step=step,
    )
    log.append(curve)

    return val

# Traditional BO (baseline)

50 iterations, 16 initial points.

In [None]:
# Bayesian optimizer over `space`, maximizing the validation profit returned by bo_function
optimizer = BayesianOptimization(f=bo_function, pbounds=space, random_state=SEED)

In [None]:
# 16 random initial points followed by 50 guided iterations
optimizer.maximize(init_points=16, n_iter=50)

In [None]:
#Optimizer's iterations can be accessed via:

#print(optimizer.max)

#for i, res in enumerate(optimizer.res):
#    print("Iteration {}: \n\t{}".format(i, res))

## Curves

In [None]:
#collector for the results: one row per experiment with its label, best profit value and full profit curve
df_curves = pd.DataFrame(columns=['experiment','point value','curve'])

# BO ensemble

## BO ensemble semillero 20

In [None]:
#train 20 models that share the best BO hyper-parameters and differ only in their random seed
random.seed(42)
seed_array = [random.getrandbits(32) for i in range(20)]

#.copy() so the prediction columns are added to independent frames, not to slices of df_val / df_test
df_pred_val = df_val[['numero_de_cliente']].copy()
df_pred_test = df_test[['numero_de_cliente']].copy()

#identifier, date and target columns are never used as features
non_feature_cols = ['numero_de_cliente', 'foto_mes', 'class', 'true_class']
best_params = optimizer.max['params']

for seed in seed_array:

    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        eval_metric='cross_entropy_lambda',
        num_leaves=int(best_params['NUM_LEAVES']),
        max_depth=int(best_params['MAX_DEPTH']),
        n_estimators=int(best_params['N_ESTIMATORS']),
        colsample_bytree=best_params['COLSAMPLE_BYTREE'],
        learning_rate=1,
        random_state=seed,
        zero_as_missing=True,
        class_weight='balanced',
        objective='binary',
        verbosity=-1,
        reg_alpha=best_params['REG_ALPHA'],
        reg_lambda=best_params['REG_LAMBDA'],
    )

    #train
    model.fit(df_train.drop(columns=non_feature_cols), df_train['class'])

    #predict val
    df_pred_val[f'prob_target_{seed}'] = model.predict_proba(df_val.drop(columns=non_feature_cols))[:, 1]

    #predict test
    df_pred_test[f'prob_target_{seed}'] = model.predict_proba(df_test.drop(columns=non_feature_cols))[:, 1]


In [None]:
#Estimate cutoff on the validation month
STEP = 100

#ensemble score = plain average of the per-seed probabilities
df_pred_val['avg_prob'] = sum(df_pred_val[f'prob_target_{seed}'] for seed in seed_array) / len(seed_array)
df_pred_test['avg_prob'] = sum(df_pred_test[f'prob_target_{seed}'] for seed in seed_array) / len(seed_array)

step, val, curve = calculate_expected_profit(
    df_labels=df_val[['numero_de_cliente', 'class']],
    df_predictions=df_pred_val[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)
print('Num envios', step*STEP, '\nGanancia', val)

In [None]:
#df_pred_test.drop(['true_class_x','true_class_y','label','class_x','class_y','true_class'],axis=1)

In [None]:
#Expected profit on the test month using the cutoff estimated on validation
#labels from a previous run of this cell are dropped first so the cell can be re-run without duplicating columns
df_pred_test = (
    df_pred_test
    .drop(columns=['class', 'true_class', 'label'], errors='ignore')
    .sort_values('avg_prob', ascending=False)
    .merge(df_test[['numero_de_cliente', 'class', 'true_class']], on='numero_de_cliente')
)

#mark exactly the top step*STEP clients as sent
#positional slicing is used because .loc[:k] is label based and includes row k (one client too many);
#a negative step (no profitable cutoff found) means nothing is sent
n_sent = max(step, 0) * STEP
df_pred_test['label'] = 0
df_pred_test.iloc[:n_sent, df_pred_test.columns.get_loc('label')] = 1

#every contacted client costs 7000; a contacted 'Baja+2' client (true_class == 1) brings 273000 on top
#scored on true_class, like the other ensembles' expected-profit cells
#NOTE(review): this cell used the combined 'class' label before - confirm true_class is the intended target here
sent = df_pred_test['label'] == 1
hits = sent & (df_pred_test['true_class'] == 1)
ganancia_semillero = int(hits.sum()) * 273000 - int(sent.sum()) * 7000
print('Bayesian semillero 20', ganancia_semillero)

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against true_class
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'true_class']],
    df_predictions=df_pred_test[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='true_class',
)
print('Bayesian semillero 20 adjusted true labels', val)
df_curves.loc[len(df_curves)] = ['Bayesian semillero 20 true', val, curve]

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against the combined 'class' label
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'class']],
    df_predictions=df_pred_test[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)
print('Bayesian semillero 20 adjusted binary labels', val)
df_curves.loc[len(df_curves)] = ['Bayesian semillero 20 binary', val, curve]

## BO ensemble 20 mejores

In [None]:
#create df with parameters and objective values: one row per optimizer evaluation
param_cols = ['COLSAMPLE_BYTREE',
              'MAX_DEPTH',
              'NUM_LEAVES',
              'N_ESTIMATORS',
              'REG_ALPHA',
              'REG_LAMBDA']

#build all rows first and create the frame once, instead of growing it row by row
df_iters = pd.DataFrame(
    [
        {'ganancia': res['target'], **{name: res['params'][name] for name in param_cols}}
        for res in optimizer.res
    ],
    columns=['ganancia'] + param_cols,
)

#best evaluations first, re-indexed 0..n-1 so that row i is the i-th best configuration
df_iters = df_iters.sort_values('ganancia', ascending=False).reset_index(drop=True)

In [None]:
#train the 20 best configurations found by the optimizer (rows 0..19 of df_iters)
#.copy() so the prediction columns are added to independent frames, not to slices of df_val / df_test
df_pred_val_bo20 = df_val[['numero_de_cliente']].copy()
df_pred_test_bo20 = df_test[['numero_de_cliente']].copy()

#identifier, date and target columns are never used as features
non_feature_cols = ['numero_de_cliente', 'foto_mes', 'class', 'true_class']

for i in range(20):
    params_i = df_iters.loc[i]

    #same seed for every configuration, for reproducibility
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        eval_metric='cross_entropy_lambda',
        num_leaves=int(params_i['NUM_LEAVES']),
        max_depth=int(params_i['MAX_DEPTH']),
        n_estimators=int(params_i['N_ESTIMATORS']),
        colsample_bytree=params_i['COLSAMPLE_BYTREE'],
        learning_rate=1,
        random_state=SEED,
        zero_as_missing=True,
        class_weight='balanced',
        objective='binary',
        verbosity=-1,
        reg_alpha=params_i['REG_ALPHA'],
        reg_lambda=params_i['REG_LAMBDA'],
    )

    #train
    model.fit(df_train.drop(columns=non_feature_cols), df_train['class'])

    #predict val
    df_pred_val_bo20[f'prob_target_{i}'] = model.predict_proba(df_val.drop(columns=non_feature_cols))[:, 1]

    #predict test
    df_pred_test_bo20[f'prob_target_{i}'] = model.predict_proba(df_test.drop(columns=non_feature_cols))[:, 1]

In [None]:
#Estimate cutoff on the validation month
STEP = 100

#ensemble score = plain average over the 20 models
df_pred_val_bo20['avg_prob'] = sum(df_pred_val_bo20[f'prob_target_{i}'] for i in range(20)) / 20
df_pred_test_bo20['avg_prob'] = sum(df_pred_test_bo20[f'prob_target_{i}'] for i in range(20)) / 20

step, val, curve = calculate_expected_profit(
    df_labels=df_val[['numero_de_cliente', 'class']],
    df_predictions=df_pred_val_bo20[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)

In [None]:
#Expected profit on the test month using the cutoff estimated on validation

#highest scores first: the clients that get contacted are the head of this ranking
#(an ascending sort would select the clients least likely to churn)
df_pred_test_bo20 = (
    df_pred_test_bo20[['numero_de_cliente', 'avg_prob']]
    .sort_values('avg_prob', ascending=False)
    .merge(df_test[['numero_de_cliente', 'true_class']], on='numero_de_cliente')
)

#mark exactly the top step*STEP clients as sent
#positional slicing is used because .loc[:k] is label based and includes row k (one client too many);
#a negative step (no profitable cutoff found) means nothing is sent
n_sent = max(step, 0) * STEP
df_pred_test_bo20['label'] = 0
df_pred_test_bo20.iloc[:n_sent, df_pred_test_bo20.columns.get_loc('label')] = 1

#every contacted client costs 7000; a contacted 'Baja+2' client (true_class == 1) brings 273000 on top
sent = df_pred_test_bo20['label'] == 1
hits = sent & (df_pred_test_bo20['true_class'] == 1)
ganancia_best_20 = int(hits.sum()) * 273000 - int(sent.sum()) * 7000
print('Bayesian best 20', ganancia_best_20)

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against true_class
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'true_class']],
    df_predictions=df_pred_test_bo20[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='true_class',
)
#the printed label now names this experiment (it used to say "semillero")
print('Bayesian best 20 adjusted true labels', val)
df_curves.loc[len(df_curves.index)] = ['Bayesian best 20 true', val, curve]

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against the combined 'class' label
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'class']],
    df_predictions=df_pred_test_bo20[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)
#the printed label now names this experiment (it used to say "semillero")
print('Bayesian best 20 adjusted binary labels', val)
df_curves.loc[len(df_curves.index)] = ['Bayesian best 20 binary', val, curve]

# Random ensemble

In [None]:
def get_random_params(seed):
    """Draw one random LightGBM configuration that is fully determined by `seed`.

    A private generator is used, so the draw yields the same values as seeding the
    module-level `random` would, but leaves the global random state untouched.

    Returns
    -------
    (MAX_DEPTH, COLSAMPLE_BYTREE, N_ESTIMATORS, NUM_LEAVES, REG_ALPHA, REG_LAMBDA)
        Integers for MAX_DEPTH, N_ESTIMATORS and NUM_LEAVES (bounds inclusive),
        floats for the other three.
    """
    rng = random.Random(seed)
    # the order of the draws is part of the contract: it fixes which value each parameter gets
    MAX_DEPTH = rng.randint(3, 12)
    COLSAMPLE_BYTREE = rng.uniform(0.5, 1)
    N_ESTIMATORS = rng.randint(50, 1000)
    NUM_LEAVES = rng.randint(128, 1024)
    REG_ALPHA = rng.uniform(0, 100)
    REG_LAMBDA = rng.uniform(0, 20)
    return MAX_DEPTH, COLSAMPLE_BYTREE, N_ESTIMATORS, NUM_LEAVES, REG_ALPHA, REG_LAMBDA

In [None]:
#df with outputs
#.copy() so the prediction columns are added to independent frames, not to slices of df_val / df_test
df_pred_val_ra = df_val[['numero_de_cliente']].copy()
df_pred_test_ra = df_test[['numero_de_cliente']].copy()

#names of the per-model probability columns, in training order
features = []

#identifier, date and target columns are never used as features
non_feature_cols = ['numero_de_cliente', 'foto_mes', 'class', 'true_class']

#generate random models: each seed determines one random hyper-parameter configuration
random.seed(42)
seed_array = [random.getrandbits(32) for i in range(20)]
for seed in seed_array:
    #model
    MAX_DEPTH, COLSAMPLE_BYTREE, N_ESTIMATORS, NUM_LEAVES, REG_ALPHA, REG_LAMBDA = get_random_params(seed)
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        eval_metric='cross_entropy_lambda',
        num_leaves=int(NUM_LEAVES),
        max_depth=int(MAX_DEPTH),
        n_estimators=int(N_ESTIMATORS),
        colsample_bytree=COLSAMPLE_BYTREE,
        learning_rate=1,
        random_state=SEED,
        zero_as_missing=True,
        class_weight='balanced',
        objective='binary',
        verbosity=-1,
        reg_alpha=REG_ALPHA,
        reg_lambda=REG_LAMBDA,
    )

    #train
    model.fit(df_train.drop(columns=non_feature_cols), df_train['class'])

    #predict val
    features.append(f'prob_target_{seed}')
    df_pred_val_ra[f'prob_target_{seed}'] = model.predict_proba(df_val.drop(columns=non_feature_cols))[:, 1]

    #predict test
    df_pred_test_ra[f'prob_target_{seed}'] = model.predict_proba(df_test.drop(columns=non_feature_cols))[:, 1]

In [None]:
STEP = 100
#attach both validation labels to the per-model predictions (used to fit the stacking models)
df_pred_val_ra = df_pred_val_ra.merge(
    df_val[['numero_de_cliente', 'class', 'true_class']],
    on='numero_de_cliente',
)

In [None]:
#weight models: logistic regression on the per-model probabilities, fitted on the validation month
reg = LogisticRegression(random_state=SEED)
reg = reg.fit(df_pred_val_ra[features], df_pred_val_ra['class'])

#stacked score for validation and test
df_pred_val_ra['avg_prob'] = reg.predict_proba(df_pred_val_ra[features])[:, 1]
df_pred_test_ra['avg_prob'] = reg.predict_proba(df_pred_test_ra[features])[:, 1]

#Estimate cutoff
step, val, curve = calculate_expected_profit(
    df_labels=df_val[['numero_de_cliente', 'class']],
    df_predictions=df_pred_val_ra[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)

In [None]:
#Expected profit on the test month using the cutoff estimated on validation
#labels from a previous run of this cell are dropped first so the cell can be re-run without duplicating columns
#highest scores first: the clients that get contacted are the head of this ranking
#(an ascending sort would select the clients least likely to churn)
df_pred_test_ra = (
    df_pred_test_ra
    .drop(columns=['true_class', 'label'], errors='ignore')
    .sort_values('avg_prob', ascending=False)
    .merge(df_test[['numero_de_cliente', 'true_class']], on='numero_de_cliente')
)

#mark exactly the top step*STEP clients as sent (1), everybody else as not sent (0);
#a negative step (no profitable cutoff found) means nothing is sent
n_sent = max(step, 0) * STEP
df_pred_test_ra['label'] = 0
df_pred_test_ra.iloc[:n_sent, df_pred_test_ra.columns.get_loc('label')] = 1

#every contacted client costs 7000; a contacted 'Baja+2' client (true_class == 1) brings 273000 on top
#cost and revenue are added together (they used to be multiplied)
sent = df_pred_test_ra['label'] == 1
hits = sent & (df_pred_test_ra['true_class'] == 1)
ganancia_random = int(hits.sum()) * 273000 - int(sent.sum()) * 7000
print('Weighted random ensemble', ganancia_random)

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against true_class
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'true_class']],
    df_predictions=df_pred_test_ra[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='true_class',
)
print('Random 20 adjusted true labels (logreg)', val)
df_curves.loc[len(df_curves)] = ['Random 20 true (logreg)', val, curve]

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against the combined 'class' label
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'class']],
    df_predictions=df_pred_test_ra[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)
print('Random 20 adjusted binary labels (logreg)', val)
df_curves.loc[len(df_curves)] = ['Random 20 binary (logreg)', val, curve]

In [None]:
#weight models: linear regression on the per-model probabilities, fitted on the validation month
#the seed list is regenerated the same way as for training, so the column names line up
random.seed(42)
seed_array = [random.getrandbits(32) for i in range(20)]
features = [f'prob_target_{seed}' for seed in seed_array]

lin_reg = LinearRegression()
lin_reg = lin_reg.fit(df_pred_val_ra[features], df_pred_val_ra['class'])

#stacked score for validation and test
df_pred_val_ra['avg_prob_lin'] = lin_reg.predict(df_pred_val_ra[features])
df_pred_test_ra['avg_prob_lin'] = lin_reg.predict(df_pred_test_ra[features])

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against true_class
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'true_class']],
    df_predictions=df_pred_test_ra[['numero_de_cliente', 'avg_prob_lin']],
    step=STEP,
    target_col='avg_prob_lin',
    class_col='true_class',
)
print('Random 20 adjusted true labels (linreg)', val)
df_curves.loc[len(df_curves)] = ['Random 20 true (linreg)', val, curve]

In [None]:
#Best profit on the test month when the number of clients sent is re-tuned against the combined 'class' label
STEP = 100
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'class']],
    df_predictions=df_pred_test_ra[['numero_de_cliente', 'avg_prob_lin']],
    step=STEP,
    target_col='avg_prob_lin',
    class_col='class',
)
print('Random 20 adjusted binary labels (linreg)', val)
df_curves.loc[len(df_curves)] = ['Random 20 binary (linreg)', val, curve]

# Genetic ensemble

In [None]:
#fitness with a single objective; the positive weight marks it as something to maximize
creator.create("Fitness", base.Fitness, weights=(1.0,)) 
#an individual is a plain list of hyper-parameter values that carries the fitness above
creator.create("Individual", list, fitness=creator.Fitness)

In [None]:
#logging: one profit curve per evaluated individual
log_evo = []
def evaluate(individual):
    """Fitness of one individual: best validation profit of the LightGBM model it encodes.

    Gene layout: [NUM_LEAVES, MAX_DEPTH, N_ESTIMATORS, COLSAMPLE_BYTREE, REG_ALPHA, REG_LAMBDA].
    The model is fit on the global `df_train` (target: 'class'), scored on the global `df_val`
    against 'true_class', and the profit curve is appended to the global `log_evo`.

    Returns
    -------
    A one-element tuple with the maximum profit, the shape DEAP expects for fitness values.
    """
    # identifier, date and target columns are never used as features
    non_feature_cols = ['numero_de_cliente', 'foto_mes', 'class', 'true_class']

    #model
    print(individual)
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        eval_metric='cross_entropy_lambda',
        num_leaves=int(individual[0]),
        max_depth=int(individual[1]),
        n_estimators=int(individual[2]),
        # colsample_bytree is capped at 1
        colsample_bytree=min(1, individual[3]),
        learning_rate=1,
        random_state=42,
        zero_as_missing=True,
        class_weight='balanced',
        objective='binary',
        verbosity=-1,
        reg_alpha=individual[4],
        reg_lambda=individual[5],
    )
    #train
    model.fit(df_train.drop(columns=non_feature_cols), df_train['class'])

    #predict; .copy() so the new column is written to an independent frame, not to a slice of df_val
    df_pred = df_val[['numero_de_cliente']].copy()
    df_pred['prob_target'] = model.predict_proba(df_val.drop(columns=non_feature_cols))[:, 1]

    _, val, curve = calculate_expected_profit(
        df_labels=df_val[['numero_de_cliente', 'true_class']],
        df_predictions=df_pred,
        step=100,
        target_col='prob_target',
        class_col='true_class',
    )

    #validation values
    log_evo.append(curve)
    return (val,)

In [None]:
def crossover_with_constraints(ind1, ind2, alpha):
    """Blend crossover followed by clamping every gene back into the search space.

    `tools.cxBlend` is applied to the two individuals with blending factor `alpha`;
    each gene of both children is then limited to the bounds stored in `space`
    and coerced to the type of its parameter (int or float).

    Returns
    -------
    (child1, child2) as returned by the blend, after clamping.
    """
    # (gene position, key in `space`, type the gene is coerced to)
    gene_spec = (
        (0, 'NUM_LEAVES', int),
        (1, 'MAX_DEPTH', int),
        (2, 'N_ESTIMATORS', int),
        (3, 'COLSAMPLE_BYTREE', float),
        (4, 'REG_ALPHA', float),
        (5, 'REG_LAMBDA', float),
    )

    child1, child2 = tools.cxBlend(ind1, ind2, alpha)

    for child in (child1, child2):
        for position, key, caster in gene_spec:
            lower, upper = space[key][0], space[key][1]
            # limit to [lower, upper] first, then coerce (int() truncates)
            child[position] = caster(max(min(child[position], upper), lower))

    return child1, child2

In [None]:
def mutate_with_constraints(ind, mu, sigma, indpb):
    """Gaussian mutation of a copy of `ind`, clamped back into the search space.

    `mu`, `sigma` and `indpb` are forwarded to `tools.mutGaussian`. Afterwards every gene
    is limited to the bounds stored in `space` and coerced to the type of its parameter.

    Returns
    -------
    A one-element tuple holding the mutated copy.
    """
    # (gene position, key in `space`, type the gene is coerced to)
    gene_spec = (
        (0, 'NUM_LEAVES', int),
        (1, 'MAX_DEPTH', int),
        (2, 'N_ESTIMATORS', int),
        (3, 'COLSAMPLE_BYTREE', float),
        (4, 'REG_ALPHA', float),
        (5, 'REG_LAMBDA', float),
    )

    # create a copy of the individual to mutate
    mutant = toolbox.clone(ind)

    # apply the Gaussian-noise mutation; the return value is not used, the copy itself is modified
    tools.mutGaussian(mutant, mu, sigma, indpb)

    # apply the parameter constraints to the mutant
    for position, key, caster in gene_spec:
        lower, upper = space[key][0], space[key][1]
        mutant[position] = caster(max(min(mutant[position], upper), lower))

    return (mutant,)

In [None]:
#setting up DEAP
toolbox = base.Toolbox()
AttrSet = []

#one sampler per hyper-parameter: integer parameters use randint, continuous ones use uniform
samplers = {
    'MAX_DEPTH': random.randint,
    'N_ESTIMATORS': random.randint,
    'NUM_LEAVES': random.randint,
    'COLSAMPLE_BYTREE': random.uniform,
    'REG_ALPHA': random.uniform,
    'REG_LAMBDA': random.uniform,
}
for param in space.keys():
    if param in samplers:
        toolbox.register(param, samplers[param], space[param][0], space[param][1])
    AttrSet.append(getattr(toolbox, param))

#gene order of an individual; evaluate / crossover / mutation index the genes by these positions
gene_samplers = (
    toolbox.NUM_LEAVES,
    toolbox.MAX_DEPTH,
    toolbox.N_ESTIMATORS,
    toolbox.COLSAMPLE_BYTREE,
    toolbox.REG_ALPHA,
    toolbox.REG_LAMBDA,
)
toolbox.register("individual", tools.initCycle, creator.Individual, gene_samplers, 1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

#variation, selection and evaluation operators
toolbox.register("mate", crossover_with_constraints, alpha=0.5)
toolbox.register("mutate", mutate_with_constraints, mu=0, sigma=1, indpb=0.2)
toolbox.register("select", tools.selNSGA2)  # Selection
toolbox.register("evaluate", evaluate)

In [None]:
#initial population of 10 individuals built with the registered samplers
#NOTE(review): the samplers draw from the global `random` state, which is reseeded with SEED only in a later cell - confirm this draw is reproducible as intended
population = toolbox.population(n=10)

#target function to maximize and curves
#NOTE(review): this redefines a function of the same name that appears earlier in the notebook
def calculate_expected_profit(df_labels, df_predictions, step=100, target_col='prob_target',class_col='class'):
    """Profit curve and best cutoff when contacting clients in descending score order.

    Parameters
    ----------
    df_labels : DataFrame with 'numero_de_cliente' and the 0/1 column `class_col`.
    df_predictions : DataFrame with 'numero_de_cliente' and the score column `target_col`.
    step : number of clients added per curve point.
    target_col : name of the score column used for ranking.
    class_col : name of the label column counted as hits.

    Returns
    -------
    (max_step, max_value, curve)
        curve is a list of (i, profit) for i = 0 .. len // step, where profit is
        hits * 280000 - contacted * 7000 over the top i * step clients.
        max_value is the highest strictly positive profit on the curve and max_step its i;
        when no point is profitable they stay at 0 and -1.
    """
    df_comb = df_labels.merge(df_predictions, on='numero_de_cliente')
    df_comb = df_comb.sort_values(target_col, ascending=False)

    # running number of hits in the top-k rows, computed once instead of re-summing every prefix
    cum_hits = df_comb[class_col].cumsum().to_numpy()

    curve = []
    max_value = 0
    max_step = -1
    for i in range(len(df_comb) // step + 1):
        n_sent = i * step
        hits = cum_hits[n_sent - 1] if n_sent else 0
        profit = hits * 280000 - n_sent * 7000
        curve.append((i, profit))
        if max_value < profit:
            max_value = profit
            max_step = i
    return max_step, max_value, curve

In [None]:
#seed the global generator, then give every member of the initial population its fitness
random.seed(SEED)
for pop in tqdm(population):
    fitness_tuple = evaluate(pop)
    pop.fitness.values = fitness_tuple

In [None]:
#evo alg
#hall of fame keeps the 20 best individuals seen over the whole run
halloffame = tools.HallOfFame(20)
generations = 10

#one generation per call so that tqdm can show the progress;
#the returned value is not used, `population` and `halloffame` are the objects read afterwards
for gen in tqdm(range(generations), desc="Evolving"):
    algorithms.eaMuPlusLambda(
        population,
        toolbox,
        mu=10,
        lambda_=20,
        cxpb=0.7,
        mutpb=0.2,
        ngen=1,
        stats=None,
        halloffame=halloffame,
    )

# Print the 20 best individuals
print("Top 20 individuals:")
for ind in halloffame:
    print("Params:", ind)
    print("Fitness:", ind.fitness.values)

In [None]:
#df with outputs
# .copy() gives independent frames, so the probability columns added below are
# not written into slices of df_val / df_test (avoids SettingWithCopyWarning)
df_pred_val_gen = df_val[['numero_de_cliente']].copy()
df_pred_test_gen = df_test[['numero_de_cliente']].copy()

# identifier, date and label columns that must not be fed to the model
non_feature_cols = ['numero_de_cliente', 'foto_mes', 'class', 'true_class']

# names of the per-model probability columns, in training order
features = []

# train one LightGBM model per hall-of-fame individual
random.seed(42)
for i, ind in enumerate(halloffame):

    # individual layout: [num_leaves, max_depth, n_estimators,
    #                     colsample_bytree, reg_alpha, reg_lambda]
    model = lgb.LGBMClassifier(
                                boosting_type='gbdt',
                                eval_metric='cross_entropy_lambda',
                                num_leaves=int(ind[0]),
                                max_depth=int(ind[1]),
                                n_estimators=int(ind[2]),
                                # cap at 1 because the evolved value may exceed it
                                colsample_bytree=min(1,ind[3]),
                                learning_rate=1,
                                random_state=42,
                                zero_as_missing=True,
                                class_weight='balanced',
                                objective='binary',
                                verbosity=-1,
                                reg_alpha=ind[4],
                                reg_lambda=ind[5]
                              )
    #train on the binary 'class' label
    model.fit(df_train.drop(non_feature_cols, axis=1), df_train['class'])

    #predict: probability of the positive class on validation and test
    prob_col = f'prob_target_{i}'
    features.append(prob_col)
    df_pred_val_gen[prob_col] = model.predict_proba(df_val.drop(non_feature_cols, axis=1))[:,1]
    df_pred_test_gen[prob_col] = model.predict_proba(df_test.drop(non_feature_cols, axis=1))[:,1]

In [None]:
STEP = 100

# Ensemble score = plain average of the per-model probabilities.
# The columns come from `features` (filled while the models were trained) instead of a
# hard-coded range(20): the hall of fame may hold fewer than 20 individuals, in which
# case the missing 'prob_target_<i>' columns would raise a KeyError.
df_pred_val_gen['avg_prob'] = df_pred_val_gen[features].mean(axis=1)
df_pred_test_gen['avg_prob'] = df_pred_test_gen[features].mean(axis=1)

In [None]:
#Adjust number of sent
STEP = 100
# profit curve of the averaged ensemble on the test set, scored against 'true_class'
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'true_class']],
    df_predictions=df_pred_test_gen[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='true_class',
)
print('Genetic 20 adjusted true labels', val)
# store the result in df_curves under row label len(df_curves)
df_curves.loc[len(df_curves)] = ['Genetic 20 true', val, curve]

In [None]:
#Adjust number of sent
STEP = 100
# profit curve of the averaged ensemble on the test set, scored against the binary 'class'
step, val, curve = calculate_expected_profit(
    df_labels=df_test[['numero_de_cliente', 'class']],
    df_predictions=df_pred_test_gen[['numero_de_cliente', 'avg_prob']],
    step=STEP,
    target_col='avg_prob',
    class_col='class',
)
print('Genetic 20 adjusted binary labels', val)
# store the result in df_curves under row label len(df_curves)
df_curves.loc[len(df_curves)] = ['Genetic 20 binary', val, curve]

In [None]:
# Order the collected results by the 'experiment' column
df_curves = df_curves.sort_values('experiment')

In [None]:
# Persist all collected curves to CSV (the DataFrame index is written as well)
df_curves.to_csv('df_curves.csv')

# Curves for all models

In [None]:

# Curves of every experiment whose name contains 'binary'
plt.figure(figsize=(10,6))
for _, row in df_curves.loc[df_curves['experiment'].str.contains('binary')].iterrows():
    # curve points are (step index, value) pairs; only the first 300 points are drawn
    x, y = zip(*row['curve'][:300])

    # step index -> STEP rows per step on the x axis; value -> millions on the y axis
    plt.plot([idx * STEP for idx in x],
             [value / 1000000 for value in y],
             label=row['experiment'])

plt.legend()
plt.title('Curves per experiment (binary labels)')
plt.xlabel('Envios')
plt.ylabel('Function Value, millions')

plt.show()

In [None]:

# Curves of every experiment whose name contains 'true'
plt.figure(figsize=(10,6))
for _, row in df_curves.loc[df_curves['experiment'].str.contains('true')].iterrows():
    # curve points are (step index, value) pairs; only the first 300 points are drawn
    x, y = zip(*row['curve'][:300])

    # step index -> STEP rows per step on the x axis; value -> millions on the y axis
    plt.plot([idx * STEP for idx in x],
             [value / 1000000 for value in y],
             label=row['experiment'])

plt.legend(loc=4)
plt.title('Curves per experiment (true labels)')
plt.xlabel('Envios')
plt.ylabel('Function Value, millions')

plt.show()

In [None]:
# Write the same df_curves frame again, this time under the "baseline" file name
df_curves.to_csv('df_curves_baseline.csv')