### Class Weights 

In [2]:
import numpy as np

# Tracks the cross-validated quality depending on the weights set for observations
def show_model_cvs_class_weights(model, x_train, y_train, cv_type, metrics, weights=np.linspace(0.05, 0.95, 20)):
    """Cross-validate ``model`` once per class weight and collect the mean scores.

    For every value ``w`` in ``weights`` the observations whose label equals
    ``1.0`` receive the sample weight ``w`` and all other observations receive
    ``1 - w``.  The weights are forwarded to ``fit`` under the key
    ``'model__sample_weight'``, i.e. the estimator is expected to be a pipeline
    with a step named ``model`` that accepts ``sample_weight``.

    Args:
        model: Estimator (pipeline) passed to ``cross_validate``.
        x_train: Training features.
        y_train: pandas Series with the training labels.
        cv_type: Cross-validation splitter or number of folds.
        metrics: Scoring specification passed to ``cross_validate``.
        weights: Iterable of weights assigned to the label ``1.0``.

    Returns:
        A DataFrame with one row per weight holding the fold-averaged scores
        and a ``Min_Class_Weight`` column with the weight used.  Every row
        keeps index ``0`` (callers typically ``reset_index(drop=True)``).
        An empty DataFrame is returned when ``weights`` is empty.
    """
    # Loop-invariant: which observations carry the label 1.0.
    is_weighted_class = (y_train == 1.0).to_numpy()

    rows = []
    for weight in log_progress(weights):
        sample_weight = pd.Series(np.where(is_weighted_class, weight, 1 - weight),
                                  index=y_train.index)

        # NOTE: `fit_params` is deprecated since scikit-learn 1.4 in favour of
        # `params`; it is kept here for compatibility with older releases.
        cv_results = cross_validate(model,
                                    X=x_train,
                                    y=y_train,
                                    cv=cv_type,
                                    scoring=metrics,
                                    error_score='raise',
                                    n_jobs=-1,
                                    fit_params={'model__sample_weight': sample_weight})

        cv_results['Min_Class_Weight'] = weight

        # Average over folds; the first two entries (fit_time, score_time) are dropped.
        rows.append(pd.DataFrame(pd.DataFrame(cv_results).mean()).iloc[2:, :].T)

    # DataFrame.append was removed in pandas 2.0, so concatenate once at the end.
    return pd.concat(rows) if rows else pd.DataFrame()

# %%time
# weights_cv_results = show_model_cvs_class_weights(
#     model=model_pipeline,
#     x_train=x_train,
#     y_train=y_train,
#     cv_type=StratifiedKFold(shuffle=True, random_state=SEED),
#     metrics=['precision', 'recall', 'f1', 'roc_auc'],
#     weights=np.linspace(0.05, 0.95, 20)
# )

# weights_cv_results = weights_cv_results.reset_index(drop=True)

### Random Over/UnderSampling

In [None]:
# Cross-validates the model over a range of random over-/under-sampling sizes
def show_model_cvs_random_sampling(model, x_train, y_train, cv_type, metrics,
                                   sample_iters=20, sampling_type='under',
                                   target_labels=(-1, 1), random_state=SEED):
    """Cross-validate ``model`` on randomly re-sampled versions of the training set.

    The sample sizes are ``sample_iters`` evenly spaced integers between the
    size of the least frequent and the most frequent class of ``y_train``.
    With ``sampling_type='under'`` the rows labelled ``min(target_labels)`` are
    sampled without replacement down to each size; with any other value the
    rows labelled ``max(target_labels)`` are sampled with replacement up to
    each size.  The other class is always used in full.

    Args:
        model: Estimator passed to ``cross_validate``.
        x_train: DataFrame with the training features (left unmodified).
        y_train: Named pandas Series with exactly two distinct labels.
        cv_type: Cross-validation splitter or number of folds.
        metrics: Scoring specification passed to ``cross_validate``.
        sample_iters: Number of sample sizes to evaluate.
        sampling_type: ``'under'`` for under-sampling, anything else for
            over-sampling.
        target_labels: The two class labels; the smaller one is treated as the
            negative class and the larger one as the positive class.
        random_state: Seed used for every ``DataFrame.sample`` call.

    Returns:
        A DataFrame with one row per sample size holding the fold-averaged
        scores and a ``Sample_Size`` column.  Every row keeps index ``0``.
        An empty DataFrame is returned when no sample size is evaluated.
    """
    target_name = y_train.name

    # value_counts() sorts by frequency, so the majority class size comes first.
    maj_class_size, min_class_size = y_train.value_counts()

    # Split the [minority, majority] interval into the requested number of integer sizes.
    n_samples = np.floor(np.linspace(min_class_size, maj_class_size, sample_iters)).astype('int')

    # Attach the target to a copy so the caller's frame is never modified,
    # even if an exception is raised further down.
    labeled = x_train.copy()
    labeled[target_name] = y_train

    pos_class = labeled[labeled[target_name] == max(target_labels)]
    neg_class = labeled[labeled[target_name] == min(target_labels)]
    # NOTE(review): this assumes the negative label is the majority class and
    # the positive label the minority class - confirm against the data.

    undersample = sampling_type == 'under'

    rows = []
    for sample_size in log_progress(n_samples):
        if undersample:
            # Draw the required number of negative-class rows.
            neg_part = neg_class.sample(sample_size, random_state=random_state)
            resampled = pd.concat([neg_part, pos_class], axis=0)
        else:
            # Draw the required number of positive-class rows, with repetition.
            # NOTE(review): the duplicates are created before the CV split, so
            # copies of one row can land in both train and validation folds.
            pos_part = pos_class.sample(sample_size, replace=True, random_state=random_state)
            resampled = pd.concat([neg_class, pos_part], axis=0)

        # Separate the target again so it is not used as a feature.
        y_resampled = resampled[target_name]
        x_resampled = resampled.drop(target_name, axis=1)

        cv_results = cross_validate(model, X=x_resampled, y=y_resampled,
                                    cv=cv_type, scoring=metrics, error_score='raise', n_jobs=-1)

        cv_results['Sample_Size'] = sample_size

        # Average over folds; the first two entries (fit_time, score_time) are dropped.
        rows.append(pd.DataFrame(pd.DataFrame(cv_results).mean()).iloc[2:, :].T)

    # DataFrame.append was removed in pandas 2.0, so concatenate once at the end.
    return pd.concat(rows) if rows else pd.DataFrame()
    
    
# %%time 
# under_cv_results = show_model_cvs_random_sampling(model=model_pipeline,
#                                                   x_train=x_train,
#                                                   y_train=y_train,
#                                                   cv_type=StratifiedKFold(shuffle=True, random_state=SEED),
#                                                   metrics=['precision', 'recall', 'f1', 'roc_auc'],
#                                                   sampling_type='under',
#                                                   sample_iters=20)

# under_cv_results = under_cv_results.reset_index(drop=True)

### TomekLinks

In [3]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import TomekLinks

# # Переопределяем Pipeline
# model_pipeline = make_pipe(cat_bin_columns=cat_features.columns,
#                            num_columns=num_features.columns,
#                            model=GradientBoostingClassifier(random_state=SEED),
#                            cat_bin_imputer=SimpleImputer(strategy='constant', fill_value='unknown'),
#                            cat_bin_encoder=OneHotEncoder(sparse=True, handle_unknown='ignore'),
#                            num_imputer=SimpleImputer(strategy='mean'),
#                            num_scaler=StandardScaler())

# # Добавим этап downsampling в Pipeline
# model_pipeline.steps.insert(1, ('downsampling', TomekLinks(sampling_strategy='majority', n_jobs=-1)))