In [None]:
### 1. Логистическая регрессия 

from sklearn.linear_model import LogisticRegression

def log_reg_clf_hitrate(x_random_state, x_features_train, x_target_train, x_features_test, x_target_test, c_w):
    """Tune ``C`` of a logistic regression on Hitrate@5, refit and score on test.

    For every ``C`` in ``10**-3 .. 10**3`` a ``liblinear`` LogisticRegression
    is trained on each of 5 stratified, shuffled folds of the training data
    and scored with ``calculate_hitrate_at_5`` (defined elsewhere) on the
    held-out fold. The ``C`` with the highest mean fold score is used to refit
    on the whole training set; that model is scored on the test set and its
    signed coefficients are plotted with ``plot_feature_importances``.

    Args:
        x_random_state: Seed passed to ``StratifiedKFold`` and the model.
        x_features_train: Training features; indexed with ``.iloc`` and must
            expose ``.columns`` (used to label the coefficients).
        x_target_train: Training target; indexed with ``.iloc``.
        x_features_test: Test features for the final evaluation.
        x_target_test: Test target for the final evaluation.
        c_w: Value forwarded to the model's ``class_weight`` argument.

    Returns:
        tuple: ``(best_model, LR_data_metrix)``. The DataFrame holds one row
        per tried ``C`` with columns ``hitrate`` (mean CV score), ``x_c``,
        and the constant columns ``best_C``, ``max_train_hitrate`` (the best
        mean CV score) and ``max_test_hitrate`` (test score of the refit).
    """
    print("LogisticRegression")

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=x_random_state)
    cv_results = []

    for exponent in range(-3, 4):
        # C is the inverse regularization strength (not an intercept).
        x_c = 10**exponent

        fold_metrics = []
        for train_idx, valid_idx in kf.split(x_features_train, x_target_train):
            x_features_fold_train = x_features_train.iloc[train_idx]
            x_features_valid = x_features_train.iloc[valid_idx]
            x_target_fold_train = x_target_train.iloc[train_idx]
            x_target_valid = x_target_train.iloc[valid_idx]

            model = LogisticRegression(random_state=x_random_state, solver='liblinear', class_weight=c_w, C=x_c)
            model.fit(x_features_fold_train, x_target_fold_train)

            # Hitrate@5 on the held-out fold.
            fold_metrics.append(calculate_hitrate_at_5(model, x_features_valid, x_target_valid))

        cv_results.append([np.mean(fold_metrics), x_c])

    # Select the winner among the evaluated candidates instead of starting
    # from a 0 sentinel: if every mean score were 0 the sentinel C=0 would be
    # kept and the refit below would fail. max() keeps the first candidate on
    # ties, matching a strict "greater than" update.
    LR_max_hitrate, LR_hitrate_C = max(cv_results, key=lambda row: row[0])

    LR_data_metrix = pd.DataFrame(cv_results, columns=['hitrate', 'x_c'])
    display(LR_data_metrix)
    LR_data_metrix["best_C"] = LR_hitrate_C
    LR_data_metrix["max_train_hitrate"] = LR_max_hitrate
    print('Максимум Hitrate =', LR_max_hitrate, '| свободный член регрессии С=', LR_hitrate_C)

    # Refit on the full training set with the selected C.
    best_model = LogisticRegression(random_state=x_random_state, solver='liblinear', class_weight=c_w, C=LR_hitrate_C)
    best_model.fit(x_features_train, x_target_train)

    # Evaluate the refit model on the test data.
    hitrate_test = calculate_hitrate_at_5(best_model, x_features_test, x_target_test)
    print(f'Test Hitrate@5: {hitrate_test:.4f}')

    LR_data_metrix["max_test_hitrate"] = hitrate_test

    # Signed coefficients of the first (only, for a binary target) coef_ row
    # serve as feature importances.
    importances = best_model.coef_[0]
    feature_importance = pd.DataFrame(
        importances,
        index=x_features_train.columns,
        columns=['importance']
    ).sort_values(by='importance', ascending=False)

    # Plot the feature importances.
    plot_feature_importances(feature_importance=feature_importance, model_name=f"Logistic Regression {c_w}", target_type="Hitrate@5")

    return best_model, LR_data_metrix


## Пропишем классификаторы для исследования моделей

In [None]:
### Функция для построения гистограммы важности признаков
def plot_feature_importances(feature_importance, model_name, target_type):
    """Show a horizontal bar chart of feature importances.

    Args:
        feature_importance: DataFrame with an ``importance`` column; its index
            supplies the bar labels.
        model_name: Model label placed first in the chart title.
        target_type: Metric/target label placed second in the chart title.
    """
    # Ascending sort so the largest value is drawn last by barh.
    ordered = feature_importance.sort_values('importance', ascending=True)
    labels = ordered.index
    values = ordered['importance']

    plt.figure(figsize=(20, 16))
    plt.barh(labels, values, height=0.7)
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Features', fontsize=12)
    plt.title(f'{model_name} - {target_type} - Feature Importance', fontsize=16)
    plt.show()


### Строит матрицу ошибок (confusion matrix) для оценки качества классификации
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def ax_plot_confusion_matrix(ax, y_true, y_pred, labels=None, title="Confusion Matrix"):
    """Draw the confusion matrix of ``y_true`` vs ``y_pred`` on the given axes.

    Args:
        ax: Matplotlib axes to draw on; its title is set to ``title``.
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Optional label list, forwarded both to ``confusion_matrix``
            and to the display as tick labels.
        title: Title for the axes.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    # Wrap the raw counts in a display object and render it on the caller's axes.
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(ax=ax)
    ax.set_title(title)


def print_clf(df_LR, df_DT, df_RF, df_LGBM):
    """
    Print and collect the best Hitrate@5 result of each classifier.

    Each argument is a tuning table with a ``hitrate`` column plus
    ``max_train_hitrate`` and ``max_test_hitrate`` columns. For every table
    the row with the highest ``hitrate`` is printed together with its
    hyperparameter columns (every column that is not a bookkeeping column).

    Returns a DataFrame with one row per classifier and the columns
    ``Classifier``, ``Hitrate@5``, ``Train Hitrate@5``, ``Test Hitrate@5``
    and ``Best Params`` (a dict of hyperparameter name -> value).
    """
    # Columns that carry metrics or copies of the winning value rather than
    # the hyperparameter tried in that row.
    non_param_columns = (
        'hitrate', 'max_train_hitrate', 'max_test_hitrate',
        'best_C', 'best_depth', 'best_n_estimators',
    )
    tables = (
        ("LogisticRegression", df_LR),
        ("DecisionTreeClassifier", df_DT),
        ("RandomForestClassifier", df_RF),
        ("LGBMClassifier", df_LGBM),
    )

    summary_rows = []
    for clf_name, table in tables:
        print(clf_name)

        best_row = table.loc[table['hitrate'].idxmax()]
        best_params = {
            column: best_row[column]
            for column in table.columns
            if column not in non_param_columns
        }

        print('Максимум Hitrate@5 =', best_row['hitrate'])
        print('Тренировочные данные: Hitrate@5 =', best_row['max_train_hitrate'])
        print('Тестовые данные: Hitrate@5 =', best_row['max_test_hitrate'])
        print('Гиперпараметры:')
        for column, value in best_params.items():
            print(f'  {column}: {value}')

        summary_rows.append([
            clf_name,
            best_row['hitrate'],
            best_row['max_train_hitrate'],
            best_row['max_test_hitrate'],
            best_params,
        ])

        print()

    return pd.DataFrame(
        summary_rows,
        columns=['Classifier', 'Hitrate@5', 'Train Hitrate@5', 'Test Hitrate@5', 'Best Params'],
    )

In [None]:
### Напишем классификаторы

from sklearn.model_selection import StratifiedKFold

### 1. Логистическая регрессия 

from sklearn.linear_model import LogisticRegression

def log_reg_clf_hitrate(x_random_state, x_features_train, x_target_train, x_features_test, x_target_test, c_w):
    """Tune ``C`` of a logistic regression on Hitrate@5, refit and score on test.

    For every ``C`` in ``10**-3 .. 10**3`` a ``liblinear`` LogisticRegression
    is trained on each of 5 stratified, shuffled folds of the training data
    and scored with ``calculate_hitrate_at_5`` (defined elsewhere) on the
    held-out fold. The ``C`` with the highest mean fold score is used to refit
    on the whole training set; that model is scored on the test set and its
    signed coefficients are plotted with ``plot_feature_importances``.

    Args:
        x_random_state: Seed passed to ``StratifiedKFold`` and the model.
        x_features_train: Training features; indexed with ``.iloc`` and must
            expose ``.columns`` (used to label the coefficients).
        x_target_train: Training target; indexed with ``.iloc``.
        x_features_test: Test features for the final evaluation.
        x_target_test: Test target for the final evaluation.
        c_w: Value forwarded to the model's ``class_weight`` argument.

    Returns:
        tuple: ``(best_model, LR_data_metrix)``. The DataFrame holds one row
        per tried ``C`` with columns ``hitrate`` (mean CV score), ``x_c``,
        and the constant columns ``best_C``, ``max_train_hitrate`` (the best
        mean CV score) and ``max_test_hitrate`` (test score of the refit).
    """
    print("LogisticRegression")

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=x_random_state)
    cv_results = []

    for exponent in range(-3, 4):
        # C is the inverse regularization strength (not an intercept).
        x_c = 10**exponent

        fold_metrics = []
        for train_idx, valid_idx in kf.split(x_features_train, x_target_train):
            x_features_fold_train = x_features_train.iloc[train_idx]
            x_features_valid = x_features_train.iloc[valid_idx]
            x_target_fold_train = x_target_train.iloc[train_idx]
            x_target_valid = x_target_train.iloc[valid_idx]

            model = LogisticRegression(random_state=x_random_state, solver='liblinear', class_weight=c_w, C=x_c)
            model.fit(x_features_fold_train, x_target_fold_train)

            # Hitrate@5 on the held-out fold.
            fold_metrics.append(calculate_hitrate_at_5(model, x_features_valid, x_target_valid))

        cv_results.append([np.mean(fold_metrics), x_c])

    # Select the winner among the evaluated candidates instead of starting
    # from a 0 sentinel: if every mean score were 0 the sentinel C=0 would be
    # kept and the refit below would fail. max() keeps the first candidate on
    # ties, matching a strict "greater than" update.
    LR_max_hitrate, LR_hitrate_C = max(cv_results, key=lambda row: row[0])

    LR_data_metrix = pd.DataFrame(cv_results, columns=['hitrate', 'x_c'])
    display(LR_data_metrix)
    LR_data_metrix["best_C"] = LR_hitrate_C
    LR_data_metrix["max_train_hitrate"] = LR_max_hitrate
    print('Максимум Hitrate =', LR_max_hitrate, '| свободный член регрессии С=', LR_hitrate_C)

    # Refit on the full training set with the selected C.
    best_model = LogisticRegression(random_state=x_random_state, solver='liblinear', class_weight=c_w, C=LR_hitrate_C)
    best_model.fit(x_features_train, x_target_train)

    # Evaluate the refit model on the test data.
    hitrate_test = calculate_hitrate_at_5(best_model, x_features_test, x_target_test)
    print(f'Test Hitrate@5: {hitrate_test:.4f}')

    LR_data_metrix["max_test_hitrate"] = hitrate_test

    # Signed coefficients of the first (only, for a binary target) coef_ row
    # serve as feature importances.
    importances = best_model.coef_[0]
    feature_importance = pd.DataFrame(
        importances,
        index=x_features_train.columns,
        columns=['importance']
    ).sort_values(by='importance', ascending=False)

    # Plot the feature importances.
    plot_feature_importances(feature_importance=feature_importance, model_name=f"Logistic Regression {c_w}", target_type="Hitrate@5")

    return best_model, LR_data_metrix


### 2. Дерево решений

from sklearn.tree import DecisionTreeClassifier

def dec_tre_clf_hitrate(x_random_state, x_features_train, x_target_train, x_features_test, x_target_test, c_w):
    """Tune ``max_depth`` of a decision tree on Hitrate@5, refit and score on test.

    For every even depth from 4 to 50 a DecisionTreeClassifier is trained on
    each of 5 stratified, shuffled folds of the training data and scored with
    ``calculate_hitrate_at_5`` (defined elsewhere) on the held-out fold. The
    depth with the highest mean fold score is used to refit on the whole
    training set; that model is scored on the test set and its
    ``feature_importances_`` are plotted with ``plot_feature_importances``.

    Args:
        x_random_state: Seed passed to ``StratifiedKFold`` and the model.
        x_features_train: Training features; indexed with ``.iloc`` and must
            expose ``.columns`` (used to label the importances).
        x_target_train: Training target; indexed with ``.iloc``.
        x_features_test: Test features for the final evaluation.
        x_target_test: Test target for the final evaluation.
        c_w: Value forwarded to the model's ``class_weight`` argument.

    Returns:
        tuple: ``(best_model, DT_data_metrix)``. The DataFrame holds one row
        per tried depth with columns ``hitrate`` (mean CV score), ``depth``,
        and the constant columns ``best_depth``, ``max_train_hitrate`` (the
        best mean CV score) and ``max_test_hitrate`` (test score of the refit).
    """
    print("DecisionTreeClassifier")

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=x_random_state)
    cv_results = []

    for depth in range(4, 51, 2):
        fold_metrics = []

        for train_idx, valid_idx in kf.split(x_features_train, x_target_train):
            x_features_fold_train = x_features_train.iloc[train_idx]
            x_features_valid = x_features_train.iloc[valid_idx]
            x_target_fold_train = x_target_train.iloc[train_idx]
            x_target_valid = x_target_train.iloc[valid_idx]

            model = DecisionTreeClassifier(max_depth=depth, random_state=x_random_state, class_weight=c_w)
            model.fit(x_features_fold_train, x_target_fold_train)

            # Hitrate@5 on the held-out fold.
            fold_metrics.append(calculate_hitrate_at_5(model, x_features_valid, x_target_valid))

        cv_results.append([np.mean(fold_metrics), depth])

    # Select the winner among the evaluated candidates instead of starting
    # from a 0 sentinel: if every mean score were 0 the sentinel depth 0 would
    # be kept and the refit below would receive an invalid max_depth. max()
    # keeps the first candidate on ties, matching a strict ">" update.
    DT_max_hitrate, DT_hitrate_depth = max(cv_results, key=lambda row: row[0])

    DT_data_metrix = pd.DataFrame(cv_results, columns=['hitrate', 'depth'])
    display(DT_data_metrix)
    DT_data_metrix["best_depth"] = DT_hitrate_depth
    DT_data_metrix["max_train_hitrate"] = DT_max_hitrate
    print('Максимум Hitrate =', DT_max_hitrate, '| глубина дерева = ', DT_hitrate_depth)

    # Refit on the full training set with the selected depth.
    best_model = DecisionTreeClassifier(max_depth=DT_hitrate_depth, random_state=x_random_state, class_weight=c_w)
    best_model.fit(x_features_train, x_target_train)

    # Evaluate the refit model on the test data.
    hitrate_test = calculate_hitrate_at_5(best_model, x_features_test, x_target_test)
    print(f'Test Hitrate@5: {hitrate_test:.4f}')

    DT_data_metrix["max_test_hitrate"] = hitrate_test

    # Feature importances reported by the fitted tree.
    importances = best_model.feature_importances_
    feature_importance = pd.DataFrame(
        importances,
        index=x_features_train.columns,
        columns=['importance']
    ).sort_values(by='importance', ascending=False)

    # Plot the feature importances.
    plot_feature_importances(feature_importance=feature_importance, model_name=f"Decision Tree {c_w}", target_type="Hitrate@5")

    return best_model, DT_data_metrix


### 3. Случайный лес

from sklearn.ensemble import RandomForestClassifier

def ran_for_clf_hitrate(x_random_state, x_features_train, x_target_train, x_features_test, x_target_test, c_w):
    """Grid-search a random forest on Hitrate@5, refit and score on test.

    Every combination of ``max_depth`` in (6, 8, 10) and ``n_estimators`` in
    (50, 100, ..., 300) is trained on each of 5 stratified, shuffled folds of
    the training data and scored with ``calculate_hitrate_at_5`` (defined
    elsewhere) on the held-out fold. The combination with the highest mean
    fold score is used to refit on the whole training set; that model is
    scored on the test set and its ``feature_importances_`` are plotted with
    ``plot_feature_importances``.

    Args:
        x_random_state: Seed passed to ``StratifiedKFold`` and the model.
        x_features_train: Training features; indexed with ``.iloc`` and must
            expose ``.columns`` (used to label the importances).
        x_target_train: Training target; indexed with ``.iloc``.
        x_features_test: Test features for the final evaluation.
        x_target_test: Test target for the final evaluation.
        c_w: Value forwarded to the model's ``class_weight`` argument.

    Returns:
        tuple: ``(best_model, RF_data_metrix)``. The DataFrame holds one row
        per tried combination with columns ``hitrate`` (mean CV score),
        ``depth``, ``estim``, and the constant columns ``best_depth``,
        ``best_n_estimators``, ``max_train_hitrate`` (the best mean CV score)
        and ``max_test_hitrate`` (test score of the refit).
    """
    print("RandomForestClassifier")

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=x_random_state)
    cv_results = []

    for depth in range(6, 11, 2):
        for estim in range(50, 301, 50):
            fold_metrics = []

            for train_idx, valid_idx in kf.split(x_features_train, x_target_train):
                x_features_fold_train = x_features_train.iloc[train_idx]
                x_features_valid = x_features_train.iloc[valid_idx]
                x_target_fold_train = x_target_train.iloc[train_idx]
                x_target_valid = x_target_train.iloc[valid_idx]

                model = RandomForestClassifier(n_estimators=estim, max_depth=depth, random_state=x_random_state, class_weight=c_w)
                model.fit(x_features_fold_train, x_target_fold_train)

                # Hitrate@5 on the held-out fold.
                fold_metrics.append(calculate_hitrate_at_5(model, x_features_valid, x_target_valid))

            cv_results.append([np.mean(fold_metrics), depth, estim])

    # Select the winner among the evaluated candidates instead of starting
    # from 0 sentinels: if every mean score were 0 the sentinels would be kept
    # and the refit below would receive n_estimators=0 / max_depth=0. max()
    # keeps the first candidate on ties, matching a strict ">" update.
    RF_max_Hitrate, RF_Hitrate_depth, RF_Hitrate_n_estimators = max(cv_results, key=lambda row: row[0])

    RF_data_metrix = pd.DataFrame(cv_results, columns=['hitrate', 'depth', 'estim'])
    display(RF_data_metrix)
    RF_data_metrix["best_depth"] = RF_Hitrate_depth
    RF_data_metrix["best_n_estimators"] = RF_Hitrate_n_estimators
    RF_data_metrix["max_train_hitrate"] = RF_max_Hitrate
    print('Максимум Hitrate@5 =', RF_max_Hitrate, '| число деревьев = ', RF_Hitrate_n_estimators, '| глубина дерева = ', RF_Hitrate_depth)

    # Refit on the full training set with the selected combination.
    best_model = RandomForestClassifier(n_estimators=RF_Hitrate_n_estimators, max_depth=RF_Hitrate_depth, random_state=x_random_state, class_weight=c_w)
    best_model.fit(x_features_train, x_target_train)

    # Evaluate the refit model on the test data.
    hitrate_test = calculate_hitrate_at_5(best_model, x_features_test, x_target_test)
    print('Hitrate@5 на тестовых данных =', hitrate_test)

    RF_data_metrix["max_test_hitrate"] = hitrate_test

    # Feature importances reported by the fitted forest.
    importances = best_model.feature_importances_
    feature_importance = pd.DataFrame(
        importances,
        index=x_features_train.columns,
        columns=['importance']
    ).sort_values(by='importance', ascending=False)

    # Plot the feature importances.
    plot_feature_importances(feature_importance=feature_importance, model_name=f"Random Forest {c_w}", target_type="Hitrate@5")

    return best_model, RF_data_metrix


from lightgbm import LGBMClassifier

def lgbm_clf_hitrate(x_random_state, x_features_train, x_target_train, x_features_test, x_target_test, c_w):
    """Tune ``max_depth`` of an LGBMClassifier on Hitrate@5, refit and score on test.

    For every even depth from 4 to 50 an LGBMClassifier (all other parameters
    fixed) is trained on each of 5 stratified, shuffled folds of the training
    data and scored with ``calculate_hitrate_at_5`` (defined elsewhere) on the
    held-out fold. The depth with the highest mean fold score is used to refit
    on the whole training set; that model is scored on the test set and its
    ``feature_importances_`` are plotted with ``plot_feature_importances``.

    Args:
        x_random_state: Seed passed to ``StratifiedKFold`` only; it is not
            forwarded to the LightGBM model.
        x_features_train: Training features; indexed with ``.iloc`` and must
            expose ``.columns`` (used to label the importances).
        x_target_train: Training target; indexed with ``.iloc``.
        x_features_test: Test features for the final evaluation.
        x_target_test: Test target for the final evaluation.
        c_w: Value forwarded to the model's ``class_weight`` parameter.

    Returns:
        tuple: ``(best_model, lgbm_data_metrix)``. The DataFrame holds one row
        per tried depth with columns ``hitrate`` (mean CV score), ``depth``,
        and the constant columns ``best_depth``, ``max_train_hitrate`` (the
        best mean CV score) and ``max_test_hitrate`` (test score of the refit).
    """
    print("LightGBMClassifier")

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=x_random_state)
    cv_results = []

    # Parameters shared by every candidate and by the final refit; only
    # max_depth varies.
    # NOTE(review): feature_fraction / colsample_bytree look like aliases of
    # the same LightGBM parameter - confirm against the LightGBM docs and
    # drop one of them.
    base_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'max_cat_threshold': 25,
        'min_data_in_leaf': 10,
        'num_threads': 4,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'class_weight': c_w
    }

    for depth in range(4, 51, 2):
        fold_metrics = []

        for train_idx, valid_idx in kf.split(x_features_train, x_target_train):
            x_features_fold_train = x_features_train.iloc[train_idx]
            x_features_valid = x_features_train.iloc[valid_idx]
            x_target_fold_train = x_target_train.iloc[train_idx]
            x_target_valid = x_target_train.iloc[valid_idx]

            model = LGBMClassifier(**base_params, max_depth=depth)
            model.fit(x_features_fold_train, x_target_fold_train)

            # Hitrate@5 on the held-out fold.
            fold_metrics.append(calculate_hitrate_at_5(model, x_features_valid, x_target_valid))

        cv_results.append([np.mean(fold_metrics), depth])

    # Select the winner among the evaluated candidates instead of starting
    # from a 0 sentinel: if every mean score were 0 the sentinel depth 0 would
    # be kept, reported as "best", and used for the refit even though it was
    # never evaluated. max() keeps the first candidate on ties, matching a
    # strict ">" update.
    lgbm_max_Hitrate, lgbm_Hitrate_depth = max(cv_results, key=lambda row: row[0])

    lgbm_data_metrix = pd.DataFrame(cv_results, columns=['hitrate', 'depth'])
    display(lgbm_data_metrix)
    lgbm_data_metrix["best_depth"] = lgbm_Hitrate_depth
    lgbm_data_metrix["max_train_hitrate"] = lgbm_max_Hitrate
    print('Максимум Hitrate@5 =', lgbm_max_Hitrate, '| глубина дерева = ', lgbm_Hitrate_depth)

    # Refit on the full training set with the selected depth.
    best_model = LGBMClassifier(**base_params, max_depth=lgbm_Hitrate_depth)
    best_model.fit(x_features_train, x_target_train)

    # Evaluate the refit model on the test data.
    hitrate_test = calculate_hitrate_at_5(best_model, x_features_test, x_target_test)
    print('Hitrate@5 на тестовых данных =', hitrate_test)

    lgbm_data_metrix["max_test_hitrate"] = hitrate_test

    # Feature importances reported by the fitted booster.
    importances = best_model.feature_importances_
    feature_importance = pd.DataFrame(
        importances,
        index=x_features_train.columns,
        columns=['importance']
    ).sort_values(by='importance', ascending=False)

    # Plot the feature importances.
    plot_feature_importances(feature_importance=feature_importance, model_name=f"LGBM {c_w}", target_type="Hitrate@5")

    return best_model, lgbm_data_metrix