In [157]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from probatus.feature_elimination import ShapRFECV
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
import optuna

In [158]:
import pandas as pd

def load_and_preprocess_data(file_path, drop_columns=None, date_column=None):
    """
    Загружает данные из файла CSV, удаляет указанные колонки, обрабатывает дату и кодирует категориальные данные.
    
    """
    # Загружаем данные
    df = pd.read_csv(file_path)
    
    # Удаляем указанные колонки
    if drop_columns:
        df.drop(columns=drop_columns, inplace=True, errors='ignore')
    
    # Обрабатываем колонку с датами, если она указана
    if date_column:
        df[date_column] = pd.to_datetime(df[date_column], errors='coerce')  # Преобразуем дату, пропуская некорректные значения
        df['year'] = df[date_column].dt.year
        df['month'] = df[date_column].dt.month
        df['day'] = df[date_column].dt.day
        df['weekday'] = df[date_column].dt.weekday
        df['quarter'] = df[date_column].dt.quarter
        df.drop(columns=[date_column], inplace=True)
    
    # Заполняем пропущенные значения
    df.fillna(0, inplace=True)
    
    # Преобразуем категориальные данные в числовые (One-Hot Encoding)
    df = pd.get_dummies(df, drop_first=True)
    
    # Преобразуем логические значения в целочисленные
    bool_columns = df.select_dtypes(include=['bool']).columns
    if not bool_columns.empty:
        df[bool_columns] = df[bool_columns].astype('int')
    
    return df


In [159]:

X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)


X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


In [160]:
from sklearn.ensemble import RandomForestClassifier
from probatus.feature_elimination import ShapRFECV

def feature_selection(X, y, n_features=5):
    """
    Отбор признаков с использованием ShapRFECV..
    """

    model = RandomForestClassifier(random_state=42)

   
    shap_selector = ShapRFECV(model, step=0.2, cv=3, scoring='accuracy', n_jobs=-1)

   
    shap_selector.fit(X, y)

   
    selected_features = shap_selector.get_reduced_features_set(num_features=n_features)

    print(f"Выбрано {len(selected_features)} признаков: {selected_features}")
    
    return X[selected_features]


In [161]:
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

def train_and_select_model(X_train, X_test, y_train, y_test):
    """
    Обучает CatBoost, LightGBM, XGBoost и выбирает лучшую модель.
    
    """
    models = {
        "catboost": CatBoostClassifier(verbose=0, random_state=42),
        "lightgbm": LGBMClassifier(random_state=42),
        "xgboost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
    
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[name] = acc
        print(f"{name} accuracy: {acc}")
    
    best_model_name = max(results, key=results.get)
    best_model = models[best_model_name]
    print(f"Best model: {best_model_name}")
    return best_model


In [162]:
import optuna
from sklearn.model_selection import cross_val_score

def optimize_model_with_optuna(X, y, model_name):
    """
    Настраивает гиперпараметры модели.
    """
    def objective(trial):
        if model_name == "catboost":
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10)
            }
            model = CatBoostClassifier(**params, verbose=0, random_state=42)
        
        elif model_name == "lightgbm":
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'max_depth': trial.suggest_int('max_depth', 3, 10)
            }
            model = LGBMClassifier(**params, random_state=42)
        
        elif model_name == "xgboost":
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'n_estimators': trial.suggest_int('n_estimators', 50, 300)
            }
            model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=42)
        
        score = cross_val_score(model, X, y, cv=3, scoring='accuracy').mean()
        return score

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)
    return study.best_params


In [163]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class BlendingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        for _, model in self.models:
            model.fit(X, y)
        return self

    def predict(self, X):
        preds = np.array([model.predict(X) for _, model in self.models])
        return np.round(np.mean(preds, axis=0)).astype(int)


In [164]:
def evaluate_model(model, X, y, X_test, y_test):
    """
    Оценивает модель и возвращает результаты в DataFrame.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results = X_test.copy()
    results['y_real'] = y_test
    results['y_pred'] = y_pred
    print(f"Accuracy: {accuracy}")
    return results


In [165]:
df = load_and_preprocess_data('/home/vozzy/ml_project/structured_customer_data.csv', drop_columns=['customer_id'], date_column='registration_date')
X = df.drop(columns=['target'])
y = df['target']


In [166]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [167]:
X_train_selected = feature_selection(X_train, y_train, n_features=5)
X_test_selected = X_test[X_train_selected.columns]


Выбрано 5 признаков: ['average_purchase_value', 'visit_frequency', 'days_since_last_visit', 'customer_feedback_score', 'day']


In [168]:
best_model = train_and_select_model(X_train_selected, X_test_selected, y_train, y_test)


catboost accuracy: 0.915
[LightGBM] [Info] Number of positive: 723, number of negative: 77
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 384
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.903750 -> initscore=2.239604
[LightGBM] [Info] Start training from score 2.239604
lightgbm accuracy: 0.91


Parameters: { "use_label_encoder" } are not used.



xgboost accuracy: 0.91
Best model: catboost


In [169]:
params = optimize_model_with_optuna(X_train_selected, y_train, model_name='catboost')
print(params)


[I 2024-12-25 06:27:20,924] A new study created in memory with name: no-name-7b253fab-bcf0-4435-bd7e-8208b96a9b16
[I 2024-12-25 06:27:25,264] Trial 0 finished with value: 0.9075028394957432 and parameters: {'learning_rate': 0.00265743718170487, 'depth': 6, 'l2_leaf_reg': 9.206870453952455}. Best is trial 0 with value: 0.9075028394957432.
[I 2024-12-25 06:27:29,243] Trial 1 finished with value: 0.8937606187754029 and parameters: {'learning_rate': 0.010712322058312247, 'depth': 6, 'l2_leaf_reg': 6.137794923428995}. Best is trial 0 with value: 0.9075028394957432.
[I 2024-12-25 06:27:38,174] Trial 2 finished with value: 0.8975012437460693 and parameters: {'learning_rate': 0.007071389361918535, 'depth': 7, 'l2_leaf_reg': 3.8203974758890684}. Best is trial 0 with value: 0.9075028394957432.
[I 2024-12-25 06:27:45,697] Trial 3 finished with value: 0.8875090347591826 and parameters: {'learning_rate': 0.01592266531881893, 'depth': 8, 'l2_leaf_reg': 4.4795948256615485}. Best is trial 0 with value

{'learning_rate': 0.0028027559348039266, 'depth': 6, 'l2_leaf_reg': 9.853893935467484}


In [170]:
models = [("catboost", CatBoostClassifier(**params, verbose=0))]
blender = BlendingClassifier(models)
blender.fit(X_train_selected, y_train)
results = evaluate_model(blender, X_train_selected, y_train, X_test_selected, y_test)


Accuracy: 0.91
