In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklift.datasets import fetch_x5
from feature_extraction import UpliftFeatureExtractor

In [2]:
# Загружаем данные
dataset = fetch_x5()
data = dataset.data

In [3]:
# Считаем фичи
extractor = UpliftFeatureExtractor(drop_redundant=True)
df = extractor.calculate_features(
    clients_df=data.clients,
    train_df=data.train,
    treatment_df=dataset.treatment,
    target_df=dataset.target,
    purchases_df=data.purchases
)

features = extractor.feature_names

print(f"Создано признаков: {len(features)}")
print(f"Размер датафрейма: {df.shape}")
print(f"Признаки: {features}")

Создано признаков: 32
Размер датафрейма: (200039, 34)
Признаки: ['first_issue_month', 'first_issue_weekday', 'first_issue_year_quarter_idx', 'total_transactions', 'avg_transaction_amount', 'max_transaction_amount', 'min_transaction_amount', 'total_express_points_received', 'total_express_points_spent', 'avg_express_points_per_transaction', 'points_earned_to_spent_ratio', 'unique_products_count', 'avg_product_quantity', 'transaction_period_days', 'first_transaction_quarter', 'first_transaction_year_quarter_idx', 'unique_stores_visited', 'store_loyalty_ratio', 'avg_purchase_per_day', 'transactions_per_month', 'points_spend_ratio', 'points_balance_ratio', 'unique_store_intensity', 'log_total_purchase_sum', 'seasonal_quarter_code', 'avg_items_per_transaction', 'spend_points_per_transaction', 'transaction_value_density', 'is_super_loyal', 'age', 'gender', 'is_activated']


In [4]:
df.isna().sum()

first_issue_month                     0
first_issue_weekday                   0
first_issue_year_quarter_idx          0
total_transactions                    0
avg_transaction_amount                0
max_transaction_amount                0
min_transaction_amount                0
total_express_points_received         0
total_express_points_spent            0
avg_express_points_per_transaction    0
points_earned_to_spent_ratio          0
unique_products_count                 0
avg_product_quantity                  0
transaction_period_days               0
first_transaction_quarter             0
first_transaction_year_quarter_idx    0
unique_stores_visited                 0
store_loyalty_ratio                   0
avg_purchase_per_day                  0
transactions_per_month                0
points_spend_ratio                    0
points_balance_ratio                  0
unique_store_intensity                0
log_total_purchase_sum                0
seasonal_quarter_code                 0


# Обучение

In [13]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [14]:
X = df[features]
y = df["target"]

# Определяем категориальные признаки
categorical_features = []
for col in X.columns:
    if X[col].dtype.name == "category" or (X[col].dtype == "object" and X[col].nunique() < 100):
        categorical_features.append(col)
cat_features_indices = [i for i, col in enumerate(X.columns) if col in categorical_features]

In [15]:
# Разделяем на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=67, stratify=y)

In [17]:
# Задаем catboost
model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    eval_metric="AUC",
    verbose=100,
    use_best_model=True
)

# Обучаем catboost
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_features_indices
)

Learning rate set to 0.111006
0:	test: 0.7403200	best: 0.7403200 (0)	total: 92.6ms	remaining: 1m 32s
100:	test: 0.7672661	best: 0.7672661 (100)	total: 10.1s	remaining: 1m 29s
200:	test: 0.7675510	best: 0.7675673 (192)	total: 19.8s	remaining: 1m 18s
300:	test: 0.7669644	best: 0.7675673 (192)	total: 30s	remaining: 1m 9s
400:	test: 0.7668621	best: 0.7675673 (192)	total: 40.2s	remaining: 1m
500:	test: 0.7666343	best: 0.7675673 (192)	total: 50s	remaining: 49.8s
600:	test: 0.7660830	best: 0.7675673 (192)	total: 1m	remaining: 39.9s
700:	test: 0.7656198	best: 0.7675673 (192)	total: 1m 9s	remaining: 29.8s
800:	test: 0.7653105	best: 0.7675673 (192)	total: 1m 19s	remaining: 19.7s
900:	test: 0.7650661	best: 0.7675673 (192)	total: 1m 29s	remaining: 9.79s
999:	test: 0.7647625	best: 0.7675673 (192)	total: 1m 38s	remaining: 0us

bestTest = 0.7675673226
bestIteration = 192

Shrink model to first 193 iterations.


<catboost.core.CatBoostClassifier at 0x1d0e4a12dd0>

In [21]:
pd.DataFrame({
    'feature': features,
    'importance': model.get_feature_importance()
}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
3,total_transactions,25.225495
19,transactions_per_month,17.568179
13,transaction_period_days,15.627092
2,first_issue_year_quarter_idx,3.650541
29,age,3.082079
31,is_activated,3.025909
22,unique_store_intensity,3.01684
5,max_transaction_amount,2.66603
6,min_transaction_amount,2.23267
0,first_issue_month,2.19382
