In [1]:
import numpy as np
import pandas as pd

from dataclasses import dataclass
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score

from catboost import CatBoostClassifier, Pool

RANDOM_STATE = 30
TRAIN_PATH = 'data/train.csv'
TEST_PATH = 'data/test_X.csv'

pd.set_option('display.max_colwidth', 200)



In [2]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [56]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138039 entries, 0 to 138038
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    138039 non-null  int64  
 1   is_fake               138039 non-null  int64  
 2   brand                 79313 non-null   object 
 3   description           121914 non-null  object 
 4   title_name            138039 non-null  object 
 5   category              138039 non-null  object 
 6   rating_1_count        35377 non-null   float64
 7   rating_2_count        35377 non-null   float64
 8   rating_3_count        35377 non-null   float64
 9   rating_4_count        35377 non-null   float64
 10  rating_5_count        35377 non-null   float64
 11  comments_count        35377 non-null   float64
 12  photos_count          35377 non-null   float64
 13  videos_count          35377 non-null   float64
 14  price                 138039 non-null  float64
 15  

In [57]:
test_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59159 entries, 0 to 59158
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    59159 non-null  int64  
 1   brand                 37354 non-null  object 
 2   description           49224 non-null  object 
 3   title_name            59159 non-null  object 
 4   category              59159 non-null  object 
 5   rating_1_count        11816 non-null  float64
 6   rating_2_count        11816 non-null  float64
 7   rating_3_count        11816 non-null  float64
 8   rating_4_count        11816 non-null  float64
 9   rating_5_count        11816 non-null  float64
 10  comments_count        11816 non-null  float64
 11  photos_count          11816 non-null  float64
 12  videos_count          11816 non-null  float64
 13  price                 59159 non-null  float64
 14  item_time_alive       59159 non-null  int64  
 15  item_count_sales7  

In [None]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

for c in ['description','brand']:
    if c in train_df.columns:
        train_df[c] = train_df[c].fillna('')
    if c in test_df.columns:
        test_df[c] = test_df[c].fillna('')

if 'seller_id' in train_df.columns:
    train_df['seller_id'] = train_df['seller_id'].astype(str)
if 'seller_id' in test_df.columns:
    test_df['seller_id'] = test_df['seller_id'].astype(str)

y = train_df['is_fake'].astype(int)


print('train:', train_df.shape, 'test:', test_df.shape)


train: (138039, 26) test: (59159, 25)


In [None]:
@dataclass
class OOFEnc:
    oof_mean: np.ndarray
    test_mean: np.ndarray
    oof_cnt: np.ndarray
    test_cnt: np.ndarray
    prior: float


def oof_target_mean_count(train_key: pd.Series, y: pd.Series, test_key: pd.Series, *, n_splits: int = 5, smooth: float = 50.0) -> OOFEnc:
    yv = y.astype(float).values
    prior = float(np.mean(yv))

    trk = train_key.astype(str).reset_index(drop=True)
    tek = test_key.astype(str).reset_index(drop=True)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    oof_mean = np.full(len(trk), prior, dtype=float)
    oof_cnt = np.zeros(len(trk), dtype=float)

    for tr_idx, va_idx in skf.split(trk, yv):
        agg = pd.DataFrame({'k': trk.iloc[tr_idx].values, 'y': yv[tr_idx]})
        g = agg.groupby('k')['y'].agg(['sum','count'])
        enc = (g['sum'] + prior * smooth) / (g['count'] + smooth)

        kva = trk.iloc[va_idx]
        oof_mean[va_idx] = kva.map(enc).fillna(prior).values
        oof_cnt[va_idx] = kva.map(g['count']).fillna(0).values

    # Full-train mapping for test
    agg_full = pd.DataFrame({'k': trk.values, 'y': yv})
    g_full = agg_full.groupby('k')['y'].agg(['sum','count'])
    enc_full = (g_full['sum'] + prior * smooth) / (g_full['count'] + smooth)

    test_mean = tek.map(enc_full).fillna(prior).values
    test_cnt = tek.map(g_full['count']).fillna(0).values

    return OOFEnc(oof_mean=oof_mean, test_mean=test_mean, oof_cnt=oof_cnt, test_cnt=test_cnt, prior=prior)


In [None]:
def build_features(train_df, test_df, y):
    X_tr = pd.DataFrame(index=train_df.index)
    X_te = pd.DataFrame(index=test_df.index)

    num_cols = [c for c in train_df.columns if c not in ['id','is_fake'] and pd.api.types.is_numeric_dtype(train_df[c])]
    for c in num_cols:
        med = train_df[c].median()
        X_tr[c] = train_df[c].fillna(med)
        X_te[c] = test_df[c].fillna(med)

    te_cat = oof_target_mean_count(train_df['category'], y, test_df['category'], smooth=50.0)
    te_brand = oof_target_mean_count(train_df['brand'], y, test_df['brand'], smooth=50.0)

    X_tr['te_cat'] = te_cat.oof_mean
    X_te['te_cat'] = te_cat.test_mean

    X_tr['te_brand'] = te_brand.oof_mean
    X_te['te_brand'] = te_brand.test_mean

    sid_tr = train_df['seller_id'].astype(str)
    sid_te = test_df['seller_id'].astype(str)
    X_tr['cat_seller_mix_mean'] = pd.Series(te_cat.oof_mean, index=train_df.index).groupby(sid_tr).transform('mean').values
    X_te['cat_seller_mix_mean'] = pd.Series(te_cat.test_mean, index=test_df.index).groupby(sid_te).transform('mean').values
    X_tr['brand_seller_mix_mean'] = pd.Series(te_brand.oof_mean, index=train_df.index).groupby(sid_tr).transform('mean').values
    X_te['brand_seller_mix_mean'] = pd.Series(te_brand.test_mean, index=test_df.index).groupby(sid_te).transform('mean').values

    tr_cnt = train_df['seller_id'].value_counts()
    te_cnt = test_df['seller_id'].value_counts()
    X_tr['seller_cnt'] = train_df['seller_id'].map(tr_cnt).fillna(0).astype(int)
    X_te['seller_cnt'] = test_df['seller_id'].map(te_cnt).fillna(0).astype(int)

    X_tr['seller_id'] = train_df['seller_id'].astype(str)
    X_te['seller_id'] = test_df['seller_id'].astype(str)

    X_tr['desc_len'] = train_df['description'].fillna('').astype(str).str.len().values
    X_te['desc_len'] = test_df['description'].fillna('').astype(str).str.len().values
    X_tr['title_len'] = train_df['title_name'].fillna('').astype(str).str.len().values
    X_te['title_len'] = test_df['title_name'].fillna('').astype(str).str.len().values

    X_tr['log_item_time_alive'] = np.log1p(train_df['item_time_alive'].fillna(0)).values
    X_te['log_item_time_alive'] = np.log1p(test_df['item_time_alive'].fillna(0)).values

    X_tr['log_price'] = np.log1p(train_df['price'].fillna(0)).values
    X_te['log_price'] = np.log1p(test_df['price'].fillna(0)).values

    tr_cat_cnt = train_df['category'].value_counts()
    te_cat_cnt = test_df['category'].value_counts()
    X_tr['logcnt_cat'] = np.log1p(train_df['category'].map(tr_cat_cnt).fillna(0)).values
    X_te['logcnt_cat'] = np.log1p(test_df['category'].map(te_cat_cnt).fillna(0)).values
    
    tr_brand_cnt = train_df['brand'].value_counts()
    te_brand_cnt = test_df['brand'].value_counts()
    X_tr['logcnt_brand'] = np.log1p(train_df['brand'].map(tr_brand_cnt).fillna(0)).values
    X_te['logcnt_brand'] = np.log1p(test_df['brand'].map(te_brand_cnt).fillna(0)).values

    cat_cols = [c for c in ['category', 'brand', 'seller_id'] if c in X_tr.columns]
    cat_features = [X_tr.columns.get_loc(c) for c in cat_cols]

    return X_tr, X_te, cat_features, cat_cols


X_tr, X_te, cat_features, cat_cols = build_features(train_df, test_df, y)
print('X_tr:', X_tr.shape, 'X_te:', X_te.shape)
print('cat_cols:', cat_cols)
print('n_features:', X_tr.shape[1])



X_tr: (138039, 31) X_te: (59159, 31)
cat_cols: ['seller_id']
n_features: 31


In [None]:
CB_PARAMS = dict(
    iterations=2400,
    learning_rate=0.03,
    depth=9,
    l2_leaf_reg=1,
    random_strength=1.1,
    bootstrap_type='Bernoulli',
    subsample=0.8,
    loss_function='Logloss',
    eval_metric='PRAUC',
    random_seed=RANDOM_STATE,
    verbose=200,
    max_ctr_complexity=1,
) #0.66

In [66]:
CB_PARAMS = dict(
    iterations=2000,
    learning_rate=0.1,
    depth=7,
    l2_leaf_reg=40,
    random_strength=0.5,
    bootstrap_type='Bayesian',
    # subsample=0.6,
    loss_function='Logloss',
    eval_metric='PRAUC',
    random_seed=RANDOM_STATE,
    verbose=200,
    max_ctr_complexity=1,
) #0.66

In [67]:
train_pool_full = Pool(X_tr, y, cat_features=cat_features)

model = CatBoostClassifier(**CB_PARAMS, task_type='GPU')
model.fit(train_pool_full)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7408249	total: 19.9ms	remaining: 39.8s
200:	learn: 0.8910031	total: 2.78s	remaining: 24.9s
400:	learn: 0.9114979	total: 5.54s	remaining: 22.1s
600:	learn: 0.9264203	total: 8.32s	remaining: 19.4s
800:	learn: 0.9381222	total: 11.1s	remaining: 16.6s
1000:	learn: 0.9467381	total: 13.8s	remaining: 13.8s
1200:	learn: 0.9536738	total: 16.5s	remaining: 11s
1400:	learn: 0.9592942	total: 19.3s	remaining: 8.23s
1600:	learn: 0.9642819	total: 22s	remaining: 5.48s
1800:	learn: 0.9685636	total: 24.7s	remaining: 2.73s
1999:	learn: 0.9721054	total: 27.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2ce668eb460>

In [None]:

ensemble_variants = [
    ('seed0', {'random_seed': RANDOM_STATE}),
    ('seed1', {'random_seed': RANDOM_STATE + 1}),
    ('seed2', {'random_seed': RANDOM_STATE + 2}),
]

ens_preds = []
for name, overrides in ensemble_variants:
    params = dict(CB_PARAMS)
    params.update(overrides)


    model_ens = CatBoostClassifier(**params, task_type='GPU')
    model_ens.fit(train_pool_full)


    pred = model_ens.predict_proba(X_te)[:, 1]
    ens_preds.append(pred)

    out = f"data/sub_boost_ens_{name}.csv"
    pd.DataFrame({'id': test_df['id'].values, 'is_fake': pred}).to_csv(out, index=False)
    print('saved', out)

ens_preds = np.vstack(ens_preds)
rank_pred = np.vstack([pd.Series(p).rank(pct=True).values for p in ens_preds]).mean(axis=0)

out_rank = 'data/sub_ens.csv'

pd.DataFrame({'id': test_df['id'].values, 'is_fake': rank_pred}).to_csv(out_rank, index=False)

print('saved', out_rank)



Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.7408249	total: 20.1ms	remaining: 40.1s


KeyboardInterrupt: 

In [36]:
importances = model.get_feature_importance()
imp_series = pd.Series(importances, index=X_tr.columns).sort_values(ascending=False)
imp_series

te_cat                   10.595437
desc_len                  7.696589
title_len                 7.546060
te_brand                  7.406527
item_time_alive           6.409198
logcnt_cat                5.838128
seller_time_alive         5.815858
brand_seller_mix_mean     5.714207
seller_cnt                5.515500
cat_seller_mix_mean       5.230381
price                     4.988133
logcnt_brand              4.800475
seller_id                 4.609796
log_price                 3.451147
item_variety_count        3.405865
item_available_count      2.766540
log_item_time_alive       2.727550
rating_5_count            1.208841
item_count_sales90        0.994706
comments_count            0.937803
item_count_sales30        0.475404
photos_count              0.462278
rating_1_count            0.460172
rating_4_count            0.228004
item_count_sales7         0.172100
rating_3_count            0.142778
item_count_returns90      0.141222
rating_2_count            0.127105
videos_count        

In [68]:
pred_test = model.predict_proba(X_te)[:, 1]
sub = pd.DataFrame({'id': test_df['id'].values, 'is_fake': pred_test})
out = 'data/sub_c1.csv'
sub.to_csv(out, index=False)
print('saved', out)

saved data/sub_c1.csv
