In [1]:
!pip -q install catboost iterative-stratification

import gc
import ast
import json
import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import VotingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_sample_weight

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data preparation

In [2]:
# Marker expected at the start of every `text` value; it is removed so that the
# text features are computed on the description itself.
# NOTE(review): presumably the instruction prefix used when the embeddings were
# produced — confirm against the dataset description.
TEXT_PROMPT_PREFIX = 'categorize_topic: '


def strip_prompt_prefix(value, prefix=TEXT_PROMPT_PREFIX):
    """Return ``value`` without its leading ``prefix``.

    Parameters
    ----------
    value : object
        A cell of the ``text`` column. Usually a string, but may be NaN/None
        when the text is missing.
    prefix : str
        Marker to remove from the start of the string.

    Returns
    -------
    object
        The string with the prefix removed when it is present; otherwise the
        value unchanged. Non-string values are never sliced, and strings that
        do not start with the prefix keep all of their characters.
    """
    if isinstance(value, str) and value.startswith(prefix):
        return value[len(prefix):]
    return value


train_df = pd.read_csv('/kaggle/input/vseros-1e/train_frida_emb.csv')
test_df = pd.read_csv('/kaggle/input/vseros-1e/test_frida_emb.csv')
train_df['text'] = train_df['text'].apply(strip_prompt_prefix)
test_df['text'] = test_df['text'].apply(strip_prompt_prefix)
print(train_df.shape)
print(test_df.shape)
train_df.head()

(53494, 8)
(15046, 5)


Unnamed: 0,app_name,full_description,shortDescription,labels_str,labels_list,n_labels,text,embedding
0,–ú–∞—Ç—Ä–æ–Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–∞—è.,–•–æ—á–µ—à—å –∑–Ω–∞—Ç—å —á—Ç–æ –±—É–¥–µ—Ç? –ó–∞–≥–ª—è–Ω–∏ –≤ –±—É–¥—É—â–∏–µ. –ú–∞—Ç...,–£–∑–Ω–∞—Ç—å –±—É–¥—É—â–µ–µ. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è . –ú–∞–≥–∏—è –≤–æ–ª—à–µ–±–Ω–æ–≥...,lifestyle,['lifestyle'],1,–ú–∞—Ç—Ä–æ–Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–∞—è. –£–∑–Ω–∞—Ç—å –±—É–¥—É—â–µ–µ. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω...,"[-0.02546943910419941, -0.02646569162607193, -..."
1,"Run and Jump - ""–ë–µ—Å–∫–æ–Ω–µ—á–Ω—ã–π —Ä–∞–Ω–Ω–µ—Ä""",Run and Jump —ç—Ç–æ –Ω–æ–≤–∞—è —É–≤–ª–µ–∫–∞—Ç–µ–ª—å–Ω–∞—è –ê—Ä–∫–∞–¥–∞ - ...,–ü–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω–Ω–∞—è –ê—Ä–∫–∞–¥–∞ - –†–∞–Ω–Ω–µ—Ä —Å –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–º ...,action|arcade,"['action', 'arcade']",2,"Run and Jump - ""–ë–µ—Å–∫–æ–Ω–µ—á–Ω—ã–π —Ä–∞–Ω–Ω–µ—Ä"" –ü–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω...","[-0.03978952392935753, -0.004444682039320469, ..."
2,Ghost Maze,"–ò–≥—Ä–∞—è –∑–∞ –ø—Ä–∏–≤–µ–¥–µ–Ω–∏–µ, —Å–æ–±–∏—Ä–∞–π –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ –∫–æ–º–±–∏...",–ò–≥—Ä–∞-–≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∞. –ü–æ–ø—Ä–æ–±—É–π –Ω–∞–π—Ç–∏ –≤—ã—Ö–æ–¥ –∏–∑ –ª–∞–±–∏...,arcade|puzzle,"['arcade', 'puzzle']",2,Ghost Maze –ò–≥—Ä–∞-–≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∞. –ü–æ–ø—Ä–æ–±—É–π –Ω–∞–π—Ç–∏ –≤—ã...,"[-0.04193723574280739, -0.0033695693127810955,..."
3,LabTools Mobile: –õ–ê–ë–û–†–ê–¢–û–†–ù–´–ï –ü–†–ò–ë–û–†–´,AR-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ ¬´LabTools Mobile: –õ–∞–±–æ—Ä–∞—Ç–æ—Ä–Ω—ã–µ –ø...,AR-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –∫–æ—Ç–æ—Ä–æ–µ –ø—Ä–µ–¥–Ω–∞–∑–Ω–∞—á–µ–Ω–æ –¥–ª—è –∏–∑—É—á–µ–Ω...,education,['education'],1,LabTools Mobile: –õ–ê–ë–û–†–ê–¢–û–†–ù–´–ï –ü–†–ò–ë–û–†–´ AR-–ø—Ä–∏–ª–æ...,"[-0.044376254081726074, -0.028724001720547676,..."
4,Mario Anime Coloring,\n–†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –∏–∑ mario...,–†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –∏–∑ mario —è...,children|family,"['children', 'family']",2,Mario Anime Coloring –†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö ...,"[-0.03006214275956154, -0.011583167128264904, ..."


In [3]:
def safe_json_loads(x):
    """Decode a JSON string, passing already-decoded containers through.

    Parameters
    ----------
    x : str | bytes | list | tuple | dict
        Either JSON text (e.g. the serialized embedding read from the CSV) or
        a value that has already been decoded.

    Returns
    -------
    object
        The decoded Python object. Lists, tuples and dicts are returned as-is,
        which makes the cell that applies this function safe to re-run.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is text that is not valid JSON.
    TypeError
        If ``x`` is neither JSON text nor an already-decoded container.
    """
    if isinstance(x, (list, tuple, dict)):
        return x
    return json.loads(x)

# Turn the serialized `embedding` column of both frames into Python lists.
for frame in (train_df, test_df):
    frame['embedding'] = frame['embedding'].apply(safe_json_loads)
gc.collect()

0

In [4]:
def _parse_label_list(raw):
    """Convert the textual form of a label list into a real list.

    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` it
    cannot execute arbitrary code found in the CSV. Values that are not
    strings (already parsed on a previous run of this cell) are returned
    unchanged.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw


train_df['labels_list'] = train_df['labels_list'].apply(_parse_label_list)
gc.collect()

0

### TF-IDF features

In [5]:
import re

# Russian stop-word list from the NLTK `stopwords` corpus, copied into a plain list.
russian_stopwords = list(stopwords.words('russian'))

# Runs of Latin letters, Cyrillic letters (including "ё"/"Ё", which lie outside
# the а-я / А-Я ranges) and digits.
_TOKEN_RE = re.compile(r'[a-zA-Zа-яА-ЯёЁ0-9]+')


def russian_tokenizer(text):
    """Split ``text`` into lower-cased alphanumeric tokens.

    Parameters
    ----------
    text : object
        Value to tokenize; it is converted with ``str()`` first, so non-string
        input does not raise.

    Returns
    -------
    list[str]
        Tokens made of Latin/Cyrillic letters and digits, in order of
        appearance. Single-character tokens are dropped.
    """
    tokens = _TOKEN_RE.findall(str(text).lower())
    return [token for token in tokens if len(token) > 1]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings gathered in one mapping so they are easy to scan and tune.
tfidf_params = dict(
    tokenizer=russian_tokenizer,    # custom tokenizer
    token_pattern=None,             # disable the built-in pattern, the tokenizer above is used instead
    stop_words=russian_stopwords,   # Russian stop words
    lowercase=True,
    strip_accents=None,             # leave Cyrillic characters untouched
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.8,
    max_features=200,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned on the training texts only and
# then applied to the test texts; missing texts are treated as empty strings.
train_texts = train_df['text'].fillna('')
test_texts = test_df['text'].fillna('')
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

In [7]:
# Positional column names (tfidf_0, tfidf_1, ...), one per TF-IDF feature.
n_tfidf_features = X_train_tfidf.shape[1]
tfidf_feature_names = ['tfidf_{}'.format(i) for i in range(n_tfidf_features)]

# Densify the sparse matrices so they can be stored as ordinary columns.
train_tfidf_dense, test_tfidf_dense = X_train_tfidf.toarray(), X_test_tfidf.toarray()

# Wrap the dense arrays in frames that share the index of their source frame.
train_tfidf_df = pd.DataFrame(train_tfidf_dense, index=train_df.index, columns=tfidf_feature_names)
test_tfidf_df = pd.DataFrame(test_tfidf_dense, index=test_df.index, columns=tfidf_feature_names)

print(train_df.shape, test_df.shape)
# Append the TF-IDF columns to the right of the original columns.
train_df = pd.concat((train_df, train_tfidf_df), axis='columns')
test_df = pd.concat((test_df, test_tfidf_df), axis='columns')
print(train_df.shape, test_df.shape)

(53494, 8) (15046, 5)
(53494, 208) (15046, 205)


In [8]:
def add_text_feats(df, text_col):
    """Add simple surface statistics of a text column to ``df`` in place.

    Missing values are treated as empty strings. The following columns are
    written, each prefixed with ``text_col``: ``_len`` (characters),
    ``_words`` (whitespace-separated words), ``_uniq_words`` (distinct words),
    ``_digits``, ``_punc`` (characters that are neither word characters nor
    whitespace), ``_upper`` (upper-case characters), ``_avgw`` (characters per
    word) and ``_upr`` (share of upper-case characters). Ratios use a
    denominator of 1 when the count is 0.

    Returns the same ``df`` object that was passed in.
    """
    text = df[text_col].fillna('').astype(str)

    n_chars = text.str.len()
    n_words = text.map(lambda t: len(t.split()))
    n_unique = text.map(lambda t: len(set(t.split())))
    n_upper = text.map(lambda t: sum(1 for ch in t if ch.isupper()))

    # Columns are assigned in a fixed order so the resulting layout is stable.
    df[f'{text_col}_len'] = n_chars
    df[f'{text_col}_words'] = n_words
    df[f'{text_col}_uniq_words'] = n_unique
    df[f'{text_col}_digits'] = text.str.count(r'\d')
    df[f'{text_col}_punc'] = text.str.count(r'[^\w\s]')
    df[f'{text_col}_upper'] = n_upper

    # Replace zero denominators with 1 so empty texts yield 0.0 instead of NaN/inf.
    safe_words = df[f'{text_col}_words'].replace(0, 1)
    safe_chars = df[f'{text_col}_len'].replace(0, 1)
    df[f'{text_col}_avgw'] = (df[f'{text_col}_len'] / safe_words).astype(float)
    df[f'{text_col}_upr'] = (df[f'{text_col}_upper'] / safe_chars).astype(float)
    return df

def add_emb_stats(df, emb_col):
    """Add per-row summary statistics of an embedding column to ``df`` in place.

    Each cell of ``emb_col`` that is a list is converted to a float32 vector;
    any other value is treated as an empty vector. The columns ``emb_dim``
    (vector length), ``emb_min``, ``emb_max``, ``emb_mean``, ``emb_std`` and
    ``emb_norm`` (Euclidean norm) are written; empty vectors get 0.0 for every
    statistic.

    Returns the same ``df`` object that was passed in.
    """
    def _as_vector(value):
        # Only real lists are converted; everything else becomes an empty vector.
        if isinstance(value, list):
            return np.array(value, dtype=np.float32)
        return np.array([], dtype=np.float32)

    vectors = df[emb_col].apply(_as_vector)
    df['emb_dim'] = vectors.apply(len)

    # Column name -> reduction applied to each non-empty vector.
    reducers = {
        'emb_min': lambda vec: vec.min(),
        'emb_max': lambda vec: vec.max(),
        'emb_mean': lambda vec: vec.mean(),
        'emb_std': lambda vec: vec.std(),
        'emb_norm': lambda vec: np.linalg.norm(vec),
    }
    for column, reduce_fn in reducers.items():
        df[column] = vectors.apply(
            lambda vec, reduce_fn=reduce_fn: float(reduce_fn(vec)) if vec.size else 0.0
        )
    return df

# Text statistics first, then embedding statistics, for each frame in turn.
train_df = add_emb_stats(add_text_feats(train_df, 'text'), 'embedding')
test_df = add_emb_stats(add_text_feats(test_df, 'text'), 'embedding')
print(train_df.shape)
train_df.head()

(53494, 222)


Unnamed: 0,app_name,full_description,shortDescription,labels_str,labels_list,n_labels,text,embedding,tfidf_0,tfidf_1,...,text_punc,text_upper,text_avgw,text_upr,emb_dim,emb_min,emb_max,emb_mean,emb_std,emb_norm
0,–ú–∞—Ç—Ä–æ–Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–∞—è.,–•–æ—á–µ—à—å –∑–Ω–∞—Ç—å —á—Ç–æ –±—É–¥–µ—Ç? –ó–∞–≥–ª—è–Ω–∏ –≤ –±—É–¥—É—â–∏–µ. –ú–∞—Ç...,–£–∑–Ω–∞—Ç—å –±—É–¥—É—â–µ–µ. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è . –ú–∞–≥–∏—è –≤–æ–ª—à–µ–±–Ω–æ–≥...,lifestyle,[lifestyle],1,–ú–∞—Ç—Ä–æ–Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–∞—è. –£–∑–Ω–∞—Ç—å –±—É–¥—É—â–µ–µ. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω...,"[-0.02546943910419941, -0.02646569162607193, -...",0.0,0.0,...,18,17,6.819149,0.026521,1536,-0.070459,0.06482,-0.000436,0.025512,1.0
1,"Run and Jump - ""–ë–µ—Å–∫–æ–Ω–µ—á–Ω—ã–π —Ä–∞–Ω–Ω–µ—Ä""",Run and Jump —ç—Ç–æ –Ω–æ–≤–∞—è —É–≤–ª–µ–∫–∞—Ç–µ–ª—å–Ω–∞—è –ê—Ä–∫–∞–¥–∞ - ...,–ü–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω–Ω–∞—è –ê—Ä–∫–∞–¥–∞ - –†–∞–Ω–Ω–µ—Ä —Å –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–º ...,action|arcade,"[action, arcade]",2,"Run and Jump - ""–ë–µ—Å–∫–æ–Ω–µ—á–Ω—ã–π —Ä–∞–Ω–Ω–µ—Ä"" –ü–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω...","[-0.03978952392935753, -0.004444682039320469, ...",0.0,0.0,...,29,33,6.976744,0.036667,1536,-0.072544,0.064759,-0.000229,0.025514,1.0
2,Ghost Maze,"–ò–≥—Ä–∞—è –∑–∞ –ø—Ä–∏–≤–µ–¥–µ–Ω–∏–µ, —Å–æ–±–∏—Ä–∞–π –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ –∫–æ–º–±–∏...",–ò–≥—Ä–∞-–≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∞. –ü–æ–ø—Ä–æ–±—É–π –Ω–∞–π—Ç–∏ –≤—ã—Ö–æ–¥ –∏–∑ –ª–∞–±–∏...,arcade|puzzle,"[arcade, puzzle]",2,Ghost Maze –ò–≥—Ä–∞-–≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∞. –ü–æ–ø—Ä–æ–±—É–π –Ω–∞–π—Ç–∏ –≤—ã...,"[-0.04193723574280739, -0.0033695693127810955,...",0.0,0.0,...,8,6,7.793103,0.026549,1536,-0.067927,0.062633,-0.000385,0.025513,1.0
3,LabTools Mobile: –õ–ê–ë–û–†–ê–¢–û–†–ù–´–ï –ü–†–ò–ë–û–†–´,AR-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ ¬´LabTools Mobile: –õ–∞–±–æ—Ä–∞—Ç–æ—Ä–Ω—ã–µ –ø...,AR-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –∫–æ—Ç–æ—Ä–æ–µ –ø—Ä–µ–¥–Ω–∞–∑–Ω–∞—á–µ–Ω–æ –¥–ª—è –∏–∑—É—á–µ–Ω...,education,[education],1,LabTools Mobile: –õ–ê–ë–û–†–ê–¢–û–†–ù–´–ï –ü–†–ò–ë–û–†–´ AR-–ø—Ä–∏–ª–æ...,"[-0.044376254081726074, -0.028724001720547676,...",0.0,0.172609,...,182,131,7.384755,0.032195,1536,-0.068226,0.060918,0.000156,0.025515,1.0
4,Mario Anime Coloring,\n–†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –∏–∑ mario...,–†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –∏–∑ mario —è...,children|family,"[children, family]",2,Mario Anime Coloring –†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö ...,"[-0.03006214275956154, -0.011583167128264904, ...",0.0,0.0,...,61,37,7.191011,0.028906,1536,-0.069395,0.065026,4.7e-05,0.025515,1.0


In [9]:
def expand_embedding(df, emb_col, prefix='emb'):
    """Replace a column of embedding vectors with one float32 column per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``emb_col`` cells hold equal-length numeric sequences.
    emb_col : str
        Name of the column to expand; it is absent from the result.
    prefix : str
        The new columns are named ``{prefix}_0``, ``{prefix}_1``, ...

    Returns
    -------
    (pandas.DataFrame, list[str])
        A new frame (the input is not modified) with the expanded columns
        appended on the right, and the list of the new column names.
    """
    matrix = np.vstack([np.array(vec, dtype=np.float32) for vec in df[emb_col]])
    n_dims = matrix.shape[1]
    emb_cols = ['{}_{}'.format(prefix, j) for j in range(n_dims)]

    # Reuse the source index so the concat below aligns row by row.
    wide = pd.DataFrame(matrix, index=df.index, columns=emb_cols)
    remainder = df.drop(columns=[emb_col])
    return pd.concat([remainder, wide], axis=1), emb_cols

# Expand the `embedding` column of both frames into per-dimension columns.
# `emb_cols` is assigned twice; after this cell it holds the names produced
# for the test frame.
train_df, emb_cols = expand_embedding(train_df, 'embedding')
test_df, emb_cols = expand_embedding(test_df, 'embedding')
print(train_df.shape)
train_df.head()

(53494, 1757)


Unnamed: 0,app_name,full_description,shortDescription,labels_str,labels_list,n_labels,text,tfidf_0,tfidf_1,tfidf_2,...,emb_1526,emb_1527,emb_1528,emb_1529,emb_1530,emb_1531,emb_1532,emb_1533,emb_1534,emb_1535
0,–ú–∞—Ç—Ä–æ–Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–∞—è.,–•–æ—á–µ—à—å –∑–Ω–∞—Ç—å —á—Ç–æ –±—É–¥–µ—Ç? –ó–∞–≥–ª—è–Ω–∏ –≤ –±—É–¥—É—â–∏–µ. –ú–∞—Ç...,–£–∑–Ω–∞—Ç—å –±—É–¥—É—â–µ–µ. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è . –ú–∞–≥–∏—è –≤–æ–ª—à–µ–±–Ω–æ–≥...,lifestyle,[lifestyle],1,–ú–∞—Ç—Ä–æ–Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–∞—è. –£–∑–Ω–∞—Ç—å –±—É–¥—É—â–µ–µ. –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω...,0.0,0.0,0.0,...,0.013285,-0.027001,-0.006622,-0.048583,-0.023744,0.014363,0.035482,-0.01602,0.008836,0.055232
1,"Run and Jump - ""–ë–µ—Å–∫–æ–Ω–µ—á–Ω—ã–π —Ä–∞–Ω–Ω–µ—Ä""",Run and Jump —ç—Ç–æ –Ω–æ–≤–∞—è —É–≤–ª–µ–∫–∞—Ç–µ–ª—å–Ω–∞—è –ê—Ä–∫–∞–¥–∞ - ...,–ü–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω–Ω–∞—è –ê—Ä–∫–∞–¥–∞ - –†–∞–Ω–Ω–µ—Ä —Å –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–º ...,action|arcade,"[action, arcade]",2,"Run and Jump - ""–ë–µ—Å–∫–æ–Ω–µ—á–Ω—ã–π —Ä–∞–Ω–Ω–µ—Ä"" –ü–ª–∞—Ç—Ñ–æ—Ä–º–µ–Ω...",0.0,0.0,0.671788,...,0.003523,-0.048084,-0.013755,-0.050502,-0.024837,0.019303,0.011996,-0.025982,0.033767,0.064759
2,Ghost Maze,"–ò–≥—Ä–∞—è –∑–∞ –ø—Ä–∏–≤–µ–¥–µ–Ω–∏–µ, —Å–æ–±–∏—Ä–∞–π –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ –∫–æ–º–±–∏...",–ò–≥—Ä–∞-–≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∞. –ü–æ–ø—Ä–æ–±—É–π –Ω–∞–π—Ç–∏ –≤—ã—Ö–æ–¥ –∏–∑ –ª–∞–±–∏...,arcade|puzzle,"[arcade, puzzle]",2,Ghost Maze –ò–≥—Ä–∞-–≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∞. –ü–æ–ø—Ä–æ–±—É–π –Ω–∞–π—Ç–∏ –≤—ã...,0.0,0.0,0.0,...,0.014284,-0.045735,-0.018553,-0.049476,-0.02534,0.018433,0.014861,-0.024793,0.032536,0.062633
3,LabTools Mobile: –õ–ê–ë–û–†–ê–¢–û–†–ù–´–ï –ü–†–ò–ë–û–†–´,AR-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ ¬´LabTools Mobile: –õ–∞–±–æ—Ä–∞—Ç–æ—Ä–Ω—ã–µ –ø...,AR-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –∫–æ—Ç–æ—Ä–æ–µ –ø—Ä–µ–¥–Ω–∞–∑–Ω–∞—á–µ–Ω–æ –¥–ª—è –∏–∑—É—á–µ–Ω...,education,[education],1,LabTools Mobile: –õ–ê–ë–û–†–ê–¢–û–†–ù–´–ï –ü–†–ò–ë–û–†–´ AR-–ø—Ä–∏–ª–æ...,0.0,0.172609,0.0,...,0.009105,-0.037666,0.009979,-0.053618,-0.005597,0.012486,0.016367,-0.01513,0.011178,0.056622
4,Mario Anime Coloring,\n–†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –∏–∑ mario...,–†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –∏–∑ mario —è...,children|family,"[children, family]",2,Mario Anime Coloring –†–∞—Å–∫—Ä–∞—Å—å—Ç–µ —Å–≤–æ–∏—Ö –ª—é–±–∏–º—ã—Ö ...,0.0,0.0,0.0,...,0.015005,-0.044766,-0.020036,-0.044337,-0.018741,0.01702,0.012762,-0.01667,0.029289,0.062918


In [10]:
# Labels that occur this many times or fewer are treated as too rare to model.
MIN_SAMPLES_PER_CLASS = 50

# Flatten every row's label list into one long list of label occurrences.
all_labels_flat = []
for labels in train_df['labels_list']:
    all_labels_flat.extend(labels)

label_counts = pd.Series(all_labels_flat).value_counts()
small_classes = {
    label for label, count in label_counts.items()
    if count <= MIN_SAMPLES_PER_CLASS
}

print(sorted(small_classes))

['ar', 'casino', 'gambling', 'games']


In [11]:
def has_at_least_one_valid_label(labels, small_classes):
    """Return True if any label in ``labels`` is not in ``small_classes``.

    An empty ``labels`` sequence yields False.
    """
    for label in labels:
        if label not in small_classes:
            return True
    return False

# Keep only rows that still have at least one label outside `small_classes`.
old_len = train_df.shape[0]
mask_keep = train_df['labels_list'].map(
    lambda labels: has_at_least_one_valid_label(labels, small_classes)
)
train_df = train_df.loc[mask_keep].reset_index(drop=True)

n_deleted = old_len - train_df.shape[0]
print(f'deleted {n_deleted} samples')

deleted 57 samples


In [12]:
def remove_small_labels(labels, small_classes):
    """Return a new list with the labels of ``labels`` that are not in ``small_classes``.

    The original order is preserved and the input is not modified.
    """
    kept = []
    for label in labels:
        if label in small_classes:
            continue
        kept.append(label)
    return kept

# Strip the rare labels from every row, then drop rows left with no labels.
pruned_labels = train_df['labels_list'].map(
    lambda labels: remove_small_labels(labels, small_classes)
)
train_df['labels_list'] = pruned_labels

has_labels = train_df['labels_list'].map(len) > 0
train_df = train_df.loc[has_labels].reset_index(drop=True)

In [13]:
# Encode the per-row label lists as a binary indicator matrix `Y`
# (one column per label). `classes` keeps the label names in the column order
# of `Y`; the per-class training loops later index `Y` columns by position in
# this list.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(train_df['labels_list'])
classes = list(mlb.classes_)
len(classes)

41

## Train / Val split

In [14]:
# Raw text and label columns that must not be fed to the models.
non_feature_cols = {
    'app_name', 'full_description', 'shortDescription',
    'labels_str', 'n_labels', 'labels_list', 'text',
}

text_cols = []
num_cols = [col for col in train_df.columns if col not in non_feature_cols]

feature_cols = num_cols + text_cols
X = train_df[feature_cols].copy()

In [15]:
# Single multilabel-stratified split: 8% of the rows go to validation.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.08, random_state=42)
train_idx, val_idx = next(msss.split(X, Y))

# Positional selection; indexes are reset so both parts start from 0.
X_train = X.iloc[train_idx].reset_index(drop=True)
X_val = X.iloc[val_idx].reset_index(drop=True)
Y_train = Y[train_idx]
Y_val = Y[val_idx]
print('Split:', X_train.shape, X_val.shape)

Split: (49130, 1750) (4307, 1750)


In [16]:
# Test features use the same column list (and order) as the training matrix X.
X_test = test_df[num_cols + text_cols].copy()

## Gradient boosting models

In [17]:
# Seed passed to the boosting models below. Note that the train/validation
# split above uses its own hard-coded random_state=42.
RANDOM_STATE = 52

In [18]:
def hitrate_at_k(y_true, y_proba, k=3):
    """Share of samples whose top-``k`` scored classes contain a true label.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_classes)
        Binary indicator matrix; a cell equal to 1 marks a true label.
        Nested lists are accepted as well as numpy arrays.
    y_proba : array-like of shape (n_samples, n_classes)
        Scores per class; higher means more likely.
    k : int
        Number of highest-scored classes considered per sample.

    Returns
    -------
    float
        ``hits / n_samples``. Samples without any true label can never be a
        hit but still count in the denominator. Returns 0.0 for empty input
        instead of dividing by zero.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    hits = 0
    for i in range(n_samples):
        # np.asarray makes the `== 1` comparison element-wise for list rows too.
        true_labels = set(np.where(np.asarray(y_true[i]) == 1)[0])
        if not true_labels:
            continue
        top_k_pred = set(np.argsort(y_proba[i])[::-1][:k])
        if true_labels & top_k_pred:
            hits += 1
    return hits / n_samples

In [19]:
def save_submission(proba_matrix, mlb, test_app_names, filename):
    """Write a tab-separated submission with the three best labels per row.

    Parameters
    ----------
    proba_matrix : numpy.ndarray of shape (n_samples, n_classes)
        Class scores; columns follow the order of ``mlb.classes_``.
    mlb : object
        Provides ``classes_``, an array of label names indexable by an array
        of column positions.
    test_app_names : sequence
        Values for the ``app_name`` column, one per row of ``proba_matrix``.
    filename : str
        Destination path of the TSV file.

    The ``labels_str`` column holds the three highest-scored labels joined
    with "|", best first. A confirmation message is printed after saving.
    """
    # Ascending argsort, reversed per row, then cut to the first three columns.
    ranked_desc = np.argsort(proba_matrix, axis=1)[:, ::-1]
    top3_indices = ranked_desc[:, :3]

    # Map column positions to label names and join them with "|".
    predictions = []
    for row_indices in top3_indices:
        predictions.append('|'.join(mlb.classes_[row_indices]))

    submission = pd.DataFrame({'app_name': test_app_names, 'labels_str': predictions})

    # Tab-separated output without the index column.
    submission.to_csv(filename, sep='\t', index=False)
    print(f"üíæ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {filename}")

In [20]:
import joblib
import os

# Directory (relative to the working directory) where fitted models are
# stored; created on first run, left untouched if it already exists.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

def save_model(model, filename):
    """Serialize ``model`` with joblib to ``MODEL_DIR/filename`` and print the path."""
    path = os.path.join(MODEL_DIR, filename)
    joblib.dump(model, path)
    print(f"üíæ –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: {path}")

def load_model(filename):
    """Load a joblib-serialized model from ``MODEL_DIR/filename``.

    Returns the loaded object, or ``None`` (after printing a warning) when
    the file does not exist.

    NOTE: joblib files are pickle-based; only load files you created yourself.
    """
    path = os.path.join(MODEL_DIR, filename)

    # Guard clause: report the missing file and bail out early.
    if not os.path.exists(path):
        print(f"‚ö†Ô∏è  –ú–æ–¥–µ–ª—å –Ω–µ –Ω–∞–π–¥–µ–Ω–∞: {path}")
        return None

    model = joblib.load(path)
    print(f"üìÇ –ú–æ–¥–µ–ª—å –∑–∞–≥—Ä—É–∂–µ–Ω–∞: {path}")
    return model

In [21]:
# One binary XGBoost model per label (one-vs-rest). A model already saved in
# MODEL_DIR is loaded instead of being retrained, so the cell can be resumed.
print("üå≤ –û–±—É—á–µ–Ω–∏–µ XGBoost –ø–æ –∫–ª–∞—Å—Å–∞–º...")

xgb_val_probas = []
xgb_test_probas = []

for i, label in enumerate(tqdm(classes, desc="XGBoost")):
    model_path = os.path.join(MODEL_DIR, f"xgb_{i}_{label}.joblib")

    if os.path.exists(model_path):
        print(f"üìÇ –ó–∞–≥—Ä—É–∂–∞–µ–º XGBoost: {label}")
        model = joblib.load(model_path)
    else:
        print(f"üÜï –û–±—É—á–∞–µ–º XGBoost: {label}")

        # Class-balancing weight: negatives / positives for this label
        # (the denominator is clamped to 1 to avoid division by zero).
        y_train_i = Y_train[:, i]
        n_negative = int(np.count_nonzero(y_train_i == 0))
        n_positive = int(np.count_nonzero(y_train_i == 1))
        scale_pos_weight = n_negative / max(n_positive, 1)

        model = XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            max_depth=8,
            learning_rate=0.05,
            n_estimators=1200,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,
            random_state=RANDOM_STATE,
            n_jobs=1,
            tree_method='gpu_hist',
            predictor='gpu_predictor',
            use_label_encoder=False,
            verbosity=0
        )

        # Training with early stopping on the validation fold.
        # NOTE(review): the same fold is later used to report HitRate@3, so
        # that score is optimistic; a separate hold-out would be cleaner.
        model.fit(
            X_train, Y_train[:, i],
            eval_set=[(X_val, Y_val[:, i])],
            early_stopping_rounds=50,
            verbose=200
        )

        # Persist the fitted model so a re-run can skip training.
        joblib.dump(model, model_path)

    # Probability of the positive class for validation and test rows.
    class_val_proba = model.predict_proba(X_val)[:, 1]
    class_test_proba = model.predict_proba(X_test)[:, 1]

    xgb_val_probas.append(class_val_proba)
    xgb_test_probas.append(class_test_proba)

# Stack the per-class vectors into (n_samples, n_classes) matrices whose
# column order follows `classes`.
xgb_val_proba_matrix = np.column_stack(xgb_val_probas)
xgb_test_proba_matrix = np.column_stack(xgb_test_probas)

print("‚úÖ XGBoost: –æ–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ")

üå≤ –û–±—É—á–µ–Ω–∏–µ XGBoost –ø–æ –∫–ª–∞—Å—Å–∞–º...


XGBoost:   0%|          | 0/41 [00:00<?, ?it/s]

üÜï –û–±—É—á–∞–µ–º XGBoost: action
[0]	validation_0-logloss:0.65942
[200]	validation_0-logloss:0.12446
[400]	validation_0-logloss:0.10326
[454]	validation_0-logloss:0.10423
üÜï –û–±—É—á–∞–µ–º XGBoost: adsAndServices
[0]	validation_0-logloss:0.65107
[200]	validation_0-logloss:0.03683
[296]	validation_0-logloss:0.03606
üÜï –û–±—É—á–∞–µ–º XGBoost: adventure
[0]	validation_0-logloss:0.66033
[200]	validation_0-logloss:0.09881
[361]	validation_0-logloss:0.09133
üÜï –û–±—É—á–∞–µ–º XGBoost: arcade
[0]	validation_0-logloss:0.66398
[200]	validation_0-logloss:0.19068
[400]	validation_0-logloss:0.16797
[452]	validation_0-logloss:0.16844
üÜï –û–±—É—á–∞–µ–º XGBoost: board
[0]	validation_0-logloss:0.64863
[200]	validation_0-logloss:0.01271
[264]	validation_0-logloss:0.01313
üÜï –û–±—É—á–∞–µ–º XGBoost: boardAndCard
[0]	validation_0-logloss:0.65021
[200]	validation_0-logloss:0.03115
[307]	validation_0-logloss:0.03090
üÜï –û–±—É—á–∞–µ–º XGBoost: books
[0]	validation_0-logloss:0.65123
[200]	valida

In [23]:
# Persist the XGBoost validation and test probability matrices in the working
# directory so they can be reused without re-running the models.
joblib.dump(xgb_val_proba_matrix, "xgb_val_proba.joblib")
joblib.dump(xgb_test_proba_matrix, "xgb_test_proba.joblib")

['xgb_test_proba.joblib']

In [24]:
# HitRate@3 of the XGBoost models on the validation fold.
# NOTE(review): this fold was also the early-stopping eval set during training.
hr3_xgb = hitrate_at_k(Y_val, xgb_val_proba_matrix, k=3)
print(f"XGBoost HitRate@3: {hr3_xgb:.5f}")

XGBoost HitRate@3: 0.91177


In [25]:
# Write the top-3 XGBoost predictions for the test set to a TSV submission file.
save_submission(xgb_test_proba_matrix, mlb, test_df['app_name'], "xgb_sub1.tsv")

üíæ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: xgb_sub1.tsv


In [22]:
import gc
from tqdm import tqdm

# One binary CatBoost model per label (one-vs-rest). Per-class predictions are
# written to disk instead of being accumulated in lists, and both models and
# predictions are reused when their files already exist.
print("üê± –û–±—É—á–µ–Ω–∏–µ CatBoost –ø–æ –∫–ª–∞—Å—Å–∞–º...")

PROBA_DIR = "proba_matrices"
os.makedirs(PROBA_DIR, exist_ok=True)

# Hyper-parameters shared by every per-class model.
catboost_params = dict(
    loss_function='Logloss',
    eval_metric='Logloss',
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    auto_class_weights='Balanced',
    early_stopping_rounds=50,
    random_seed=RANDOM_STATE,
    task_type='GPU',
    devices='0:0',
    thread_count=1,
    verbose=200,
)

for i, label in enumerate(tqdm(classes, desc="CatBoost")):
    model_path = os.path.join(MODEL_DIR, f"cat_{i}_{label}.cbm")
    val_proba_path = os.path.join(PROBA_DIR, f"cat_val_proba_{i}.joblib")
    test_proba_path = os.path.join(PROBA_DIR, f"cat_test_proba_{i}.joblib")

    # Skip the class when both prediction files are already on disk. Nothing
    # is read back here; the message only reports that the files exist.
    predictions_cached = os.path.exists(val_proba_path) and os.path.exists(test_proba_path)
    if predictions_cached:
        print(f"üìÇ –ó–∞–≥—Ä—É–∂–∞–µ–º –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è CatBoost: {label}")
        continue

    if not os.path.exists(model_path):
        print(f"üÜï –û–±—É—á–∞–µ–º CatBoost: {label}")

        model = CatBoostClassifier(**catboost_params)
        # The validation fold drives early stopping and best-iteration selection.
        model.fit(
            X_train, Y_train[:, i],
            eval_set=(X_val, Y_val[:, i]),
            use_best_model=True
        )
        model.save_model(model_path)
    else:
        print(f"üìÇ –ó–∞–≥—Ä—É–∂–∞–µ–º CatBoost: {label}")
        model = CatBoostClassifier()
        model.load_model(model_path)

    # Probability of the positive class for validation and test rows.
    cat_val_proba = model.predict_proba(X_val)[:, 1]
    cat_test_proba = model.predict_proba(X_test)[:, 1]

    # Store on disk rather than keeping every vector in memory.
    joblib.dump(cat_val_proba, val_proba_path)
    joblib.dump(cat_test_proba, test_proba_path)

    # Explicitly drop the model and run the garbage collector.
    del model
    gc.collect()

print("‚úÖ CatBoost: –æ–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ ‚Äî –≤—Å–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –Ω–∞ –¥–∏—Å–∫.")

üê± –û–±—É—á–µ–Ω–∏–µ CatBoost –ø–æ –∫–ª–∞—Å—Å–∞–º...


CatBoost:   0%|          | 0/41 [00:00<?, ?it/s]

üÜï –û–±—É—á–∞–µ–º CatBoost: action
0:	learn: 0.6397421	test: 0.6416701	best: 0.6416701 (0)	total: 12.4s	remaining: 3h 25m 48s
bestTest = 0.2971531855
bestIteration = 140
Shrink model to first 141 iterations.


CatBoost:   2%|‚ñè         | 1/41 [00:48<32:31, 48.79s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: adsAndServices
0:	learn: 0.6402188	test: 0.6454957	best: 0.6454957 (0)	total: 142ms	remaining: 2m 21s
bestTest = 0.2224996032
bestIteration = 148
Shrink model to first 149 iterations.


CatBoost:   5%|‚ñç         | 2/41 [01:22<25:57, 39.93s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: adventure
0:	learn: 0.6514699	test: 0.6541663	best: 0.6541663 (0)	total: 144ms	remaining: 2m 24s
bestTest = 0.3438640924
bestIteration = 136
Shrink model to first 137 iterations.


CatBoost:   7%|‚ñã         | 3/41 [01:58<24:05, 38.03s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: arcade
0:	learn: 0.6533031	test: 0.6550309	best: 0.6550309 (0)	total: 148ms	remaining: 2m 28s
200:	learn: 0.2769670	test: 0.3511435	best: 0.3504260 (178)	total: 26.6s	remaining: 1m 45s
bestTest = 0.3504259584
bestIteration = 178
Shrink model to first 179 iterations.


CatBoost:  10%|‚ñâ         | 4/41 [02:39<24:18, 39.41s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: board
0:	learn: 0.6220008	test: 0.6281195	best: 0.6281195 (0)	total: 153ms	remaining: 2m 33s
bestTest = 0.258088255
bestIteration = 48
Shrink model to first 49 iterations.


CatBoost:  12%|‚ñà‚ñè        | 5/41 [03:03<20:20, 33.91s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: boardAndCard
0:	learn: 0.6300215	test: 0.6275806	best: 0.6275806 (0)	total: 145ms	remaining: 2m 25s
200:	learn: 0.1185485	test: 0.1756513	best: 0.1744894 (198)	total: 26.1s	remaining: 1m 43s
bestTest = 0.1733101161
bestIteration = 208
Shrink model to first 209 iterations.


CatBoost:  15%|‚ñà‚ñç        | 6/41 [03:48<21:56, 37.62s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: books
0:	learn: 0.6354252	test: 0.6394376	best: 0.6394376 (0)	total: 151ms	remaining: 2m 30s
200:	learn: 0.1170211	test: 0.1807451	best: 0.1799060 (199)	total: 24.9s	remaining: 1m 38s
bestTest = 0.1760384924
bestIteration = 238
Shrink model to first 239 iterations.


CatBoost:  17%|‚ñà‚ñã        | 7/41 [04:36<23:07, 40.80s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: business
0:	learn: 0.6338862	test: 0.6350673	best: 0.6350673 (0)	total: 146ms	remaining: 2m 25s
200:	learn: 0.1412233	test: 0.2174058	best: 0.2170573 (174)	total: 26.2s	remaining: 1m 43s
bestTest = 0.2165361419
bestIteration = 221
Shrink model to first 222 iterations.


CatBoost:  20%|‚ñà‚ñâ        | 8/41 [05:22<23:25, 42.60s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: card
0:	learn: 0.5915416	test: 0.6224918	best: 0.6224918 (0)	total: 133ms	remaining: 2m 12s
bestTest = 0.3164374136
bestIteration = 20
Shrink model to first 21 iterations.


CatBoost:  22%|‚ñà‚ñà‚ñè       | 9/41 [05:43<19:01, 35.68s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: casual
0:	learn: 0.6604208	test: 0.6615812	best: 0.6615812 (0)	total: 145ms	remaining: 2m 24s
200:	learn: 0.3204685	test: 0.3636771	best: 0.3636771 (200)	total: 26.8s	remaining: 1m 46s
400:	learn: 0.2837340	test: 0.3566200	best: 0.3565665 (395)	total: 52.5s	remaining: 1m 18s
bestTest = 0.355969768
bestIteration = 444
Shrink model to first 445 iterations.


CatBoost:  24%|‚ñà‚ñà‚ñç       | 10/41 [06:58<24:50, 48.08s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: children
0:	learn: 0.6508638	test: 0.6498863	best: 0.6498863 (0)	total: 145ms	remaining: 2m 25s
200:	learn: 0.1814340	test: 0.2562411	best: 0.2556249 (184)	total: 25.9s	remaining: 1m 43s
bestTest = 0.2544059319
bestIteration = 253
Shrink model to first 254 iterations.


CatBoost:  27%|‚ñà‚ñà‚ñã       | 11/41 [07:49<24:22, 48.74s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: education
0:	learn: 0.6378558	test: 0.6403704	best: 0.6403704 (0)	total: 149ms	remaining: 2m 28s
200:	learn: 0.1286402	test: 0.2035905	best: 0.2031474 (167)	total: 26s	remaining: 1m 43s
bestTest = 0.2027797891
bestIteration = 205
Shrink model to first 206 iterations.


CatBoost:  29%|‚ñà‚ñà‚ñâ       | 12/41 [08:33<22:53, 47.37s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: entertainment
0:	learn: 0.6622069	test: 0.6616365	best: 0.6616365 (0)	total: 148ms	remaining: 2m 28s
200:	learn: 0.2400158	test: 0.2872507	best: 0.2871051 (195)	total: 25.7s	remaining: 1m 42s
bestTest = 0.2825527236
bestIteration = 321
Shrink model to first 322 iterations.


CatBoost:  32%|‚ñà‚ñà‚ñà‚ñè      | 13/41 [09:31<23:39, 50.70s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: family
0:	learn: 0.6303607	test: 0.6395938	best: 0.6395938 (0)	total: 150ms	remaining: 2m 29s
bestTest = 0.3317517988
bestIteration = 129
Shrink model to first 130 iterations.


CatBoost:  34%|‚ñà‚ñà‚ñà‚ñç      | 14/41 [10:05<20:34, 45.72s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: finance
0:	learn: 0.6010792	test: 0.6069267	best: 0.6069267 (0)	total: 137ms	remaining: 2m 16s
bestTest = 0.182885252
bestIteration = 37
Shrink model to first 38 iterations.


CatBoost:  37%|‚ñà‚ñà‚ñà‚ñã      | 15/41 [10:28<16:46, 38.71s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: foodAndDrink
0:	learn: 0.5857353	test: 0.5862733	best: 0.5862733 (0)	total: 136ms	remaining: 2m 15s
200:	learn: 0.0237028	test: 0.0507473	best: 0.0502760 (178)	total: 24.9s	remaining: 1m 38s
bestTest = 0.0502760166
bestIteration = 178
Shrink model to first 179 iterations.


CatBoost:  39%|‚ñà‚ñà‚ñà‚ñâ      | 16/41 [11:07<16:13, 38.95s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: health
0:	learn: 0.6130346	test: 0.6183473	best: 0.6183473 (0)	total: 140ms	remaining: 2m 19s
bestTest = 0.1933538785
bestIteration = 59
Shrink model to first 60 iterations.


CatBoost:  41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 17/41 [11:32<13:54, 34.77s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: indie
0:	learn: 0.6572588	test: 0.6642944	best: 0.6642944 (0)	total: 138ms	remaining: 2m 17s
bestTest = 0.4529572291
bestIteration = 42
Shrink model to first 43 iterations.


CatBoost:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 18/41 [11:56<12:01, 31.39s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: lifestyle
0:	learn: 0.6599418	test: 0.6614512	best: 0.6614512 (0)	total: 142ms	remaining: 2m 21s
200:	learn: 0.2445981	test: 0.3147655	best: 0.3143408 (187)	total: 22.5s	remaining: 1m 29s
bestTest = 0.3108196693
bestIteration = 336
Shrink model to first 337 iterations.


CatBoost:  46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 19/41 [12:52<14:11, 38.71s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: music
0:	learn: 0.6003023	test: 0.6113548	best: 0.6113548 (0)	total: 151ms	remaining: 2m 30s
bestTest = 0.294682969
bestIteration = 22
Shrink model to first 23 iterations.


CatBoost:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 20/41 [13:13<11:40, 33.33s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: news
0:	learn: 0.6307982	test: 0.6326421	best: 0.6326421 (0)	total: 136ms	remaining: 2m 15s
200:	learn: 0.0981561	test: 0.2275949	best: 0.2227548 (152)	total: 20.3s	remaining: 1m 20s
bestTest = 0.2227547547
bestIteration = 152
Shrink model to first 153 iterations.


CatBoost:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 21/41 [13:44<10:56, 32.83s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: paid
0:	learn: 0.6310351	test: 0.6337517	best: 0.6337517 (0)	total: 151ms	remaining: 2m 31s
bestTest = 0.392279897
bestIteration = 58
Shrink model to first 59 iterations.


CatBoost:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 22/41 [14:10<09:44, 30.74s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: parenting
0:	learn: 0.6197485	test: 0.6183286	best: 0.6183286 (0)	total: 135ms	remaining: 2m 15s
200:	learn: 0.0769208	test: 0.1491351	best: 0.1432542 (177)	total: 24.7s	remaining: 1m 38s
bestTest = 0.1432542303
bestIteration = 177
Shrink model to first 178 iterations.


CatBoost:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 23/41 [14:50<10:00, 33.35s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: pets
0:	learn: 0.5965417	test: 0.6128890	best: 0.6128890 (0)	total: 156ms	remaining: 2m 35s
bestTest = 0.3130888782
bestIteration = 70
Shrink model to first 71 iterations.


CatBoost:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 24/41 [15:16<08:51, 31.29s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: purchases
0:	learn: 0.6242846	test: 0.6265705	best: 0.6265705 (0)	total: 139ms	remaining: 2m 19s
bestTest = 0.1719533141
bestIteration = 114
Shrink model to first 115 iterations.


CatBoost:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 25/41 [15:48<08:24, 31.53s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: puzzle
0:	learn: 0.6456606	test: 0.6465877	best: 0.6465877 (0)	total: 144ms	remaining: 2m 23s
200:	learn: 0.2162946	test: 0.2621215	best: 0.2621215 (200)	total: 26.1s	remaining: 1m 43s
400:	learn: 0.1836232	test: 0.2586463	best: 0.2573104 (355)	total: 51.4s	remaining: 1m 16s
bestTest = 0.257310433
bestIteration = 355
Shrink model to first 356 iterations.


CatBoost:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 26/41 [16:51<10:16, 41.09s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: quiz
0:	learn: 0.6461135	test: 0.6472501	best: 0.6472501 (0)	total: 138ms	remaining: 2m 17s
200:	learn: 0.1248761	test: 0.1815016	best: 0.1815016 (200)	total: 24.1s	remaining: 1m 35s
400:	learn: 0.0754990	test: 0.1637496	best: 0.1635971 (399)	total: 49.1s	remaining: 1m 13s
bestTest = 0.1610228236
bestIteration = 428
Shrink model to first 429 iterations.


CatBoost:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 27/41 [18:02<11:38, 49.91s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: race
0:	learn: 0.6080343	test: 0.6086444	best: 0.6086444 (0)	total: 147ms	remaining: 2m 27s
200:	learn: 0.0623592	test: 0.1254072	best: 0.1240003 (170)	total: 26.1s	remaining: 1m 43s
bestTest = 0.1240003481
bestIteration = 170
Shrink model to first 171 iterations.


CatBoost:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 28/41 [18:42<10:09, 46.90s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: rolePlaying
0:	learn: 0.6419358	test: 0.6479155	best: 0.6479155 (0)	total: 148ms	remaining: 2m 27s
bestTest = 0.3522332102
bestIteration = 50
Shrink model to first 51 iterations.


CatBoost:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 29/41 [19:06<08:02, 40.19s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: shooter
0:	learn: 0.6402325	test: 0.6458296	best: 0.6458296 (0)	total: 125ms	remaining: 2m 5s
bestTest = 0.2507963243
bestIteration = 65
Shrink model to first 66 iterations.


CatBoost:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 30/41 [19:33<06:36, 36.01s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: simulator
0:	learn: 0.6566524	test: 0.6551432	best: 0.6551432 (0)	total: 139ms	remaining: 2m 18s
200:	learn: 0.2335321	test: 0.2945422	best: 0.2940846 (194)	total: 26.3s	remaining: 1m 44s
bestTest = 0.2898908901
bestIteration = 257
Shrink model to first 258 iterations.


CatBoost:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 31/41 [20:24<06:45, 40.58s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: social
0:	learn: 0.6435605	test: 0.6480706	best: 0.6480706 (0)	total: 145ms	remaining: 2m 25s
bestTest = 0.3127842696
bestIteration = 63
Shrink model to first 64 iterations.


CatBoost:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 32/41 [20:49<05:24, 36.07s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: sport
0:	learn: 0.5974726	test: 0.5988369	best: 0.5988369 (0)	total: 136ms	remaining: 2m 15s
200:	learn: 0.0334588	test: 0.0770700	best: 0.0766492 (182)	total: 24.7s	remaining: 1m 38s
bestTest = 0.0763095056
bestIteration = 213
Shrink model to first 214 iterations.


CatBoost:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 33/41 [21:33<05:06, 38.37s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: sports
0:	learn: 0.6064019	test: 0.6108291	best: 0.6108291 (0)	total: 150ms	remaining: 2m 29s
bestTest = 0.1955958668
bestIteration = 140
Shrink model to first 141 iterations.


CatBoost:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 34/41 [22:09<04:22, 37.55s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: state
0:	learn: 0.6057188	test: 0.6190695	best: 0.6190695 (0)	total: 136ms	remaining: 2m 15s
bestTest = 0.3309573166
bestIteration = 16
Shrink model to first 17 iterations.


CatBoost:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 35/41 [22:29<03:13, 32.28s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: strategy
0:	learn: 0.6434312	test: 0.6486248	best: 0.6486248 (0)	total: 146ms	remaining: 2m 25s
bestTest = 0.3880445439
bestIteration = 31
Shrink model to first 32 iterations.


CatBoost:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 36/41 [22:51<02:25, 29.19s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: tools
0:	learn: 0.6559941	test: 0.6560092	best: 0.6560092 (0)	total: 143ms	remaining: 2m 23s
200:	learn: 0.2368017	test: 0.2790675	best: 0.2790675 (200)	total: 26.2s	remaining: 1m 44s
400:	learn: 0.2052834	test: 0.2733158	best: 0.2732091 (397)	total: 50.9s	remaining: 1m 16s
bestTest = 0.2718235915
bestIteration = 476
Shrink model to first 477 iterations.


CatBoost:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 37/41 [24:09<02:55, 43.76s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: transport
0:	learn: 0.6208206	test: 0.6210263	best: 0.6210263 (0)	total: 137ms	remaining: 2m 17s
200:	learn: 0.0786000	test: 0.1219325	best: 0.1213406 (184)	total: 25.3s	remaining: 1m 40s
bestTest = 0.1213405759
bestIteration = 184
Shrink model to first 185 iterations.


CatBoost:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 38/41 [24:49<02:08, 42.87s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: travelling
0:	learn: 0.6007547	test: 0.5978317	best: 0.5978317 (0)	total: 153ms	remaining: 2m 33s
bestTest = 0.1090437421
bestIteration = 88
Shrink model to first 89 iterations.


CatBoost:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 39/41 [25:18<01:17, 38.66s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: utilities
0:	learn: 0.6177577	test: 0.6200920	best: 0.6200920 (0)	total: 141ms	remaining: 2m 21s
200:	learn: 0.0695721	test: 0.1509759	best: 0.1485532 (169)	total: 25.6s	remaining: 1m 41s
bestTest = 0.148553199
bestIteration = 169
Shrink model to first 170 iterations.


CatBoost:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 40/41 [25:57<00:38, 38.86s/it]

üÜï –û–±—É—á–∞–µ–º CatBoost: word
0:	learn: 0.6172700	test: 0.6213791	best: 0.6213791 (0)	total: 135ms	remaining: 2m 15s
bestTest = 0.2137636838
bestIteration = 61
Shrink model to first 62 iterations.


CatBoost: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 41/41 [26:23<00:00, 38.63s/it]

‚úÖ CatBoost: –æ–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ ‚Äî –≤—Å–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –Ω–∞ –¥–∏—Å–∫.





In [23]:
# Rebuild the CatBoost probability matrices from the per-class prediction files
# stored in PROBA_DIR (one validation file and one test file per class index).
print("üì• –°–æ–±–∏—Ä–∞–µ–º –º–∞—Ç—Ä–∏—Ü—ã –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π –∏–∑ —Ñ–∞–π–ª–æ–≤...")

cat_val_probas, cat_test_probas = [], []

for i, label in enumerate(classes):
    val_proba_path = os.path.join(PROBA_DIR, f"cat_val_proba_{i}.joblib")
    test_proba_path = os.path.join(PROBA_DIR, f"cat_test_proba_{i}.joblib")

    # Fail fast: both files must be present before anything is loaded for this class.
    if not os.path.exists(val_proba_path) or not os.path.exists(test_proba_path):
        raise FileNotFoundError(f"–ù–µ –Ω–∞–π–¥–µ–Ω—ã –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –¥–ª—è –∫–ª–∞—Å—Å–∞ {label}")

    cat_val_probas.append(joblib.load(val_proba_path))
    cat_test_probas.append(joblib.load(test_proba_path))

# Stack the per-class arrays side by side, following the order of `classes`.
cat_val_proba_matrix = np.column_stack(cat_val_probas)
cat_test_proba_matrix = np.column_stack(cat_test_probas)

print(f"üìä –†–∞–∑–º–µ—Ä –º–∞—Ç—Ä–∏—Ü: val={cat_val_proba_matrix.shape}, test={cat_test_proba_matrix.shape}")

üì• –°–æ–±–∏—Ä–∞–µ–º –º–∞—Ç—Ä–∏—Ü—ã –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π –∏–∑ —Ñ–∞–π–ª–æ–≤...
üìä –†–∞–∑–º–µ—Ä –º–∞—Ç—Ä–∏—Ü: val=(4307, 41), test=(15046, 41)


In [25]:
# Score the stacked CatBoost validation probabilities with hitrate_at_k at k=3
# (helper defined elsewhere in the notebook) and print the result to 5 decimals.
# NOTE(review): the CatBoost training log reports a `test` metric with
# best-iteration shrinking; if that eval set is this same validation split,
# the score is optimistic — confirm against the training cell.
hr3_cb = hitrate_at_k(Y_val, cat_val_proba_matrix, k=3)
print(f"Catboost HitRate@3: {hr3_cb:.5f}")

Catboost HitRate@3: 0.88507


In [26]:
# Write the CatBoost test-set probabilities to "cb_sub1.tsv" through save_submission
# (helper defined elsewhere in the notebook).
# Presumably `mlb` maps matrix columns back to label names and `app_name` keys the rows — TODO confirm.
save_submission(cat_test_proba_matrix, mlb, test_df['app_name'], "cb_sub1.tsv")

üíæ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: cb_sub1.tsv


In [28]:
# Persist the assembled CatBoost probability matrices for later reuse.
# NOTE(review): these are written to the current working directory, whereas the
# LightGBM matrices are saved under PROBA_DIR — confirm which location downstream
# cells expect before unifying.
joblib.dump(cat_val_proba_matrix, "cb_val_proba.joblib")
joblib.dump(cat_test_proba_matrix, "cb_test_proba.joblib")

['cb_test_proba.joblib']

In [None]:
# Train (or load from the MODEL_DIR cache) one binary LightGBM booster per label
# and collect each booster's validation / test predictions; the per-label arrays
# are stacked column-wise into matrices at the end of the cell.
lgb_val_probas = []
lgb_test_probas = []

# Parameters are identical for every label, so the dict is built once up front.
# NOTE(review): 'device': 'gpu' makes training require a GPU-enabled LightGBM build.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'is_unbalance': True,
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

for i, label in enumerate(tqdm(classes, desc="LightGBM")):
    model_path = os.path.join(MODEL_DIR, f"lgb_{i}_{label}.joblib")

    if os.path.exists(model_path):
        print(f"üìÇ –ó–∞–≥—Ä—É–∂–∞–µ–º LightGBM: {label}")
        # NOTE(review): joblib.load unpickles the file — only load caches written by this notebook.
        model = joblib.load(model_path)
    else:
        print(f"üÜï –û–±—É—á–∞–µ–º LightGBM: {label}")

        # Create the datasets; the validation Dataset is tied to the training one via `reference`.
        train_data = lgb.Dataset(X_train, label=Y_train[:, i])
        val_data = lgb.Dataset(X_val, label=Y_val[:, i], reference=train_data)

        # Callbacks are created inside the loop so every booster gets a fresh
        # early-stopping callback instead of sharing one across labels.
        callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=200)
        ]

        # Training: up to 1200 rounds, early-stopped on the validation set.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1200,
            valid_sets=[val_data],
            callbacks=callbacks
        )

        # Save atomically: dump to a temporary file and rename it into place, so an
        # interrupted run cannot leave a truncated file that the os.path.exists
        # check above would later accept as a valid cached model.
        tmp_model_path = model_path + ".tmp"
        joblib.dump(model, tmp_model_path)
        os.replace(tmp_model_path, model_path)

    # Prediction. Per the LightGBM docs, Booster.predict uses the best iteration
    # by default when early stopping recorded one.
    lgb_val_proba = model.predict(X_val)
    lgb_test_proba = model.predict(X_test)

    lgb_val_probas.append(lgb_val_proba)
    lgb_test_probas.append(lgb_test_proba)

# Convert the per-label prediction lists into matrices (labels in `classes` order).
lgb_val_proba_matrix = np.column_stack(lgb_val_probas)
lgb_test_proba_matrix = np.column_stack(lgb_test_probas)

LightGBM:   0%|          | 0/41 [00:00<?, ?it/s]

üÜï –û–±—É—á–∞–µ–º LightGBM: action
üÜï –û–±—É—á–∞–µ–º LightGBM: adsAndServices
[200]	valid_0's binary_logloss: 0.0410955
üÜï –û–±—É—á–∞–µ–º LightGBM: adventure
üÜï –û–±—É—á–∞–µ–º LightGBM: arcade
üÜï –û–±—É—á–∞–µ–º LightGBM: board
üÜï –û–±—É—á–∞–µ–º LightGBM: boardAndCard
[200]	valid_0's binary_logloss: 0.0336743
üÜï –û–±—É—á–∞–µ–º LightGBM: books
[200]	valid_0's binary_logloss: 0.038053
üÜï –û–±—É—á–∞–µ–º LightGBM: business
[200]	valid_0's binary_logloss: 0.107025
[400]	valid_0's binary_logloss: 0.0969849
üÜï –û–±—É—á–∞–µ–º LightGBM: card
üÜï –û–±—É—á–∞–µ–º LightGBM: casual
üÜï –û–±—É—á–∞–µ–º LightGBM: children
[200]	valid_0's binary_logloss: 0.0841547
üÜï –û–±—É—á–∞–µ–º LightGBM: education
[200]	valid_0's binary_logloss: 0.0766164
[400]	valid_0's binary_logloss: 0.0725932
üÜï –û–±—É—á–∞–µ–º LightGBM: entertainment
üÜï –û–±—É—á–∞–µ–º LightGBM: family
[200]	valid_0's binary_logloss: 0.0296803
üÜï –û–±—É—á–∞–µ–º LightGBM: finance
üÜï –û–±—É—á–∞–µ–º LightGBM: foodAndDrin

In [132]:
# Disabled experiment, kept for reference: one LGBMClassifier configuration wrapped
# in sklearn's MultiOutputClassifier, fitted on all labels at once.
# NOTE(review): the output stored under this cell (a ValueError about early stopping)
# does not correspond to this code, which configures no early stopping — it looks
# stale; confirm and clear it.
# base_lgb = LGBMClassifier(
#     objective='binary',
#     boosting_type='gbdt',
#     num_leaves=63,
#     learning_rate=0.03,
#     n_estimators=1200,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     reg_alpha=0.1,
#     reg_lambda=0.1,
#     is_unbalance=True,
#     random_state=RANDOM_STATE,
#     n_jobs=1,
#     device='gpu',                    # <- enable GPU
#     gpu_platform_id=0,               # <- platform (usually 0)
#     gpu_device_id=0,                 # <- device (usually 0)
#     verbose=200
# )

# lgb_multi = MultiOutputClassifier(base_lgb, n_jobs=-1)
# lgb_multi.fit(X_train, Y_train)
# save_model(lgb_multi, "lgb_multi.joblib")

ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [76]:
# Score the stacked LightGBM validation probabilities with hitrate_at_k at k=3
# (helper defined elsewhere in the notebook) and print the result to 5 decimals.
# NOTE(review): X_val / Y_val were also the early-stopping set in the LightGBM
# training cell (valid_sets=[val_data]), so this score is likely optimistic.
hr3_lgb = hitrate_at_k(Y_val, lgb_val_proba_matrix, k=3)
print(f"LightGBM HitRate@3: {hr3_lgb:.5f}")

LightGBM HitRate@3: 0.90945


In [93]:
# Write the LightGBM test-set probabilities to "lgb_sub.tsv" through save_submission
# (helper defined elsewhere in the notebook).
# Presumably `mlb` maps matrix columns back to label names and `app_name` keys the rows — TODO confirm.
save_submission(lgb_test_proba_matrix, mlb, test_df['app_name'], "lgb_sub.tsv")

üíæ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: lgb_sub.tsv


In [None]:
# Persist the assembled LightGBM probability matrices under PROBA_DIR for later reuse.
# NOTE(review): the CatBoost matrices are dumped to the working directory instead —
# confirm which location downstream cells read from.
joblib.dump(lgb_val_proba_matrix, os.path.join(PROBA_DIR, "lgb_val_proba.joblib"))
joblib.dump(lgb_test_proba_matrix, os.path.join(PROBA_DIR, "lgb_test_proba.joblib"))