In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold

In [5]:
# Directory (relative to the notebook) that holds the input CSV files.
path = "data/"

# Raw tables exactly as read from disk: viewing events, video metadata and
# the training targets, in that order.
data_start, video_start, targets_start = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_events.csv', 'video_info_v2.csv', 'train_targets.csv')
)

In [15]:
class Imputer:
    """
    Brings the raw tables into a normalised form.

    The class is stateless: ``fit`` learns nothing and ``transform`` only
    derives / recodes columns on copies of its inputs.
    """

    def fit(self):
        """No-op kept so the class offers the usual fit / transform pair."""
        return None

    def transform(self, data, video, targets):
        """
        Return copies of the three frames with normalised columns.

        * ``video`` gains ``duration_sec``: ``duration`` floor-divided by 1000.
        * ``targets['sex']`` is recoded to 0 for ``'male'`` and 1 for any
          other value.
        * ``data`` is copied unchanged.

        The input frames themselves are not modified.
        """
        def encode_sex(value):
            # Everything that is not exactly 'male' maps to 1.
            if value == 'male':
                return 0
            return 1

        events = data.copy()
        videos = video.copy()
        labels = targets.copy()

        videos['duration_sec'] = videos['duration'].floordiv(1000)
        labels['sex'] = labels['sex'].apply(encode_sex)

        return events, videos, labels

    def fit_transform(self, data, video, targets):
        """Run ``fit`` and then ``transform`` on the given frames."""
        self.fit()
        return self.transform(data, video, targets)

In [10]:
class FeatureExtractor:
    """
    Adds per-viewer category features to the targets table.
    """

    def fit(self):
        """No-op; nothing is learned from the data."""
        return None

    def transform(self, data, video, targets):
        """
        Attach video categories to events and aggregate them per viewer.

        Returns a tuple ``(data, video, targets)`` where

        * ``data`` is the events frame inner-joined with the ``category``
          column of ``video`` on ``rutube_video_id``;
        * ``video`` is an unchanged copy of the input;
        * ``targets`` is the input targets inner-joined on ``viewer_uid``
          with two new columns: ``favourite_cat`` (the viewer's most frequent
          category) and ``percent_fav_cat`` (that category's count divided by
          the viewer's number of event rows).

        The input frames themselves are not modified.
        """
        def most_watched(categories):
            # Category with the highest count for this viewer.
            return categories.value_counts().idxmax()

        def most_watched_share(categories):
            # Count of the top category relative to all rows of this viewer.
            return categories.value_counts().max() / len(categories)

        events = data.copy()
        videos = video.copy()
        labels = targets.copy()

        events = pd.merge(
            events,
            videos[['rutube_video_id', 'category']],
            on='rutube_video_id',
            how='inner',
        )

        per_viewer = events.groupby('viewer_uid').agg(
            favourite_cat=('category', most_watched),
            percent_fav_cat=('category', most_watched_share),
        )

        labels = pd.merge(per_viewer, labels, on='viewer_uid', how='inner')

        return events, videos, labels

    def fit_transform(self, data, video, targets):
        """Run ``fit`` and then ``transform`` on the given frames."""
        self.fit()
        return self.transform(data, video, targets)

In [16]:
# Preprocessing pipeline: normalise the raw tables first, then feed the result
# straight into the feature extractor.
data, video, targets = FeatureExtractor().fit_transform(
    *Imputer().fit_transform(data_start, video_start, targets_start)
)

In [26]:
class CatboostEstimator:
    """
    Wrapper for training CatBoost classifiers.

    Two training modes are available:

    * ``fit`` trains one model per stratified fold and keeps all of them as
      an ensemble in ``self.models``;
    * ``fit_select_features`` trains a single model with CatBoost's recursive
      feature selection and keeps it in ``self.model``.

    ``self.one_model`` records which mode was used last and drives ``predict``.
    """

    def __init__(self):
        # Initialise state up front so that calling predict() before any fit
        # produces a clear error instead of an AttributeError.
        self.one_model = False
        self.models = []
        self.model = None

    def fit(self, X, y, n_splits, cat_features, score):
        """
        Split the data into stratified folds and train one CatBoost model per fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Target aligned row-by-row with ``X``.
        n_splits : int
            Number of stratified folds, and therefore of trained models.
        cat_features : list
            Categorical feature names forwarded to ``CatBoostClassifier``.
        score : callable
            ``score(y_true, y_pred)`` returning a number; evaluated on each
            validation fold.

        Prints the per-fold score, the mean / std over folds and
        ``mean - std`` as the overall score. Returns ``None``.
        """
        self.one_model = False
        self.models = []
        scores = []
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        for ind, (train_index, val_index) in enumerate(skf.split(X, y)):
            # split() yields *positional* indices, so they must be applied with
            # .iloc; .loc would treat them as labels and break (or silently
            # pick wrong rows) whenever the index is not 0..n-1.
            X_train = X.iloc[train_index]
            y_train = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = y.iloc[val_index]

            model = CatBoostClassifier(cat_features=cat_features, verbose=500, iterations=1000)
            model.fit(X_train, y_train, verbose=500, eval_set=(X_val, y_val))

            self.models.append(model)
            y_pred = model.predict(X_val)
            scores.append(score(y_val, y_pred))
            print(f'model {ind}: score = {round(scores[-1], 4)}')

        scores = np.array(scores)
        print(f'mean score = {scores.mean().round(4)}, std = {scores.std().round(4)}')
        print(f'overall score = {(scores.mean() - scores.std()).round(4)}')

    def fit_select_features(self, X, y, cat_features, to_drop):
        """
        Train a single CatBoost model while running feature selection.

        The data is split 80/20 into train / validation parts, and
        ``select_features`` is asked to keep 30 features out of all columns
        of ``X`` and to train the final model on them.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Target aligned with ``X``.
        cat_features : list
            Categorical feature names forwarded to ``CatBoostClassifier``.
        to_drop
            Currently unused; kept so existing callers keep working.
        """
        self.one_model = True

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        self.model = CatBoostClassifier(cat_features=cat_features, verbose=150, iterations=2000)

        self.model.select_features(
            X_train,
            y_train,
            verbose=500,
            eval_set=(X_val, y_val),
            steps=10,
            num_features_to_select=30,
            # Pass a plain list of names rather than a pandas Index.
            features_for_select=list(X.columns),
            algorithm='RecursiveByLossFunctionChange',
            train_final_model=True,
        )

    def predict(self, X, cnt_classes):
        """
        Predict with whatever was trained last.

        * After ``fit_select_features``: returns the single model's
          ``predict_proba(X)`` output (class probabilities).
        * After ``fit``: averages ``predict_proba`` over all fold models and
          returns the arg-max column index per row.
          NOTE(review): the index equals the class label only if labels are
          ``0..cnt_classes-1`` in the models' class order - confirm for new
          targets.

        Parameters
        ----------
        X : pandas.DataFrame
            Rows to predict.
        cnt_classes : int
            Number of classes, i.e. the width of each ``predict_proba`` output.

        Raises
        ------
        RuntimeError
            If no model has been trained yet.
        """
        if self.one_model:
            if self.model is None:
                raise RuntimeError('CatboostEstimator is not fitted: call fit_select_features() first')
            return self.model.predict_proba(X)

        if not self.models:
            raise RuntimeError('CatboostEstimator is not fitted: call fit() first')

        y_pred = np.zeros((X.shape[0], cnt_classes))

        for model in self.models:
            y_pred += model.predict_proba(X)
        # Average over the ensemble members (the number of models, not the
        # number of classes).
        y_pred /= len(self.models)
        y_pred = np.argmax(y_pred, axis=1)

        return y_pred

In [13]:
from sklearn.metrics import f1_score, accuracy_score

In [14]:
def score_sex(y_true, y_pred):
    """Metric for the sex model: plain accuracy of ``y_pred`` against ``y_true``."""
    return accuracy_score(y_true=y_true, y_pred=y_pred)


def score_age(y_true, y_pred):
    """Metric for the age model: F1 score with ``average='weighted'``."""
    return f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

In [27]:
# Columns that are never used as features.
# NOTE(review): 'age' is presumably excluded because 'age_class' is derived
# from it - confirm against the targets file.
features_to_drop = ['viewer_uid', 'age']

# Target column of each model.
target_sex, target_age = 'sex', 'age_class'

# Features CatBoost should treat as categorical.
cat_features = ['favourite_cat']

# One estimator per target.
catboost_sex, catboost_age = CatboostEstimator(), CatboostEstimator()

print('Sex model\n')

# Both target columns are removed from the features in addition to the
# always-dropped columns, so neither model sees the other's target.
catboost_sex.fit(
    targets.drop(columns=[*features_to_drop, target_sex, target_age]),
    targets[target_sex],
    n_splits=2,
    cat_features=cat_features,
    score=score_sex,
)

print('\n\n\nAge model\n')

catboost_age.fit(
    targets.drop(columns=[*features_to_drop, target_sex, target_age]),
    targets[target_age],
    n_splits=2,
    cat_features=cat_features,
    score=score_age,
)

Learning rate set to 0.096297
0:	learn: 0.6747307	test: 0.6745138	best: 0.6745138 (0)	total: 30.2ms	remaining: 30.1s
500:	learn: 0.5841304	test: 0.5881546	best: 0.5879945 (341)	total: 12.6s	remaining: 12.6s
999:	learn: 0.5803064	test: 0.5891040	best: 0.5879945 (341)	total: 25.3s	remaining: 0us

bestTest = 0.5879944819
bestIteration = 341

Shrink model to first 342 iterations.
model 0: score = 0.6939
Learning rate set to 0.096297
0:	learn: 0.6743530	test: 0.6742977	best: 0.6742977 (0)	total: 36ms	remaining: 35.9s
500:	learn: 0.5827186	test: 0.5897745	best: 0.5897183 (401)	total: 12.3s	remaining: 12.3s
999:	learn: 0.5790859	test: 0.5899370	best: 0.5897183 (401)	total: 24.9s	remaining: 0us

bestTest = 0.5897183088
bestIteration = 401

Shrink model to first 402 iterations.
model 1: score = 0.6903
mean score = 0.6921, std = 0.0018
overall score = 0.6903
Learning rate set to 0.118144
0:	learn: 1.3498520	test: 1.3497644	best: 1.3497644 (0)	total: 70.2ms	remaining: 1m 10s
500:	learn: 1.1583568