In [None]:
!pip install implicit

In [None]:
!pip install --upgrade polars

In [None]:
from scipy.sparse import csr_matrix
import polars as pl
import implicit

In [None]:
# Single seed reused by the splits, the batch sampler and the CatBoost models below.
RANDOM_STATE = 42

In [None]:
# Lazily scan the interaction log and derive the target: weight = like - dislike.
# Later cells treat the value 255 as "dislike", i.e. the subtraction presumably
# wraps around on unsigned flag columns - TODO confirm the column dtypes.
train_full = (
    pl.scan_parquet("/kaggle/input/vkrecsys/train_interactions.parquet")
    .with_columns(weight=pl.col("like") - pl.col("dislike"))
)
# Narrow view used for modelling: ids, target and the two raw flags.
train = train_full.select("user_id", "item_id", "weight", "like", "dislike")

In [None]:
%%time
# Materialise the lazy query. `train` is rebound to the collected result, so this
# cell can only be re-run after the scan cell above has been re-run first.
train = train.collect(streaming=True)

In [None]:
# Class balance of the target column.
train["weight"].value_counts()

In [None]:
# Side tables with user and item attributes. The id spaces are sized as max + 1,
# which assumes ids are non-negative integers starting at 0 - TODO confirm.
users_meta = pl.read_parquet("/kaggle/input/vkrecsys/users_meta.parquet")
items_meta = pl.read_parquet("/kaggle/input/vkrecsys/items_meta.parquet")
n_users = users_meta["user_id"].max() + 1
n_items = items_meta["item_id"].max() + 1
(n_users, n_items)

In [None]:
# Preview the item metadata table.
items_meta

In [None]:
# Two views of the item metadata: one without the `duration` column and one
# without the `embeddings` column.
items_meta_embed = items_meta.drop('duration')
items_meta_wo_embed = items_meta.drop('embeddings')

In [None]:
# (user_id, item_id) pairs that the submission has to score.
test = pl.read_csv('/kaggle/input/vkrecsys/test_pairs.csv')
test

In [None]:
from sklearn.metrics import roc_auc_score
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed


def custom_roc_auc_score(
    df: pd.DataFrame,
    n_jobs: int = -1
) -> float:
    '''
    Mean per-user ranking quality over three feedback levels.

    For every user two binary problems are scored with ``roc_auc_score``:
    dislike vs. (ignore + like) and ignore vs. like. Each AUC is weighted by
    its number of (negative, positive) pairs; the weighted mean is the user's
    score, and the function returns the plain mean over all users.

    Required columns of ``df``:
        user_id - grouping key;
        weight  - feedback level: 1 = like, 0 = ignore, dislike = -1 or 255
                  (255 is the value the notebook's ``like - dislike`` target
                  carries for dislikes);
        predict - model score, higher meaning "ranked closer to a like".

    ``n_jobs`` is forwarded to ``joblib.Parallel``.

    A user with no comparable pair contributes 0 to the mean.
    '''
    # Imported locally: the notebook only imports numpy in a later cell.
    import numpy as np

    # Both encodings of "dislike" are accepted (signed and wrapped-unsigned).
    dislike_codes = (-1, 255)

    def user_roc_auc_score(
        dataframe: pd.DataFrame
    ) -> float:
        labels = dataframe['weight'].to_numpy()
        preds = dataframe['predict'].to_numpy()

        like_scores = preds[labels == 1]
        ignore_scores = preds[labels == 0]
        dislike_scores = preds[np.isin(labels, dislike_codes)]

        weighted_auc = 0
        pairs_count = 0
        # (negatives, positives) for the two binary sub-problems.
        sub_problems = (
            (dislike_scores, np.concatenate((ignore_scores, like_scores))),
            (ignore_scores, like_scores),
        )
        for neg_scores, pos_scores in sub_problems:
            if neg_scores.size and pos_scores.size:
                # Negatives first, then positives, matching the score order below.
                binary_target = np.ones(neg_scores.size + pos_scores.size)
                binary_target[: neg_scores.size] = 0
                binary_scores = np.concatenate((neg_scores, pos_scores))

                n_pairs = neg_scores.size * pos_scores.size
                weighted_auc += roc_auc_score(binary_target, binary_scores) * n_pairs
                pairs_count += n_pairs
        if pairs_count:
            return weighted_auc / pairs_count
        return 0

    # Registers tqdm's pandas helpers. Called directly rather than inside an
    # `assert`, which would be skipped entirely under `python -O`.
    tqdm.pandas()

    groups = df.groupby('user_id')

    user_scores = Parallel(n_jobs=n_jobs)(
        delayed(user_roc_auc_score)(group) for _, group in tqdm(groups, total=len(groups))
    )

    return np.mean(user_scores)

In [None]:
from catboost import CatBoostClassifier, Pool, sum_models
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [None]:
# Attach item attributes (without embeddings) and user attributes to every
# interaction, then drop the raw flags, which `weight` already encodes.
train_1 = (
    train
    .join(items_meta_wo_embed, on='item_id')
    .join(users_meta, on='user_id')
    .drop(['like', 'dislike'])
)
train_1

In [None]:
# Keep only rows with a non-zero weight (explicit feedback).
train_1 = train_1.filter(pl.col('weight') != 0)
train_1

In [None]:
# Free the feature/target frames left over from a previous run, if any.
# A plain `del X` raises NameError on a fresh kernel, where they do not exist yet.
for _stale in ("X", "y"):
    globals().pop(_stale, None)
del _stale
#del X_tr, X_test, y_tr, y_test
#del X_train, X_val, y_train, y_val

In [None]:
# Hold out 20% of the interactions, stratified by the target.
# Features here are whatever is left of `train` once the target and raw flags go.
y = train.get_column('weight').to_pandas() #train_1
X = train.drop(['weight', 'like', 'dislike']).to_pandas()

start_time = time.time()

print("Start train_test_split")

X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y,
)

print(f"---Train_test_split: {time.time() - start_time:.2f} seconds.---")

In [None]:
# Train/validation split of the training part, then a MultiClass CatBoost fit on GPU.
FEATURES = ['user_id','item_id', 'source_id','duration','gender', 'age']#, 'timespent', 'share', 'bookmarks']
CATEGORICAL = ['gender']#, 'share', 'bookmarks']
TARGET = ['weight']

# The former `X`/`y` conversion of `train_1` was removed: its result was never
# used, because the split below reads `X_tr` / `y_tr` from the previous cell.
# NOTE(review): the pools declare 'gender' as categorical, so `X_tr` must
# actually contain that column - verify against the frame the split cell builds.

print("Start train_test_split")
start_time = time.time()

X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

print(f"---Train_test_split: {time.time() - start_time:.2f} seconds.---")

train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=CATEGORICAL
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=CATEGORICAL
)

model = CatBoostClassifier(loss_function='MultiClass', random_state=RANDOM_STATE, verbose=100, allow_writing_files=False, task_type="GPU", auto_class_weights='SqrtBalanced')# custom_loss='AUC'
print("Start model.fit")
# Restart the timer so the reported duration covers the fit only.
start_time = time.time()
model.fit(train_pool, eval_set=val_pool)
print(f"---Fitting: {time.time() - start_time:.2f} seconds.---")

In [None]:
from catboost import cv

# 5-fold stratified cross-validation of the MultiClass setup on the enriched frame.
y = train_1['weight'].to_pandas()
X = train_1.drop('weight').to_pandas()

train_pool = Pool(data=X, label=y, cat_features=['gender'])

params = dict(
    loss_function='MultiClass',
    random_state=RANDOM_STATE,
    task_type="GPU",
    auto_class_weights='SqrtBalanced',
)

cv_data = cv(
    pool=train_pool,
    params=params,
    fold_count=5,
    stratified=True,
    shuffle=True,
    partition_random_seed=RANDOM_STATE,
    verbose=500,
)

In [None]:
# Iteration with the lowest mean test loss across folds, and its spread.
best_iter = np.argmin(cv_data['test-MultiClass-mean'])
best_value = np.min(cv_data['test-MultiClass-mean'])
print(
    f"Best validation MultiClass score, stratified: {best_value:.4f}"
    f"+/-{cv_data['test-MultiClass-std'][best_iter]:.3f} on step {best_iter}"
)

In [None]:
# Evaluation frame: hold-out features joined with their labels, without the
# metadata columns. `errors='ignore'` keeps the cell working when the split was
# built from the ids-only frame and those columns are absent. Built as one
# expression (no in-place edits) so the cell can be re-run safely.
test_df = (
    X_test.join(y_test)
    .drop(columns=['source_id', 'duration', 'gender', 'age'], errors='ignore')
    .reset_index(drop=True)
)
test_df

In [None]:
# Enrich the submission pairs with user and item attributes; the id columns are
# cast to UInt32 before the joins, then the result is handed over to pandas.
test_re = (
    test
    .with_columns(pl.col('user_id').cast(pl.UInt32), pl.col('item_id').cast(pl.UInt32))
    .join(users_meta, on='user_id')
    .join(items_meta_wo_embed, on='item_id')
    .to_pandas()
)


In [None]:
def myPredict(model, data, test_df, label=None):
    """
    Predict classes and the "like" probability for `data` and attach both to `test_df`.

    Parameters:
        model   - fitted CatBoost classifier.
        data    - feature frame passed to `Pool`; rows must line up with `test_df`.
        test_df - pandas frame receiving the `predict` (class label) and
                  `predict_proba` columns; must contain `user_id`, plus `weight`
                  when `label` is given.
        label   - optional true labels; when given, the per-user ranking metric
                  is computed and printed.

    Returns a copy of `test_df` with the two new columns, sorted by `user_id`
    ascending and `predict_proba` descending.
    """
    test_pool = Pool(
        data=data,
        label=label,
        cat_features=CATEGORICAL
    )

    print('Start predicting')
    start_time = time.time()
    pred = np.ndarray.flatten(model.predict(test_pool))
    print(f"---Predicting: {time.time() - start_time:.2f} seconds.---")

    print('Start predicting proba')
    # Restart the timer so this duration does not include the class prediction.
    start_time = time.time()
    # Locate the "like" (label 1) column via the model's class order rather than
    # assuming it sits at position 1; fall back to position 1 if it is unknown.
    class_labels = list(getattr(model, 'classes_', []))
    like_column = class_labels.index(1) if 1 in class_labels else 1
    pred_proba = np.ndarray.flatten(model.predict_proba(test_pool)[:, like_column])
    print(f"---Predicting proba: {time.time() - start_time:.2f} seconds.---")

    test_df = test_df.assign(predict=pred)
    test_df = test_df.assign(predict_proba=pred_proba)

    test_df= test_df.sort_values(by=['user_id','predict_proba'],ascending=[True, False])

    if label is not None:
        # The metric ranks by its `predict` column, so give it the probability.
        # The hard class label would rank dislikes (255) above likes (1).
        score = custom_roc_auc_score(test_df.assign(predict=test_df['predict_proba']))
        print(score)
    return test_df

In [None]:
# Score the hold-out split; labels are passed, so the ranking metric is printed.
test_df = myPredict(model, X_test, test_df, y_test)

In [None]:
# Predict for the submission pairs. This rebinds `test_df`, replacing the
# hold-out predictions from the previous cell.
# NOTE(review): cells further down still read a `weight` column from `test_df`,
# which `test_re` is not shown to have - confirm the intended cell order.
test_df = myPredict(model, test_re, test_df=test_re)

In [None]:
# The model's built-in `score` on the hold-out split.
model.score( X_test, y_test)

In [None]:
# Inspect the frame with the attached predictions.
test_df

In [None]:
# Submission frame: drop the metadata columns and the hard class label, keeping
# whatever remains (including `predict_proba`), and write it to disk.
_submission_drop = ['gender', 'age', 'source_id', 'duration', 'predict']
result = pl.from_pandas(test_df).drop(_submission_drop)
result.write_csv("sample_submission.csv")

In [None]:
# Rename the probability column to `predict`. Not re-runnable as is: after the
# first run `result` no longer has a `predict_proba` column.
result = result.rename({"predict_proba": "predict"})

In [None]:
# Second file, written after the column was renamed to `predict`.
result.write_csv(f"sample_submission1.csv")

In [None]:
# Number of rows per predicted class.
test_df.groupby('predict').count()

In [None]:
# Number of rows per true class. This needs a `weight` column, so it applies to
# the hold-out frame, not the submission frame.
test_df.groupby('weight').count()

In [None]:
# Feature importances reported by the fitted model.
model.get_feature_importance()

# Обучение с батчами

In [None]:
def make_batch(train_set, batch_size=None):
    """
    Draw one class-proportional batch from `train_set` and remove it from the pool.

    Classes are read from the `weight` column: 1 = like, 2 = dislike, 0 = ignore.
    Likes and dislikes get a share of the batch proportional to their share of
    `train_set` (rounded down); ignores fill the remainder. Every request is
    capped at the number of rows actually available, so the last batches may be
    smaller than `batch_size` instead of making `sample` raise.

    Parameters:
        train_set  - polars DataFrame with `user_id`, `item_id` and `weight`.
        batch_size - target batch size; defaults to the global BATCH_SIZE.

    Returns `(remaining, batch)`: `train_set` without the rows whose
    (item_id, user_id) key was drawn, and the drawn rows in the order
    likes, dislikes, ignores.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    total = train_set.shape[0]
    if total == 0:
        # Nothing to draw from; avoids a division by zero below.
        return train_set, train_set.head(0)

    likes_all = train_set.filter(pl.col("weight") == 1)
    dislikes_all = train_set.filter(pl.col("weight") == 2)
    ignores_all = train_set.filter(pl.col("weight") == 0)

    # Proportional shares, rounded down and never above what each class holds.
    n_like = min(batch_size * likes_all.height // total, likes_all.height)
    n_dislike = min(batch_size * dislikes_all.height // total, dislikes_all.height)
    n_ignore = max(0, min(batch_size - n_like - n_dislike, ignores_all.height))

    def _take(frame, n):
        # An empty draw keeps the schema, so the concat below stays valid.
        return frame.sample(n=n, seed=RANDOM_STATE) if n else frame.head(0)

    batch = pl.concat([
        _take(likes_all, n_like),
        _take(dislikes_all, n_dislike),
        _take(ignores_all, n_ignore),
    ])

    # One anti-join against the whole batch removes every drawn key.
    remaining = train_set.join(batch, on=["item_id", "user_id"], how='anti')

    return remaining, batch

In [None]:
# Scratch check of the sample-then-anti-join pattern used by make_batch:
# draw two of the rows with a == 3 and remove them from the frame.
df = pl.DataFrame(
    {
        "a": [3, 2, 3, 4, 3],
        "b": [0.5, 0.6, 2.5, 13, 23],
        "c": [True, True, False, False, True],
    }
)
# Seeded so the displayed result is the same on every run.
a = df.filter(pl.col('a') == 3).sample(n=2, seed=RANDOM_STATE)
df = df.join(a, on=['a', "b"], how="anti")
# Display the frame; a DataFrame is not callable, so `df()` raised TypeError.
df

In [None]:
from catboost import CatBoostClassifier, Pool, sum_models
import numpy as np
import time

# Configuration for batch-wise training.
FEATURES = ['user_id','item_id', 'source_id','duration', 'gender', 'age', 'embeddings']
TARGET = ['weight']
EMBEDDINGS = ['embeddings']
CATEGORICAL = ['gender']

BATCH_SIZE = 300000
# Batch bookkeeping read by train_batch. It was commented out, which left
# `first_batch_size` undefined (NameError just below) and `n_batches` missing.
# NOTE(review): sized from `X_tr`, since the frame passed to train_batch is
# built from `X_tr` / `y_tr` - confirm this is the intended data.
n_batches = len(X_tr) // BATCH_SIZE
# Rows left over after cutting whole batches; train_batch skips this many
# leading rows. The former "0 -> BATCH_SIZE" override was dropped because it
# would skip a whole batch when the data already divides evenly.
first_batch_size = len(X_tr) - (n_batches * BATCH_SIZE)

params = {
    'loss_function' : 'MultiClass',
    'iterations' : 500,
    'random_state' : RANDOM_STATE,
    'verbose': 100,
    'task_type' : 'GPU',
    # Fixed so every per-batch model has the same number of classes.
    'classes_count' : 3,
    'auto_class_weights'  : 'SqrtBalanced'
}

def train_batch(train_set):
    """
    Train CatBoost batch by batch and return the summed model.

    `train_set` (polars DataFrame with `user_id`, `item_id`, `weight`) is cut
    into `len(train_set) // BATCH_SIZE` batches by `make_batch`; the leading
    `len(train_set) % BATCH_SIZE` rows are skipped so the rest divides evenly.
    Each batch is enriched with `items_meta` / `users_meta`. Every model after
    the first is fitted with the raw predictions of the models trained so far
    as its baseline, so it learns a correction on top of them.

    Reads the globals BATCH_SIZE, params, FEATURES, TARGET, EMBEDDINGS,
    CATEGORICAL, items_meta and users_meta.

    Returns the `sum_models` combination of all per-batch models.
    Raises ValueError when `train_set` holds fewer than BATCH_SIZE rows.
    """
    def _build_pool(batch):
        # Attach item and user attributes, then wrap the batch in a CatBoost Pool.
        batch = batch.join(items_meta, on='item_id').join(users_meta, on='user_id')
        return Pool(
            data=batch[FEATURES].to_pandas(),
            label=batch[TARGET].to_pandas(),
            embedding_features=EMBEDDINGS,
            cat_features=CATEGORICAL
        )

    # Derived from the data itself instead of notebook globals.
    total_batches = train_set.shape[0] // BATCH_SIZE
    if total_batches == 0:
        raise ValueError(
            f"train_set has {train_set.shape[0]} rows, fewer than BATCH_SIZE={BATCH_SIZE}"
        )
    remaining = train_set[train_set.shape[0] % BATCH_SIZE:]

    ensemble = None
    for batch_no in range(1, total_batches + 1):
        start_time = time.time()

        print('make_batch started')
        remaining, batch = make_batch(remaining)
        train_pool = _build_pool(batch)

        print(f"---Batch {time.time() - start_time :.2f} seconds to complete.---")

        start_time = time.time()
        if ensemble is not None:
            # The baseline must be raw per-class values (not class labels) of
            # everything trained so far, not just of the previous model.
            raw_scores = ensemble.predict(train_pool, prediction_type='RawFormulaVal')
            train_pool.set_baseline(raw_scores)

        model_i = CatBoostClassifier(**params)
        model_i.fit(train_pool)

        first_model = ensemble is None
        # Fold the new model into the running sum; this equals summing them all at the end.
        ensemble = sum_models([model_i] if first_model else [ensemble, model_i])

        elapsed_time = time.time() - start_time
        if first_model:
            print(f"---The first model fitted {elapsed_time :.2f} seconds.---")
        else:
            print(f'Complited {batch_no} / {total_batches}')
            print(f"---{batch_no} model fitted {elapsed_time:.2f} seconds.---")
    return ensemble

In [None]:
# Rebuild the training split as a single frame and recode the dislike label
# 255 -> 2, giving the 0 / 1 / 2 classes that make_batch filters on.
train_b = X_tr.join(y_tr).reset_index(drop=True)
train_b['weight'] = train_b['weight'].mask(train_b['weight'] == 255, 2)
train_b = pl.from_pandas(train_b)



In [None]:
# Release the full interaction frame; the cells below do not reference it.
# Not re-runnable: a second `del` raises NameError.
del train

In [None]:
# Batch-wise training on the recoded training split; rebinds `model`.
model = train_batch(train_b)