In [1]:
#!M
# !pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1
# %pip install --upgrade pip
# %pip install -r /home/jupyter/work/resources/riiidNew/requirements.txt --upgrade
# print("ok")

In [2]:
#!M
import numpy as np
import pandas as pd
import pyarrow
from collections import defaultdict
import datatable as dt
import lightgbm as lgb

from catboost.utils import get_gpu_device_count
from catboost import CatBoostClassifier, Pool

from matplotlib import pyplot as plt
from tqdm import tqdm
# import riiideducation
import torch
import pickle
import gc
from pathlib import Path

from preprocess import preprocess_train_data
from catboost_bayesian_search import bayesian_catboost_search, bayesian_catboost_searchCV

# Silence NumPy floating-point warnings for division by zero and for invalid
# operations (e.g. 0/0); other error categories keep their default handling.
np.seterr(divide = 'ignore', invalid = 'ignore')

# Record the environment this run was executed in.
print(f"pyarrow {pyarrow.__version__}")
print(f"curdir {Path.cwd()}")

pyarrow 2.0.0
curdir /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Challenges/riiidNew


In [3]:
#!M

# Data config

# Columns of the raw data that are loaded; on Datasphere the frames are also
# cast to these dtypes.
dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'bool',
    'user_answer': 'int8',
}

target = 'answered_correctly'

homedir = Path.home()
print(str(homedir))

# The notebook runs on two machines that are told apart by the home directory.
# Each branch must define kaggle_path, questions_df, train_df and test_df,
# because the cells below use all four.
if str(homedir) == "/home/sergey":   # Home computer
    kaggle_path = Path.cwd()/"kaggle_tmp/"
    questions_df = pd.read_csv('/mnt/data30G/2020riiid/questions.csv', usecols = [0, 3], dtype = {'question_id': 'int16', 'part': 'int8'})
    # Small CV samples for fast iteration; the full-size splits are
    # cv1_train.feather / cv1_valid.feather in the same directory.
    test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=list(dtypes))
    train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train_1e5.feather", columns=list(dtypes))

elif str(homedir) == "/root":   # Datasphere

    path = Path.cwd()/"riiidNew"/"data"
    kaggle_path = Path.cwd()/"riiidNew"/"kaggle/"
    questions_df = pd.read_csv(path/'questions.csv', usecols = [0, 3], dtype = {'question_id': 'int16', 'part': 'int8'})
    # Load the training split that pairs with the validation pickle; without it
    # train_df is undefined on this machine and the next statements fail.
    # NOTE(review): assumes cv1_train.pickle.zip sits next to cv1_valid.pickle.zip — confirm.
    train_df = pd.read_pickle(path/"cv1_train.pickle.zip").astype(dtypes, errors="ignore")
    # train_df = train_df.iloc[:int(1e6)]
    test_df = pd.read_pickle(path/"cv1_valid.pickle.zip").astype(dtypes, errors="ignore")
    # test_df = test_df.iloc[:int(1e4)]

else:
    # Fail early with a clear message instead of a NameError further down.
    raise RuntimeError(
        f"Unknown environment (home directory {homedir}): "
        "add a branch that defines kaggle_path, questions_df, train_df and test_df"
    )

print(f"train_df shape = {train_df.shape}")

/home/sergey
train_df shape = (100000, 10)


In [4]:
#!M
# preprocess_train_data comes from the local `preprocess` module and returns
# three objects. The first two are later indexed by 'sum' and 'count' to seed
# the inference-time counters; the third replaces train_df.
# Because train_df is rebound here, re-running this cell requires re-running
# the data-loading cell first.
train_user_agg, train_content_agg, train_df = preprocess_train_data(train_df, questions_df, target, dtypes)
# test_user_agg, test_content_agg, test_df = preprocess_train_data(test_df, questions_df, target, dtypes)

In [5]:
#!L
# Columns fed to the model.
features = [
    'content_id', 'prior_question_elapsed_time',
    'prior_question_had_explanation', 'user_correctness',
    'part', 'content_count'
]

# Subset of `features` that CatBoost treats as categorical.
cat_features = [
    'prior_question_had_explanation',
    'part',
]

# CatBoost base parameters; the entries that also appear in `pds` are
# overridden by the values the Bayesian search proposes.
prior_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'custom_metric': 'AUC:hints=skip_train~false',

    # GPU only on Datasphere (home directory /root); CPU everywhere else.
    'task_type': 'GPU' if str(homedir) == "/root" else 'CPU',
    # 'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'grow_policy': 'Lossguide',
    'iterations': 2,
    'learning_rate': 4e-3,
    'random_seed': 0,
    'l2_leaf_reg': 5e-1,
    'depth': 10,
    'max_leaves': 10,
    'border_count': 128,
    'verbose': 150,
    'od_type': 'Iter',
    'od_wait': 50,
}

# Search intervals [low, high] for the Bayesian optimisation.
pds = {
    'iterations': [5, 5],
    'l2_leaf_reg': [2e-1, 5e2],
    # CatBoost documents a maximum tree depth of 16, so the upper bound must
    # not exceed it: a larger proposal would abort the search with an error.
    'depth': [8, 16],
    'max_leaves': [10, 150],
    'border_count': [50, 300],

}

# The optimiser proposes floats; these callables cast each parameter back to
# the type CatBoost expects.
pds_dtypes = {
    'iterations': int,
    'l2_leaf_reg': float,
    'depth': int,
    'max_leaves': int,
    'border_count': int,

}

# Training data
train_set = Pool(train_df[features], label = train_df[target], cat_features=cat_features)
# val_set = Pool(test_df[features], label = test_df[target])


print("optimizer_maxCV")
optimizer_maxCV = bayesian_catboost_searchCV(
    train_set,
    prior_params=prior_params,
    pds=pds, pds_dtypes=pds_dtypes,
    init_points=5, n_iter=7, verbose=False,
)
# Display the best result found (a dict with 'target' and 'params').
optimizer_maxCV

optimizer_maxCV
0:	learn: 0.7621390	test: 0.7731304	best: 0.7731304 (0)	total: 259ms	remaining: 1.03s
4:	learn: 0.7714667	test: 0.7812731	best: 0.7812731 (4)	total: 1.02s	remaining: 0us
0:	learn: 0.7601587	test: 0.7696963	best: 0.7696963 (0)	total: 205ms	remaining: 820ms
4:	learn: 0.7718994	test: 0.7800280	best: 0.7802803 (3)	total: 924ms	remaining: 0us
0:	learn: 0.7510965	test: 0.7610383	best: 0.7610383 (0)	total: 188ms	remaining: 752ms
4:	learn: 0.7677083	test: 0.7781958	best: 0.7781958 (4)	total: 821ms	remaining: 0us
0:	learn: 0.7660736	test: 0.7700233	best: 0.7700233 (0)	total: 230ms	remaining: 919ms
4:	learn: 0.7761029	test: 0.7818350	best: 0.7818350 (4)	total: 1.07s	remaining: 0us
0:	learn: 0.7592534	test: 0.7696615	best: 0.7696615 (0)	total: 189ms	remaining: 756ms
4:	learn: 0.7707433	test: 0.7800399	best: 0.7800399 (4)	total: 857ms	remaining: 0us
0:	learn: 0.7627999	test: 0.7700102	best: 0.7700102 (0)	total: 247ms	remaining: 990ms
4:	learn: 0.7756123	test: 0.7826225	best: 0.7826

{'target': 0.7826224717081707,
 'params': {'border_count': 59.24590839640105,
  'depth': 8.133500024364942,
  'iterations': 5.0,
  'l2_leaf_reg': 390.4505023285588,
  'max_leaves': 103.55240947973235}}

In [6]:
# The optimiser reports every tuned value as a float; cast each one with the
# matching callable from pds_dtypes (int or float) before handing it to CatBoost.
pds_fitted = {
    key: pds_dtypes[key](optimizer_maxCV["params"][key])
    for key in optimizer_maxCV["params"]
}
# Tuned values take precedence over the base configuration.
params = {**prior_params, **pds_fitted}
params

{'loss_function': 'Logloss',
 'eval_metric': 'AUC',
 'custom_metric': 'AUC:hints=skip_train~false',
 'task_type': 'CPU',
 'grow_policy': 'Lossguide',
 'iterations': 5,
 'learning_rate': 0.004,
 'random_seed': 0,
 'l2_leaf_reg': 390.4505023285588,
 'depth': 8,
 'max_leaves': 103,
 'border_count': 59,
 'verbose': 150,
 'od_type': 'Iter',
 'od_wait': 50}

In [7]:
# Ratio is 6/24 = 25%
# valid_df = train_df.groupby('user_id').tail(6)
# train_df.drop(valid_df.index, inplace = True)

# val_set = Pool(valid_df[features], label = valid_df[target])

# Model definition, using the base parameters merged with the tuned ones
model = CatBoostClassifier(**params)

# Fitting on the full training pool (no validation set is passed)
model.fit(train_set)
print(f"kaggle_path {kaggle_path}")

# Make sure the output directory exists before writing: nothing else in the
# notebook creates it, so saving would otherwise fail on a fresh checkout.
kaggle_path.mkdir(parents=True, exist_ok=True)
model.save_model(str(kaggle_path / 'catboost.model'))

0:	learn: 0.7652921	total: 79.5ms	remaining: 318ms
4:	learn: 0.7800193	total: 402ms	remaining: 0us
kaggle_path /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Challenges/riiidNew/kaggle_tmp


In [8]:
# NOTE(review): this stops execution at this point, so a Restart & Run All
# presumably never reaches the inference cells below. Confirm this is
# intentional; if it is a leftover debugging stop, remove the cell.
exit(1)


{'loss_function': 'Logloss', 'eval_metric': 'AUC', 'task_type': 'GPU', 'grow_policy': 'Lossguide', 'iterations': 15000, 'learning_rate': 0.01, 'random_seed': 0, 'l2_leaf_reg': 0.1, 'depth': 8, 'border_count': 128, 'verbose': 150, 'od_type': 'Iter', 'od_wait': 50}
bestTest = 0.7371392846

bestTest = 0.7370638549
bestTest = 0.7364509702


In [9]:
# Best metric values recorded for the fitted model. Only the 'learn' set is
# reported because the model was fitted without a validation set.
model.get_best_score()

{'learn': {'Logloss': 0.6907528451690224, 'AUC': 0.7800192517423378}}

# Inference

In [10]:
#!M
# Turn the 'sum' and 'count' columns of each aggregate into defaultdict(int)
# lookups, so that ids missing from the training data read as 0.
user_sum_dict, user_count_dict = (
    train_user_agg[column].astype('int32').to_dict(defaultdict(int))
    for column in ('sum', 'count')
)
content_sum_dict, content_count_dict = (
    train_content_agg[column].astype('int32').to_dict(defaultdict(int))
    for column in ('sum', 'count')
)

# Persist every lookup next to the saved model, one pickle file per dict.
for filename, dic in (
    ("user_sum_dict", user_sum_dict),
    ("user_count_dict", user_count_dict),
    ("content_sum_dict", content_sum_dict),
    ("content_count_dict", content_count_dict),
):
    with open(f'{kaggle_path}/{filename}.pickle', 'wb') as handle:
        pickle.dump(dic, handle)


In [11]:
# Release the training frame: the cells below only use the fitted model and
# the counter dicts built from the aggregates.
# After this cell, earlier cells that read train_df need the data-loading and
# preprocessing cells to be re-run first.
del train_df
gc.collect()


645

In [12]:
%%time
validaten_flg = True
if validaten_flg:
    from emulator import Iter_Valid
    iter_test = Iter_Valid(test_df,max_user=1000)
    predicted = []
    def set_predict(df):
        predicted.append(df)
else:
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

CPU times: user 27.3 ms, sys: 7.74 ms, total: 35 ms
Wall time: 35.2 ms


In [13]:
# cumcount = sum([len(df) for df in predicted])
# count = 0
# pbar = tqdm(total=cumcount)
# previous_test_df = None
# for (current_test, current_prediction_df) in iter_test:
#     count+=1
#     if previous_test_df is not None:
#         answers = eval(current_test["prior_group_answers_correct"].iloc[0])
#         responses = eval(current_test["prior_group_responses"].iloc[0])
#         previous_test_df['answered_correctly'] = answers
#         previous_test_df['user_answer'] = responses
#         # your feature extraction and model training code here
#     previous_test_df = current_test.copy()
#     current_test = current_test[current_test.content_type_id == 0]
#     # your prediction code here
#     current_test['answered_correctly'] = model.predict(current_test[features])  # 0.5
#     set_predict(current_test.loc[:,['row_id', 'answered_correctly']])
#     pbar.update(len(current_test))
# print(f"count {count} {len(predicted)}")

In [14]:
%%time

prior_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values

        # for user_id, content_id, answered_correctly in prior_test_df[["user_id", "content_id", target]].values:
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    test_df.drop(labels="part", axis=1, inplace=True)
    test_df.content_id = test_df.content_id.astype(int)
    
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    user_sum = np.zeros(len(test_df), dtype = np.int32)
    user_count = np.zeros(len(test_df), dtype = np.int32)
    content_sum = np.zeros(len(test_df), dtype = np.int32)
    content_count = np.zeros(len(test_df), dtype = np.int32)
    
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    set_predict(test_df[['row_id', target]])

KeyError: "['part'] not found in axis"