In [134]:
#!M
# !pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1
# %pip install --upgrade pip
# %pip install -r /home/jupyter/work/resources/riiidNew/requirements.txt --upgrade
# print("ok")

In [135]:
#!M
import numpy as np
import pandas as pd
import pyarrow
from collections import defaultdict
import datatable as dt
import lightgbm as lgb

from catboost.utils import get_gpu_device_count
from catboost import CatBoostClassifier, Pool

from matplotlib import pyplot as plt
from tqdm import tqdm
# import riiideducation
import torch
import pickle
import gc
from pathlib import Path

# Error handling, ignore all
np.seterr(divide = 'ignore', invalid = 'ignore')

print(f"pyarrow {pyarrow.__version__}")
print(f"curdir {Path.cwd()}")

pyarrow 2.0.0
curdir /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Challenges/riiidNew


In [136]:
#!M

# Data config

dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
    'user_answer': 'int8',
}

target = 'answered_correctly'

homedir = Path.home()
print(str(homedir))

if str(homedir) == "/home/sergey":   # Home computer
    kaggle_path = Path.cwd()/'kaggle_tmp'
    questions_df = pd.read_csv('/mnt/data30G/2020riid/questions.csv', usecols = [0, 3],
                               dtype = {'question_id': 'int16', 'part': 'int8'})
    print(questions_df.head())
    questions_df.set_index(keys="question_id", inplace=True)
    questions_df.index.names = ["content_id"]
    print(questions_df.head())
    # train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns = set(dtypes.keys())).to_pandas()
    # train_df = dt.fread('/mnt/data30G/2020riid/train.csv', columns = set(dtypes.keys())).to_pandas()
    # test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=dtypes.keys())
    train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train_1e5.feather", columns=dtypes.keys())
    # test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=dtypes.keys())
    # train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train.feather", columns=dtypes.keys())
    # test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid.feather", columns=dtypes.keys())

elif str(homedir) == "/root":   # Datasphere
    
    path = Path.cwd()/"riiidNew"/"data"
    kaggle_path = Path.cwd()/"riiidNew"/"kaggle"
    questions_df = pd.read_csv(path/'questions.csv', usecols = [0, 3], dtype = {'question_id': 'int16', 'part': 'int8'})
    questions_df.set_index(keys="question_id", inplace=True)
    questions_df.index.names = ["content_id"]
#     train_df = pd.read_feather(path/"cv1_train_1e5.feather", columns=dtypes.keys())
#     test_df = pd.read_feather(path/"cv1_valid_1e4.feather", columns=dtypes.keys())
    # test_df = pd.read_feather(path/"data/cv1_valid_1e4.feather", columns=dtypes.keys())
    # train_df = pd.read_feather(path/"cv1_train.feather", columns=dtypes.keys())
    # test_df = pd.read_feather(path/"cv1_valid.feather", columns=dtypes.keys())
    # train_df = dt.fread(path/'train.csv', columns=dtypes.keys()).to_pandas().astype(dtypes, errors="ignore")
    train_df = pd.read_pickle(path/"cv1_train.pickle.zip").astype(dtypes, errors="ignore")
    # train_df = train_df.iloc[:int(1e6)]
    # test_df = pd.read_pickle(path/"cv1_valid.pickle.zip").astype(dtypes, errors="ignore")
    # test_df = test_df.iloc[:int(1e4)]

print(f"train_df shape = {train_df.shape}")
train_df.head()

/home/sergey
   question_id  part
0            0     1
1            1     1
2            2     1
3            3     1
4            4     1
            part
content_id      
0              1
1              1
2              1
3              1
4              1
train_df shape = (100000, 10)


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_answer
0,32933156,0,705741139,128,0,0,1,,,0
1,32933157,20666,705741139,7860,0,1,1,16000.0,False,0
2,32933158,39172,705741139,7922,0,2,1,19000.0,False,1
3,32933159,58207,705741139,156,0,3,1,17000.0,False,2
4,32933160,75779,705741139,51,0,4,1,17000.0,False,0


* Information of the training dataset

In [137]:
#!M
sep = '*' * 50
print(f'Training dataset detailed information \n{sep}')
print(f'Columns: {train_df.columns} \n{sep}')
print(f'Shape: {train_df.shape} \n{sep}')
print(f'NA values in each column: {sum(train_df.isna().sum())} \n{sep}')


def prep_data(df, questions_df=questions_df, dtypes=dtypes):
    # Exclude lectures
    df = df[df[target] != -1].reset_index(drop=True, inplace=False)
    # Fill NaN values in the 'prior_question_had_explanation' columns
    df['prior_question_had_explanation'].fillna(False, inplace=True)

    # Set type
    df = df.astype(dtypes)
    
    # Answer for the previous questions of users
    df['lag'] = df.groupby('user_id')[target].shift()
    # For each user (groupby('user_id')), compute the cummulative number of correct answers and number answers in general
    groupby = df.groupby('user_id')['lag']
    cum = groupby.agg(['cumsum', 'cumcount'])

    # User correctness (measure the users' learning progress)
    df['user_correctness'] = cum['cumsum'] / cum['cumcount']
    # Drop the 'lag' feature
    df.drop(columns=['lag'], inplace=True)
    df.head()    
    
    # Overall correctness of users
    user_agg = df.groupby('user_id')[target].agg(['sum', 'count'])
    # Overall difficulty of questions
    content_agg = df.groupby('content_id')[target].agg(['sum', 'count'])    

    # Take only 24 last observations of each user
    df = df.groupby('user_id').tail(24).reset_index(drop=True)
    
    df = df.join(questions_df, on='content_id' )
    # df = pd.merge(df, questions_df, left_on='content_id', right_on='question_id', how='left')
    # df.drop(columns=['question_id'], inplace=True)

    # How many questions have been answered in each content ID?
    df['content_count'] = df['content_id'].map(content_agg['count']).astype('int32')
    # How hard are questions in each content ID?
    df['content_difficalty'] = df['content_id'].map(content_agg['sum'] / content_agg['count'])
    # df.drop('content_id', inplace=True, axis=1)

    return user_agg, content_agg, df
    

Training dataset detailed information 
**************************************************
Columns: Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'user_answer'],
      dtype='object') 
**************************************************
Shape: (100000, 10) 
**************************************************
NA values in each column: 5347 
**************************************************


# Extract the validation set

In [138]:
#!M
train_user_agg, train_content_agg, train_df = prep_data(train_df)

# user_sum_dict = train_user_agg['sum'].astype('int16').to_dict(defaultdict(int))
# user_count_dict = train_user_agg['count'].astype('int16').to_dict(defaultdict(int))
# content_sum_dict = train_content_agg['sum'].astype('int32').to_dict(defaultdict(int))
# content_count_dict = train_content_agg['count'].astype('int32').to_dict(defaultdict(int))
# gc.collect()


# Ratio is 6/24 = 25%
valid_df = train_df.groupby('user_id').tail(6)
train_df.drop(valid_df.index, inplace = True)

# Training

In [139]:
#!L
# features = ['content_difficalty', "content_id", 'prior_question_elapsed_time',
#             'prior_question_had_explanation', 'user_correctness',
#             'part', 'content_count']

features = ['content_difficalty', 'user_id', 'prior_question_elapsed_time',
            'prior_question_had_explanation', 'user_correctness',
            'part', 'content_count']

params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'custom_metric': 'AUC:hints=skip_train~false',

    'task_type': 'CPU',  # 'GPU' if get_gpu_device_count() > 0 else 'CPU',
    # 'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'grow_policy': 'Lossguide',
    'iterations': 2,
    'learning_rate': 4e-3,
    'random_seed': 0,
    'l2_leaf_reg': 5e-1,
    'depth': 10,
    #'rsm': 0.3,
    # 'max_leaves': 10,
    'border_count': 128,
    'verbose': 150,
    'od_type': 'Iter',
    'od_wait': 50,
}
print(params)

# Training and validating data
train_set = Pool(train_df[features], label = train_df[target])
val_set = Pool(valid_df[features], label = valid_df[target])
# val_set = Pool(test_df[features], label = test_df[target])

# Model definition
model = CatBoostClassifier(**params)

# Fitting
model.fit(train_set, eval_set = val_set, use_best_model = True)
print(f"kaggle_path {kaggle_path}")

model.save_model(str(kaggle_path/'catboost.model'))

{'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': 'AUC:hints=skip_train~false', 'task_type': 'CPU', 'grow_policy': 'Lossguide', 'iterations': 2, 'learning_rate': 0.004, 'random_seed': 0, 'l2_leaf_reg': 0.5, 'depth': 10, 'border_count': 128, 'verbose': 150, 'od_type': 'Iter', 'od_wait': 50}
0:	learn: 0.7758269	test: 0.7631521	best: 0.7631521 (0)	total: 35.7ms	remaining: 35.7ms
1:	learn: 0.7812295	test: 0.7702233	best: 0.7702233 (1)	total: 63.3ms	remaining: 0us

bestTest = 0.7702233275
bestIteration = 1

kaggle_path /home/sergey/mnt/st1500/Usr/Sergey/TheJob/Challenges/riiidNew/kaggle_tmp



{'loss_function': 'Logloss', 'eval_metric': 'AUC', 'task_type': 'GPU', 'grow_policy': 'Lossguide', 'iterations': 15000, 'learning_rate': 0.01, 'random_seed': 0, 'l2_leaf_reg': 0.1, 'depth': 8, 'border_count': 128, 'verbose': 150, 'od_type': 'Iter', 'od_wait': 50}
bestTest = 0.7371392846

bestTest = 0.7370638549
bestTest = 0.7364509702


In [140]:
model.get_best_score()

{'learn': {'Logloss': 0.6912894493875078, 'AUC': 0.7812294949729874},
 'validation': {'Logloss': 0.6914273813025933, 'AUC': 0.7702233274556736}}

In [141]:
train_df.head()
del train_df
gc.collect()

95

# Inference

In [142]:
#!M
mean_dict = {
    'user_sum_dict': train_user_agg['sum'].astype('int16').to_dict(defaultdict(int)),
    'user_count_dict': train_user_agg['count'].astype('int16').to_dict(defaultdict(int)),
    'content_sum_dict': train_content_agg['sum'].astype('int32').to_dict(defaultdict(int)),
    'content_count_dict': train_content_agg['count'].astype('int32').to_dict(defaultdict(int)),

}

# for filename, dic in mean_dict.items():
#     with open(f'{kaggle_path}/{filename}.pickle', 'wb') as handle:
#         pickle.dump(dic, handle)

# with open(f"{kaggle_path/'mean_dict.pickle'}", 'wb') as handle:
#     pickle.dump(mean_dict, handle)

In [143]:
user_df = pd.DataFrame(data={
    'sum_': train_user_agg['sum'].astype('int32'),
    'count_': train_user_agg['count'].astype('int32')
})
print("user_df")
user_df.head()

user_df


Unnamed: 0_level_0,sum_,count_
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
44331,4,7
1084314,21,33
1250518,15,50
1744476,3,7
2393889,22,30


In [144]:
content_df = pd.DataFrame(data={
    'sum_': train_content_agg['sum'].astype('int32'),
    'count_': train_content_agg['count'].astype('int32')
})
print("content_df")
content_df.head()


content_df


Unnamed: 0_level_0,sum_,count_
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9,10
1,9,9
2,33,70
3,23,32
4,66,117


In [145]:
test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=dtypes.keys())

test_user_agg, test_content_agg, test_df = prep_data(test_df)
print(test_df.columns)

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'user_answer', 'user_correctness', 'part', 'content_count',
       'content_difficalty'],
      dtype='object')


In [146]:
test_df.drop(labels=['user_correctness', 'content_count', 'content_difficalty'], axis=1, inplace=True)
gc.collect()

test_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_answer,part
0,45253,2146476402,1186307,4451,0,370,1,10000.0,True,3,5
1,91995999,4916060626,1951897185,8534,0,504,0,47000.0,True,3,5
2,99237056,33806,2105538840,5631,0,1,1,21000.0,False,2,5
3,55360606,701477552,1174413790,5168,0,221,0,69000.0,True,0,5
4,44070864,11040265647,932625141,412,0,390,1,16000.0,True,3,2


In [147]:
%%time

validaten_flg = True
if validaten_flg:
    from emulator import Iter_Valid
    iter_test = Iter_Valid(test_df,max_user=1000)
    predicted = []
    def set_predict(df):
        predicted.append(df)
else:
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

CPU times: user 26.3 ms, sys: 0 ns, total: 26.3 ms
Wall time: 26.3 ms


In [148]:
# cumcount = sum([len(df) for df in predicted])
# count = 0
# pbar = tqdm(total=cumcount)
# previous_test_df = None
# for (current_test, current_prediction_df) in iter_test:
#     count+=1
#     if previous_test_df is not None:
#         answers = eval(current_test["prior_group_answers_correct"].iloc[0])
#         responses = eval(current_test["prior_group_responses"].iloc[0])
#         previous_test_df['answered_correctly'] = answers
#         previous_test_df['user_answer'] = responses
#         # your feature extraction and model training code here
#     previous_test_df = current_test.copy()
#     current_test = current_test[current_test.content_type_id == 0]
#     # your prediction code here
#     current_test['answered_correctly'] = model.predict(current_test[features])  # 0.5
#     set_predict(current_test.loc[:,['row_id', 'answered_correctly']])
#     pbar.update(len(current_test))
# print(f"count {count} {len(predicted)}")

In [149]:
%%time

prior_test_df = None
count = 0
for (test_df, sample_prediction_df) in iter_test:
    print("count loop starting", count)

    if prior_test_df is not None:

        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])

        prior_user_id_idx = prior_test_df.index.get_level_values("user_id")
        content_id_idx = prior_test_df.index.get_level_values("content_id")

        new_users = prior_test_df.loc[~prior_user_id_idx.isin(user_df.index)]
        new_user_id_idx = new_users.index.get_level_values("user_id").drop_duplicates()

        if len(new_users) > 0:
            new_users_df = pd.DataFrame(data=0, index=new_user_id_idx, columns=user_df.columns, )
            user_df = user_df.append(new_users_df, sort=True)

        user_df.sum_ = \
            user_df.sum_.add(prior_test_df[target], fill_value=0, level="user_id")
        #
        # user_df.count_ = \
        #     user_df.count_.add(1, fill_value=0, level="user_id")
        #
        # content_df.sum_ = \
        #     content_df.sum_.add(prior_test_df[target], fill_value=0, level="content_id")
        #
        # content_df.count_ = \
        #     user_df.count_.add(1, fill_value=0, level="content_id")


    test_df = test_df[test_df['content_type_id'] == 0]  # .reset_index(drop=True)
    test_df.set_index(["user_id", "content_id"], inplace=True)
    prior_test_df = test_df.copy()

    test_df.drop(labels="part", axis=1, inplace=True)

    if count > 1:
        print(test_df.index.is_unique)
    test_df = test_df.join(questions_df, how='left')

    user_correctness = pd.Series(data=(user_df.sum_ / user_df.count_).values, name="user_correctness")
    user_correctness.index.names = ["user_id"]

    content_difficalty = pd.Series(data=(content_df.sum_ / content_df.count_).values, name="content_difficalty")
    content_difficalty.index.names = ["content_id"]

    content_count = pd.Series(data=content_df.count_.values, name="content_count")
    content_count.index.names = ["content_id"]

    test_df = test_df.join(content_count)
    test_df = test_df.join(content_difficalty)
    test_df = test_df.join(user_correctness)

    test_df.reset_index(inplace=True)

    test_df[target] = model.predict_proba(test_df[features])[:,1]
    set_predict(test_df[['row_id', target]])

    print("count loop finished", count)
    count += 1

count loop starting 0
count loop finished 0
count loop starting 1
count loop finished 1
count loop starting 2
True
True
count loop finished 2
count loop starting 3
True
True
count loop finished 3
count loop starting 4
True
True
count loop finished 4
count loop starting 5
True
True
count loop finished 5
count loop starting 6
True
True
count loop finished 6
count loop starting 7




TypeError: Join on level between two MultiIndex objects is ambiguous

In [150]:
# %%time
#
# prior_test_df = None
# count = 0
# for (test_df, sample_prediction_df) in iter_test:
#     if prior_test_df is not None:
#         prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
#         prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
#
#         # user_ids = prior_test_df['user_id'].values
#         # content_ids = prior_test_df['content_id'].values
#         # targets = prior_test_df[target].values
#
#         for idx, (user_id, content_id, target) in prior_test_df[["user_id", "content_id", target]].iterrows():
#         # for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
#             mean_dict['user_sum_dict'][user_id] += target
#             mean_dict['user_count_dict'][user_id] += 1
#             mean_dict['content_sum_dict'][content_id] += target
#             mean_dict['content_count_dict'][content_id] += 1
#
#     if count < 1:
#         print(type(mean_dict['user_sum_dict']))
#         # mean_df = pd.DataFrame.from_dict(mean_dict['user_sum_dict'])
#         count = 1
#         # print(mean_df)
#
#     prior_test_df = test_df.copy()
#
#     test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
#     test_df.drop(labels="part", axis=1, inplace=True)
#     test_df.content_id = test_df.content_id.astype(int)
#
#     test_df = pd.merge(test_df, questions_df, left_on = 'content_id',
#                        right_on = 'question_id', how = 'left', right_index=True)
#     test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
#
#     user_sum = np.zeros(len(test_df), dtype = np.int16)
#     user_count = np.zeros(len(test_df), dtype = np.int16)
#     content_sum = np.zeros(len(test_df), dtype = np.int32)
#     content_count = np.zeros(len(test_df), dtype = np.int32)
#
#     # "for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
#     for i, (user_id, content_id) in test_df[['user_id', 'content_id']].iterrows():
#         user_sum[i] = mean_dict['user_sum_dict'][user_id]
#         user_count[i] = mean_dict['user_count_dict'][user_id]
#         content_sum[i] = mean_dict['content_sum_dict'][content_id]
#         content_count[i] = mean_dict['content_count_dict'][content_id]
#
#     test_df['user_correctness'] = user_sum / user_count
#     test_df['content_count'] = content_count
#     test_df['content_id'] = content_sum / content_count
#
#     test_df[target] = model.predict_proba(test_df[features])[:,1]
#     set_predict(test_df[['row_id', target]])
#

In [151]:
print("len(predicted)", len(predicted))

print(predicted[0])

len(predicted) 7
     row_id  answered_correctly
0     45253            0.498758
1  91995999            0.497171
2  99237056            0.497832
3  55360606            0.502441
4  44070864            0.498758


In [152]:
test_df.shape

(5, 13)