# Use the package 'datatable' for fast handling

In [1]:
# !pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1
# %pip install --upgrade pip
# %pip install -r /home/jupyter/work/resources/riiidNew/requirements.txt --upgrade 
# %pip install --upgrade wheel
# %pip install --upgrade pyarrow==2.0.0


# Necessary packages

In [2]:
import numpy as np
import pandas as pd
import pyarrow
from collections import defaultdict
# import datatable as dt
import lightgbm as lgb

from catboost.utils import get_gpu_device_count
from catboost import CatBoostClassifier, Pool

from matplotlib import pyplot as plt
from tqdm import tqdm
# import riiideducation
import torch
import pickle
import gc
from pathlib import Path

import sys

# Error handling, ignore all
np.seterr(divide = 'ignore', invalid = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

# Preprocessing

* Data config

In [3]:
dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
    'user_answer': 'int8',
}

target = 'answered_correctly'

* Import data

In [4]:
#!M
%%time

homedir = Path.home()

if str(homedir) == "/home/sergey":   # Home computer
    kaggle_path = "kaggle/"
    questions_df = pd.read_csv('/mnt/data30G/2020riid/questions.csv', usecols = [0, 3], dtype = {'question_id': 'int16', 'part': 'int8'})
    # train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns = set(dtypes.keys())).to_pandas()
    # train_df = dt.fread('/mnt/data30G/2020riid/train.csv', columns = set(dtypes.keys())).to_pandas()
    test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=dtypes.keys())
    train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train_1e5.feather", columns=dtypes.keys())
    # valid_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=dtypes.keys())
    # train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train.feather", columns=dtypes.keys())
    # valid_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid.feather", columns=dtypes.keys())

elif str(homedir) == "/root":   # Datasphere
    
    path = Path.cwd()/"riiidNew"/"data"
    kaggle_path = Path.cwd()/"riiidNew"/"kaggle"
    questions_df = pd.read_csv(path/'questions.csv', usecols = [0, 3], dtype = {'question_id': 'int16', 'part': 'int8'})
#     train_df = pd.read_feather(path/"cv1_train_1e5.feather", columns=dtypes.keys())
#     valid_df = pd.read_feather(path/"cv1_valid_1e4.feather", columns=dtypes.keys())
    # valid_df = pd.read_feather(path/"data/cv1_valid_1e4.feather", columns=dtypes.keys())
    # train_df = pd.read_feather(path/"cv1_train.feather", columns=dtypes.keys())
    # valid_df = pd.read_feather(path/"cv1_valid.feather", columns=dtypes.keys())
    train_df = pd.read_pickle(path/"cv1_train.pickle.zip").astype(dtypes, errors="ignore")
    train_df = train_df.iloc[:int(1e6)]
    valid_df = pd.read_pickle(path/"cv1_valid.pickle.zip").astype(dtypes, errors="ignore")
    valid_df = valid_df.iloc[:int(1e4)]


UsageError: Line magic function `%%time` not found.


* Information of the training dataset

In [None]:
#!M
sep = '*' * 50
print(f'Training dataset detailed information \n{sep}')
print(f'Columns: {train_df.columns} \n{sep}')
print(f'Shape: {train_df.shape} \n{sep}')
print(f'NA values in each column: {sum(train_df.isna().sum())} \n{sep}')


In [None]:
def prep_data(df, questions_df=questions_df):
    # Exclude lectures
    df = df[df[target] != -1].reset_index(drop = True, inplace = False)
    # Fill NaN values in the 'prior_question_had_explanation' columns
    df['prior_question_had_explanation'].fillna(False, inplace = True)
    # Set type
    df = df.astype(dtypes)
    
    # Answer for the previous questions of users
    df['lag'] = df.groupby('user_id')[target].shift()
    # For each user (groupby('user_id')), compute the cummulative number of correct answers and number answers in general
    groupby = df.groupby('user_id')['lag']
    cum = groupby.agg(['cumsum', 'cumcount'])

    # User correctness (measure the users' learning progress)
    df['user_correctness'] = cum['cumsum'] / cum['cumcount']
    # Drop the 'lag' feature
    df.drop(columns = ['lag'], inplace = True)
    df.head()    
    
    # Overall correctness of users
    user_agg = df.groupby('user_id')[target].agg(['sum', 'count'])
    # Overall difficulty of questions
    content_agg = df.groupby('content_id')[target].agg(['sum', 'count'])    

    # Take only 24 last observations of each user
    df = df.groupby('user_id').tail(24).reset_index(drop = True)
    
    df = pd.merge(df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    df.drop(columns = ['question_id'], inplace = True)

    # How many questions have been answered in each content ID?
    df['content_count'] = df['content_id'].map(content_agg['count']).astype('int32')
    # How hard are questions in each content ID?
    df['content_id'] = df['content_id'].map(content_agg['sum'] / content_agg['count'])
    
    return user_agg, content_agg, df
    

# Extract the validation set

In [None]:
#!M
%%time
train_user_agg, train_content_agg, train_df = prep_data(train_df)

user_sum_dict = train_user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = train_user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = train_content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = train_content_agg['count'].astype('int32').to_dict(defaultdict(int))
gc.collect()

In [None]:
#!M


In [None]:
#!M
%%time
valid_user_agg, valid_content_agg, valid_df = prep_data(valid_df)
gc.collect()

In [None]:
#!M


In [None]:
# Ratio is 6/24 = 25%
# valid_df = train_df.groupby('user_id').tail(6)
# train_df.drop(valid_df.index, inplace = True)

# Training

* Construct data

In [None]:
#!L
features = ['content_id', 'prior_question_elapsed_time', 
            'prior_question_had_explanation', 'user_correctness', 
            'part', 'content_count']

params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',

    'task_type': 'GPU' if get_gpu_device_count() > 0 else 'CPU',
    # 'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'grow_policy': 'Lossguide',
    'iterations': 25000,
    'learning_rate': 4e-3,
    'random_seed': 0,
    'l2_leaf_reg': 1e-1,
    'depth': 16,
    # 'max_leaves': 10,
    'border_count': 128,
    'verbose': 50,
    'od_type': 'Iter',
    'od_wait': 100,
}

print(params)

In [None]:
#!L
%%time

# Training and validating data
train_set = Pool(train_df[features], label = train_df[target])
val_set = Pool(valid_df[features], label = valid_df[target])

# Model definition
model = CatBoostClassifier(**params)

# Fitting
model.fit(train_set, eval_set = val_set, use_best_model = True, plot=True)

In [None]:
model.save_model(f"{kaggle_path/'model1.cbm'}")

# Inference

In [None]:
#!M
%%time
user_sum_dict = train_user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = train_user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = train_content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = train_content_agg['count'].astype('int32').to_dict(defaultdict(int))

In [None]:
for filename, dic in zip(["user_sum_dict", "user_count_dict", "content_sum_dict", "content_count_dict"],
                         [user_sum_dict, user_count_dict, content_sum_dict, content_count_dict]):

    with open(f'{kaggle_path/filename}.pickle', 'wb') as handle:
        pickle.dump(dic, handle)

In [None]:
train_df.head()


In [None]:
del train_df
gc.collect()


In [None]:
print(valid_df.shape)
valid_df.head()

In [None]:
%%time
# test_df = pd.read_pickle("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train.pickle.zip")
# test_df = test_df.iloc[:int(1e5)]

In [None]:
%%time

validaten_flg = True
if validaten_flg:
    sys.path.append(str(Path.cwd()/"riiidNew"))
    from emulator import Iter_Valid
    iter_test = Iter_Valid(valid_df,max_user=1000)
    predicted = []
    def set_predict(df):
        predicted.append(df)
else:
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

In [None]:
# cumcount = sum([len(df) for df in predicted])
# count = 0
# pbar = tqdm(total=cumcount)
# previous_test_df = None
# for (current_test, current_prediction_df) in iter_test:
#     count+=1
#     if previous_test_df is not None:
#         answers = eval(current_test["prior_group_answers_correct"].iloc[0])
#         responses = eval(current_test["prior_group_responses"].iloc[0])
#         previous_test_df['answered_correctly'] = answers
#         previous_test_df['user_answer'] = responses
#         # your feature extraction and model training code here
#     previous_test_df = current_test.copy()
#     current_test = current_test[current_test.content_type_id == 0]
#     # your prediction code here
#     current_test['answered_correctly'] = model.predict(current_test[features])  # 0.5
#     set_predict(current_test.loc[:,['row_id', 'answered_correctly']])
#     pbar.update(len(current_test))
# print(f"count {count} {len(predicted)}")

In [None]:
valid_df

In [None]:
%%time

prior_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    test_df.drop(labels="part", axis=1, inplace=True)
    test_df.content_id = test_df.content_id.astype(int)
    
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    user_sum = np.zeros(len(test_df), dtype = np.int16)
    user_count = np.zeros(len(test_df), dtype = np.int16)
    content_sum = np.zeros(len(test_df), dtype = np.int32)
    content_count = np.zeros(len(test_df), dtype = np.int32)
    
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    set_predict(test_df[['row_id', target]])