# Use the package 'datatable' for fast handling

In [1]:
# !pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

# Necessary packages

In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
from tqdm import tqdm
# import riiideducation
import torch
import pickle
import gc

# Error handling, ignore all
np.seterr(divide = 'ignore', invalid = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
kaggle_path = "kaggle/"

# Preprocessing

* Data config

In [4]:
data_types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
    'user_answer': 'int8',
}

target = 'answered_correctly'

* Import data

In [5]:
%%time

questions_df = pd.read_csv(
    '/mnt/data30G/2020riid/questions.csv',
    usecols = [0, 3],
    dtype = {'question_id': 'int16', 'part': 'int8'}
)

# train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns = set(data_types_dict.keys())).to_pandas()
# train_df = dt.fread('/mnt/data30G/2020riid/train.csv', columns = set(data_types_dict.keys())).to_pandas()
# test_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=data_types_dict.keys())
train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train_1e5.feather", columns=data_types_dict.keys())
valid_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid_1e4.feather", columns=data_types_dict.keys())
# train_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train.feather", columns=data_types_dict.keys())
# valid_df = pd.read_feather("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_valid.feather", columns=data_types_dict.keys())


CPU times: user 75.8 ms, sys: 12 ms, total: 87.8 ms
Wall time: 90.6 ms


* Information of the training dataset

In [6]:
sep = '*' * 50
print(f'Training dataset detailed information \n{sep}')
print(f'Columns: {train_df.columns} \n{sep}')
print(f'Shape: {train_df.shape} \n{sep}')
print(f'NA values in each column: {sum(train_df.isna().sum())} \n{sep}')


Training dataset detailed information 
**************************************************
Columns: Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'user_answer'],
      dtype='object') 
**************************************************
Shape: (100000, 10) 
**************************************************
NA values in each column: 5347 
**************************************************


In [7]:
def prep_data(df, questions_df=questions_df):
    # Exclude lectures
    df = df[df[target] != -1].reset_index(drop = True, inplace = False)
    # Fill NaN values in the 'prior_question_had_explanation' columns
    df['prior_question_had_explanation'].fillna(False, inplace = True)
    # Set type
    df = df.astype(data_types_dict)
    
    # Answer for the previous questions of users
    df['lag'] = df.groupby('user_id')[target].shift()
    # For each user (groupby('user_id')), compute the cummulative number of correct answers and number answers in general
    groupby = df.groupby('user_id')['lag']
    cum = groupby.agg(['cumsum', 'cumcount'])

    # User correctness (measure the users' learning progress)
    df['user_correctness'] = cum['cumsum'] / cum['cumcount']
    # Drop the 'lag' feature
    df.drop(columns = ['lag'], inplace = True)
    df.head()    
    
    # Overall correctness of users
    user_agg = df.groupby('user_id')[target].agg(['sum', 'count'])
    # Overall difficulty of questions
    content_agg = df.groupby('content_id')[target].agg(['sum', 'count'])    

    # Take only 24 last observations of each user
    df = df.groupby('user_id').tail(24).reset_index(drop = True)
    
    df = pd.merge(df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    df.drop(columns = ['question_id'], inplace = True)

    # How many questions have been answered in each content ID?
    df['content_count'] = df['content_id'].map(content_agg['count']).astype('int32')
    # How hard are questions in each content ID?
    df['content_id'] = df['content_id'].map(content_agg['sum'] / content_agg['count'])
    
    return user_agg, content_agg, df
    

# Extract the validation set

In [8]:
train_user_agg, train_content_agg, train_df = prep_data(train_df)

user_sum_dict = train_user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = train_user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = train_content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = train_content_agg['count'].astype('int32').to_dict(defaultdict(int))
gc.collect()

0

In [9]:
valid_user_agg, valid_content_agg, valid_df = prep_data(valid_df)
gc.collect()

23

In [10]:
# Ratio is 6/24 = 25%
# valid_df = train_df.groupby('user_id').tail(6)
# train_df.drop(valid_df.index, inplace = True)

# Training

* Construct data

In [11]:
features = ['content_id', 'prior_question_elapsed_time', 
            'prior_question_had_explanation', 'user_correctness', 
            'part', 'content_count']

params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    'grow_policy': 'Lossguide',
    'iterations': 25,
    'learning_rate': 4e-3,
    'random_seed': 0,
    'l2_leaf_reg': 1e-1,
    'depth': 6,
    # 'max_leaves': 10,
    'border_count': 128,
    'verbose': 50,
    'od_type': 'Iter',
    'od_wait': 30,
}

In [12]:
from catboost import CatBoostClassifier, Pool

# Training and validating data
train_set = Pool(train_df[features], label = train_df[target])
val_set = Pool(valid_df[features], label = valid_df[target])

In [13]:
# Model definition
model = CatBoostClassifier(**params)

# Fitting
model.fit(train_set, eval_set = val_set, use_best_model = True)

model.save_model(f"{kaggle_path}catboost.model")



0:	learn: 0.7819289	test: 0.9390251	best: 0.9390251 (0)	total: 261ms	remaining: 6.26s
24:	learn: 0.7879686	test: 0.9402484	best: 0.9404278 (12)	total: 6.71s	remaining: 0us
bestTest = 0.9404277802
bestIteration = 12
Shrink model to first 13 iterations.


# Inference

In [14]:

user_sum_dict = train_user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = train_user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = train_content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = train_content_agg['count'].astype('int32').to_dict(defaultdict(int))

for filename, dic in zip(["user_sum_dict", "user_count_dict", "content_sum_dict", "content_count_dict"],
                         [user_sum_dict, user_count_dict, content_sum_dict, content_count_dict]):
    with open(f'{kaggle_path}{filename}.pickle', 'wb') as handle:
        pickle.dump(dic, handle)


In [15]:
train_df.head()


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_answer,user_correctness,part,content_count
0,50109776,210966,1061227082,0.372549,0,7,0,23000.0,False,0,0.857143,5,51
1,50109777,268082,1061227082,0.178571,0,8,0,15000.0,False,3,0.75,5,56
2,50109778,316496,1061227082,0.252632,0,9,0,53000.0,False,2,0.666667,5,95
3,50109779,347320,1061227082,0.317907,0,10,0,45000.0,False,0,0.6,1,994
4,22643630,163726,486769105,0.146254,0,4,0,17000.0,False,3,0.833333,3,841


In [16]:
del train_df
gc.collect()


45

In [17]:
valid_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_answer,user_correctness,part,content_count
0,45253,2146476402,1186307,0.666667,0,370,1,10000.0,True,3,,5,6
1,91995999,4916060626,1951897185,0.5,0,504,0,47000.0,True,3,,5,2
2,99237056,33806,2105538840,0.6,0,1,1,21000.0,False,2,,5,5
3,55360606,701477552,1174413790,0.5,0,221,0,69000.0,True,0,,5,2
4,44070864,11040265647,932625141,0.8,0,390,1,16000.0,True,3,,2,5


In [18]:
%%time
# test_df = pd.read_pickle("/home/sergey/mnt/4.5Tb/Downloads/riiidCVdata/cv1_train.pickle.zip")
# test_df = test_df.iloc[:int(1e5)]

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.3 µs


In [19]:
%%time
validaten_flg = True
if validaten_flg:
    from emulator import Iter_Valid
    iter_test = Iter_Valid(valid_df,max_user=1000)
    predicted = []
    def set_predict(df):
        predicted.append(df)
else:
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

CPU times: user 52.7 ms, sys: 0 ns, total: 52.7 ms
Wall time: 29.4 ms


In [20]:
# cumcount = sum([len(df) for df in predicted])
# count = 0
# pbar = tqdm(total=cumcount)
# previous_test_df = None
# for (current_test, current_prediction_df) in iter_test:
#     count+=1
#     if previous_test_df is not None:
#         answers = eval(current_test["prior_group_answers_correct"].iloc[0])
#         responses = eval(current_test["prior_group_responses"].iloc[0])
#         previous_test_df['answered_correctly'] = answers
#         previous_test_df['user_answer'] = responses
#         # your feature extraction and model training code here
#     previous_test_df = current_test.copy()
#     current_test = current_test[current_test.content_type_id == 0]
#     # your prediction code here
#     current_test['answered_correctly'] = model.predict(current_test[features])  # 0.5
#     set_predict(current_test.loc[:,['row_id', 'answered_correctly']])
#     pbar.update(len(current_test))
# print(f"count {count} {len(predicted)}")

In [None]:
valid_df

In [None]:
%%time

prior_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    test_df.drop(labels="part", axis=1, inplace=True)
    test_df.content_id = test_df.content_id.astype(int)
    
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    user_sum = np.zeros(len(test_df), dtype = np.int16)
    user_count = np.zeros(len(test_df), dtype = np.int16)
    content_sum = np.zeros(len(test_df), dtype = np.int32)
    content_count = np.zeros(len(test_df), dtype = np.int32)
    
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    set_predict(test_df[['row_id', target]])

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_answer,user_correctness,part,content_count
0,45253,2146476402,1186307,0.666667,0,370,1,10000.0,True,3,,5,6
1,91995999,4916060626,1951897185,0.500000,0,504,0,47000.0,True,3,,5,2
2,99237056,33806,2105538840,0.600000,0,1,1,21000.0,False,2,,5,5
3,55360606,701477552,1174413790,0.500000,0,221,0,69000.0,True,0,,5,2
4,44070864,11040265647,932625141,0.800000,0,390,1,16000.0,True,3,,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6678,61008574,3581627779,1294623452,0.500000,0,456,0,70000.0,True,1,0.500,5,2
6679,89313593,0,1897784132,0.571429,0,0,0,,False,3,,5,7
6680,91851712,2906053590,1949015647,0.000000,0,1600,0,38000.0,True,3,0.500,5,2
6681,59170457,47419955086,1255308297,1.000000,0,155,1,41000.0,True,3,0.000,5,1


In [22]:
%%time

prior_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    test_df.drop(labels="part", axis=1, inplace=True)
    test_df.content_id = test_df.content_id.astype(int)
    
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    user_sum = np.zeros(len(test_df), dtype = np.int16)
    user_count = np.zeros(len(test_df), dtype = np.int16)
    content_sum = np.zeros(len(test_df), dtype = np.int32)
    content_count = np.zeros(len(test_df), dtype = np.int32)
    
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    set_predict(test_df[['row_id', target]])

CPU times: user 10.2 s, sys: 231 ms, total: 10.4 s
Wall time: 9.58 s
