## Source Kernel
This kernel generates and submits predictions using the model and features developed in the kernel titled [RIIID: BigQuery-XGBoost End-to-End](https://www.kaggle.com/calebeverett/riiid-bigquery-xgboost-end-to-end).

In [1]:
import gc
import json
import pandas as pd
from pathlib import Path
import sqlite3
import riiideducation
import time
import xgboost as xgb

In [2]:
# Create the competition environment and obtain the iterator that yields the
# test batches consumed by the prediction loop at the end of the notebook.
env = riiideducation.make_env()
iter_test = env.iter_test()

## Load Model

In [3]:
# Directory holding the files this notebook loads: model.xgb, columns.json,
# df_users_content.pkl and df_questions.pkl.
PATH = Path('../input/riiid-submission')

In [4]:
# Load the trained XGBoost booster from PATH/model.xgb; it is used for
# prediction on every batch in the loop at the end of the notebook.
model = xgb.Booster(model_file=PATH/'model.xgb')
print('model loaded')

model loaded


## Load State

In [5]:
# Target pandas dtype for each known column name. Only subsets are used in
# this notebook: `dtypes_test` (built below from test_cols) and the cast of
# the `batch_cols_prior` columns inside the prediction loop.
dtypes = {
    'answered_correctly': 'int8',
    'answered_correctly_content_id_cumsum': 'int16',
    'answered_correctly_content_id_cumsum_pct': 'int16',
    'answered_correctly_cumsum': 'int16',
    'answered_correctly_cumsum_pct': 'int8',
    'answered_correctly_cumsum_upto': 'int8',
    'answered_correctly_rollsum': 'int8',
    'answered_correctly_rollsum_pct': 'int8',
    'answered_incorrectly': 'int8',
    'answered_incorrectly_content_id_cumsum': 'int16',
    'answered_incorrectly_cumsum': 'int16',
    'answered_incorrectly_rollsum': 'int8',
    'bundle_id': 'uint16',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'correct_answer': 'uint8',
    'lecture_id': 'uint16',
    'lectures_cumcount': 'int16',
    'part': 'uint8',
    'part_correct_pct': 'uint8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_elapsed_time_rollavg': 'float32',
    'prior_question_had_explanation': 'bool',
    'question_id': 'uint16',
    'question_id_correct_pct': 'uint8',
    'row_id': 'int64',
    'tag': 'uint8',
    'tag__0': 'uint8',
    'tag__0_correct_pct': 'uint8',
    'tags': 'str',
    'task_container_id': 'int16',
    'task_container_id_orig': 'int16',
    'timestamp': 'int64',
    'type_of': 'str',
    'user_answer': 'int8',
    'user_id': 'int32'
}

# Candidate columns that can be taken from each API batch. Below, the three
# key columns plus whichever of these also appear in test_cols form batch_cols.
batch_cols_all = [
    'user_id',
    'content_id',
    'row_id',
    'task_container_id',
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation'
]

# Columns kept from each batch so that its answers, which only arrive with
# the following batch, can be written back to the state tables.
batch_cols_prior = [
    'user_id',
    'content_id',
    'content_type_id'
]

# Feature columns passed to the model, loaded from columns.json next to the model file.
with open(PATH / 'columns.json') as cj:
    test_cols = json.load(cj)

# The key columns always come first: select_state reads user_id and content_id
# by position (r[0], r[1]). Excluding them from the second list keeps
# batch_cols free of duplicates even if columns.json also names a key column,
# which would otherwise produce a CTE with duplicate column names.
key_cols = ['user_id', 'content_id', 'row_id']
batch_cols = key_cols + [
    c for c in batch_cols_all
    if c in test_cols and c not in key_cols
]

print('test_cols:')
for col in test_cols:
    print(col)

# dtypes for the model features, plus the two join keys.
dtypes_test = {k: v for k, v in dtypes.items() if k in test_cols}
dtypes_test.update({'user_id': 'int32', 'content_id': 'int16'})

test_cols:
answered_correctly_content_id_cumsum
answered_correctly_cumsum
answered_correctly_cumsum_pct
answered_incorrectly_content_id_cumsum
answered_incorrectly_cumsum
part
part_correct_pct
question_id_correct_pct
tag__0
tag__0_correct_pct
task_container_id
timestamp


### Load Users-Content

In [6]:
# Stored answer history with one row per (user_id, content_id) pair; a unique
# index on that pair is created once the rows are copied into SQLite below.
df_users_content = pd.read_pickle(PATH/'df_users_content.pkl')
df_users_content.head()

Unnamed: 0,user_id,content_id,answered_correctly,answered_incorrectly
0,115,4,0,1
1,115,6,1,0
2,115,25,1,0
3,115,45,0,1
4,115,50,1,0


### Create Users Dataframe

In [7]:
# Collapse the per-content rows into per-user totals of correct and incorrect
# answers, then downcast the result to compact integer dtypes.
df_users = (
    df_users_content[['user_id', 'answered_correctly', 'answered_incorrectly']]
    .groupby('user_id')
    .sum()
    .reset_index()
    .astype({
        'user_id': 'int32',
        'answered_correctly': 'int16',
        'answered_incorrectly': 'int16',
    })
)
df_users.head()

Unnamed: 0,user_id,answered_correctly,answered_incorrectly
0,115,32,14
1,124,7,23
2,2746,11,8
3,5382,84,41
4,8623,70,39


### Load Questions
Question-related features are joined with the batches received from the competition API before predictions are made.

In [8]:
# Per-question lookup table (see the preview for its columns); the subset
# listed in q_cols is copied into the SQLite `questions` table further down.
df_questions = pd.read_pickle(PATH/'df_questions.pkl')
df_questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tag__0,part_correct_pct,tag__0_correct_pct,question_id_correct_pct
808,0,0,0,1,"[51, 131, 162, 38]",51,75,80,91
6089,1,1,1,1,"[131, 36, 81]",131,75,80,89
2645,2,2,0,1,"[131, 101, 162, 92]",131,75,80,55
2696,3,3,0,1,"[131, 149, 162, 29]",131,75,80,78
12456,4,4,3,1,"[131, 5, 162, 38]",131,75,80,61


## Create Database

In [9]:
# In-memory SQLite database holding the users, users_content and questions
# tables. `conn` is used for loading and querying; `cursor` runs the state
# update scripts in the prediction loop.
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()

### Create Users-Content Table

In [10]:
%%time

# Copy the frame into SQLite slice by slice; every slice is appended to the
# same table by its own to_sql call.
chunk_size = 20000
total = len(df_users_content)
n_chunks = (total // chunk_size + 1)

for i in range(n_chunks):
    start = i * chunk_size
    df_users_content.iloc[start:start + chunk_size].to_sql(
        'users_content',
        conn,
        method='multi',
        if_exists='append',
        index=False,
    )

# One row per (user_id, content_id); the upsert in update_state targets this index.
conn.execute('CREATE UNIQUE INDEX users_content_index ON users_content (user_id, content_id)')

# The rows now live in SQLite, so drop the frame and reclaim its memory.
del df_users_content
gc.collect()

CPU times: user 3min 31s, sys: 13.3 s, total: 3min 44s
Wall time: 3min 44s


0

In [11]:
%%time
# Sanity check: read the first rows back from the new users_content table.
pd.read_sql('SELECT * from users_content LIMIT 5', conn)

CPU times: user 3.3 ms, sys: 26 µs, total: 3.33 ms
Wall time: 3.49 ms


Unnamed: 0,user_id,content_id,answered_correctly,answered_incorrectly
0,115,4,0,1
1,115,6,1,0
2,115,25,1,0
3,115,45,0,1
4,115,50,1,0


### Create Users Table

In [12]:
%%time

# Copy the per-user totals into SQLite slice by slice; every slice is
# appended to the same table by its own to_sql call.
chunk_size = 20000
total = len(df_users)
n_chunks = (total // chunk_size + 1)

for i in range(n_chunks):
    start = i * chunk_size
    df_users.iloc[start:start + chunk_size].to_sql(
        'users',
        conn,
        method='multi',
        if_exists='append',
        index=False,
    )

# One row per user_id; the upsert in update_state targets this index.
_ = conn.execute('CREATE UNIQUE INDEX users_index ON users (user_id)')

# The rows now live in SQLite, so drop the frame and reclaim its memory.
del df_users
gc.collect()

CPU times: user 914 ms, sys: 34.9 ms, total: 949 ms
Wall time: 948 ms


0

In [13]:
%%time
# Sanity check: read the first rows back from the new users table.
pd.read_sql('SELECT * from users LIMIT 5', conn)

CPU times: user 1.7 ms, sys: 859 µs, total: 2.56 ms
Wall time: 2.03 ms


Unnamed: 0,user_id,answered_correctly,answered_incorrectly
0,115,32,14
1,124,7,23
2,2746,11,8
3,5382,84,41
4,8623,70,39


### Create Questions Table

In [14]:
%%time

# Question columns stored in SQLite; select_state also uses this list to
# build its SELECT clause, so question_id must stay in it as the join key.
q_cols = [
    'question_id', 'part', 'tag__0',
    'part_correct_pct', 'tag__0_correct_pct', 'question_id_correct_pct',
]

df_questions[q_cols].to_sql(name='questions', con=conn, index=False, method='multi')
_ = conn.execute('CREATE UNIQUE INDEX question_id_index ON questions (question_id)')

# The rows now live in SQLite, so drop the frame and reclaim its memory.
del df_questions
gc.collect()

CPU times: user 183 ms, sys: 8.07 ms, total: 191 ms
Wall time: 190 ms


0

In [15]:
%%time
# Sanity check: read the first rows back from the new questions table.
pd.read_sql('SELECT * from questions LIMIT 5', conn)

CPU times: user 3.74 ms, sys: 0 ns, total: 3.74 ms
Wall time: 3.28 ms


Unnamed: 0,question_id,part,tag__0,part_correct_pct,tag__0_correct_pct,question_id_correct_pct
0,0,1,51,75,80,91
1,1,1,131,75,80,89
2,2,1,131,75,80,55
3,3,1,131,75,80,78
4,4,1,131,75,80,61


In [16]:
# Database size in bytes = number of pages * bytes per page, both read from SQLite pragmas.
size_query = 'SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()'
size_frame = pd.read_sql(size_query, conn)
db_size = size_frame['size'][0]
print(f'Total size of database is: {db_size/1e9:0.3f} GB')

Total size of database is: 3.061 GB


In [17]:
import sys

# Print every notebook-level name whose object exceeds 10 MB according to
# sys.getsizeof. The namespace is snapshotted first because the loop itself
# binds new names.
local_vars = list(locals().items())
for var, obj in local_vars:
    size = sys.getsizeof(obj)
    if size <= 1e7:
        continue
    print(f'{var:<18}{size/1e6:>10,.1f} MB')

## Predict

### Get State

In [18]:
def select_state(batch_cols, records):
    """Build the SQLite query that attaches stored state to one batch of rows.

    The batch is inlined as a CTE named ``b`` and left-joined against the
    ``users``, ``users_content`` and ``questions`` tables, so batch rows with
    no stored state are still returned, with their counters defaulting to 0.
    For an unseen user the cumulative counts are NULL, the ``IIF`` condition
    is therefore not true, and the percentage also falls back to 0.

    Args:
        batch_cols: Column names of the batch rows. Position 0 must be the
            user id and position 1 the content id, because the filters below
            read ``r[0]`` and ``r[1]``.
        records: Sequence of batch rows aligned with ``batch_cols``. Each row
            is rendered with ``str()``, so it must print as a parenthesised,
            comma-separated list of SQL literals (tuples and numpy records of
            numbers do). May be empty.

    Returns:
        str: The SQL text. For empty ``records`` the query is still valid and
        yields zero rows with the same columns.

    Note:
        Reads the module-level ``q_cols`` list for the question columns.
    """
    batch_col_list = ', '.join(batch_cols)
    b_col_list = ', '.join(f'b.{col}' for col in batch_cols)
    q_col_list = ', '.join(q_cols)

    if len(records):
        batch_rows = 'VALUES ' + ', '.join(map(str, records))
    else:
        # A bare "VALUES" is a syntax error; emit a zero-row SELECT with the
        # right number of columns instead.
        batch_rows = 'SELECT ' + ', '.join('NULL' for _ in batch_cols) + ' WHERE 0'

    # A batch often repeats the same user (and sometimes the same pair);
    # repeating the condition adds nothing but makes the OR chain deeper.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    user_ids = dict.fromkeys(r[0] for r in records)
    user_content_ids = dict.fromkeys((r[0], r[1]) for r in records)

    # "or '0'" turns an empty filter into "WHERE 0" (matches nothing).
    user_filter = ' OR '.join(f'user_id = {u}' for u in user_ids) or '0'
    user_content_filter = ' OR '.join(
        f'(user_id = {u} AND content_id = {c})' for u, c in user_content_ids
    ) or '0'

    return f"""
        WITH b ({batch_col_list}) AS (
        {batch_rows}
        )
        SELECT
            {b_col_list},
            IFNULL(answered_correctly_cumsum, 0) answered_correctly_cumsum, 
            IFNULL(answered_incorrectly_cumsum, 0) answered_incorrectly_cumsum,
            IIF(
                (answered_correctly_cumsum + answered_incorrectly_cumsum) > 0,
                answered_correctly_cumsum * 100 / (answered_correctly_cumsum + answered_incorrectly_cumsum),
                0
            ) answered_correctly_cumsum_pct,
            IFNULL(answered_correctly_content_id_cumsum, 0) answered_correctly_content_id_cumsum,
            IFNULL(answered_incorrectly_content_id_cumsum, 0) answered_incorrectly_content_id_cumsum,
            {q_col_list}
        FROM b
        LEFT JOIN (
            SELECT user_id, answered_correctly answered_correctly_cumsum,
                answered_incorrectly answered_incorrectly_cumsum
            FROM users
            WHERE {user_filter}
        ) u ON (u.user_id = b.user_id)
        LEFT JOIN (
            SELECT user_id, content_id, answered_correctly answered_correctly_content_id_cumsum, 
            answered_incorrectly answered_incorrectly_content_id_cumsum
            FROM users_content uc
            WHERE {user_content_filter}
        ) uc ON (uc.user_id = b.user_id AND uc.content_id = b.content_id)
        LEFT JOIN (
            SELECT {q_col_list}
            FROM questions
        ) q ON (q.question_id = b.content_id)
    """

### Update State

In [19]:
def update_state(df):
    """Build the SQL script that folds a batch of answered questions into the state tables.

    Every row adds its outcome to the matching ``users_content`` row (keyed by
    user_id and content_id) and to the matching ``users`` row (keyed by
    user_id). Rows that do not exist yet are inserted; existing rows have
    their counters incremented through ``ON CONFLICT ... DO UPDATE``, which
    relies on the unique indexes on those keys.

    Args:
        df: DataFrame with ``user_id``, ``content_id`` and ``answered_correctly``
            columns, where ``answered_correctly`` is 1 or 0. Other columns are
            ignored. May be empty.

    Returns:
        str: A two-statement script for ``executescript``, or an empty string
        when ``df`` has no rows (an INSERT with no VALUES rows is not valid SQL).
    """
    if df.empty:
        return ''

    # Plain Python ints keep the rendered literals independent of column dtype.
    rows = [
        (int(user_id), int(content_id), int(correct))
        for user_id, content_id, correct in zip(
            df['user_id'], df['content_id'], df['answered_correctly']
        )
    ]
    values_uc = ','.join(f'({u}, {c}, {a}, {1 - a})' for u, c, a in rows)
    values_u = ','.join(f'({u}, {a}, {1 - a})' for u, _, a in rows)

    return f"""
        INSERT INTO users_content(user_id, content_id, answered_correctly, answered_incorrectly)
        VALUES {values_uc}
        ON CONFLICT(user_id, content_id) DO UPDATE SET
            answered_correctly = answered_correctly + excluded.answered_correctly,
            answered_incorrectly = answered_incorrectly + excluded.answered_incorrectly;
             
        INSERT INTO users(user_id, answered_correctly, answered_incorrectly)
        VALUES {values_u}
        ON CONFLICT(user_id) DO UPDATE SET
            answered_correctly = answered_correctly + excluded.answered_correctly,
            answered_incorrectly = answered_incorrectly + excluded.answered_incorrectly;
    """

In [20]:
%%time
# Main loop: for every batch from the competition API,
#   1. write the now-revealed answers of the previous batch into the state tables,
#   2. remember this batch so its answers can be applied on the next iteration,
#   3. look up the stored state for this batch's questions,
#   4. predict and submit.
df_batch_prior = None
counter = 0

for test_batch in iter_test:
    counter += 1
    test_df = test_batch[0]

    # update state
    if df_batch_prior is not None:
        # The answers arrive as the text of a list in the first row of the batch.
        # json.loads parses it without executing API-supplied text as code.
        # NOTE(review): assumes the text is a plain list of integers - confirm.
        answers = json.loads(test_df['prior_group_answers_correct'].iloc[0])
        df_batch_prior['answered_correctly'] = answers
        df_prior_questions = df_batch_prior[df_batch_prior.content_type_id == 0]

        # A previous batch without question rows has nothing to record.
        if not df_prior_questions.empty:
            cursor.executescript(update_state(df_prior_questions))

        if not counter % 100:
            conn.commit()

    # save prior batch for state update
    df_batch_prior = test_df[batch_cols_prior].astype({k: dtypes[k] for k in batch_cols_prior})

    # get state (only rows with content_type_id == 0 are predicted)
    df_batch_questions = test_df[test_df.content_type_id == 0]

    if df_batch_questions.empty:
        # No question rows: the state query cannot be built from zero rows, so
        # submit an empty frame with the expected columns and move on.
        # NOTE(review): assumes env.predict accepts an empty frame - confirm.
        env.predict(
            df_batch_questions.assign(answered_correctly=0.5)[['row_id', 'answered_correctly']]
        )
        continue

    records = df_batch_questions[batch_cols].fillna(0).to_records(index=False)
    df_batch = pd.read_sql(select_state(batch_cols, records), conn)

    # predict
    predictions = model.predict(xgb.DMatrix(df_batch[test_cols]))
    df_batch['answered_correctly'] = predictions

    # submit
    env.predict(df_batch[['row_id', 'answered_correctly']])

CPU times: user 479 ms, sys: 17 ms, total: 496 ms
Wall time: 195 ms
