In [1]:
# Run mode for the whole notebook; the execution cells below branch on it:
#   1 - feature generation (writes per-part feature/target files)
#   2 - submission (rebuilds user state, loads saved models, predicts)
global_test = 2

#1 - feature generation, #2 - submission

In [2]:
import numpy as np
import pandas as pd
import glob
import riiideducation
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
from tqdm import tqdm
#from sklearn.preprocessing import StandardScaler
#from sklearn.inspection import permutation_importance
from catboost import CatBoostClassifier

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [3]:
# Let pandas display up to 100 rows/columns before truncating the output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

## file access functions:

In [4]:
def df_to_np(df, filter_lectures:bool, convert_answers:bool):
    """Unpack the interaction columns of ``df`` into a tuple of NumPy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp', 'user_id', 'content_id', 'content_type_id',
        'task_container_id', 'prior_question_elapsed_time' and
        'prior_question_had_explanation'; additionally 'user_answer' and
        'answered_correctly' when ``convert_answers`` is true.
    filter_lectures : bool
        When true, only rows whose 'content_type_id' equals 0 are returned.
    convert_answers : bool
        When true, the two answer columns are appended to the result.

    Returns
    -------
    tuple of numpy.ndarray
        ``(tmstmp, userid, ctntid, ctnttp, contnr, pqtime, pqexpl)`` and, when
        ``convert_answers`` is true, additionally ``(usrans, anscor)``.

    Notes
    -----
    The original raised ``UnboundLocalError`` for
    ``filter_lectures=False, convert_answers=False`` because it always tried to
    return the answer arrays on the unfiltered path; that combination now
    returns the seven base arrays.
    """
    # Built locally (same values as the notebook-level float32m1 / int8_0) so the
    # function does not depend on the configuration cell having been run first.
    nan_fill = np.float32(-1)
    question_type = np.int8(0)

    # timestamp / 3_600_000: presumably milliseconds converted to hours - confirm.
    tmstmp = (df['timestamp']/3600000).to_numpy(dtype = np.float32)
    userid = df['user_id'].to_numpy()
    ctntid = df['content_id'].to_numpy()
    ctnttp = df['content_type_id'].to_numpy()
    contnr = df['task_container_id'].to_numpy()

    # Missing elapsed times become -1.
    pqtime = np.nan_to_num(
        df['prior_question_elapsed_time'].to_numpy(dtype = np.float32),
        nan = nan_fill)

    # Missing explanation flags become 1.
    pqexpl = df['prior_question_had_explanation']\
             .to_numpy(dtype = np.int8, na_value = 1)

    arrays = [tmstmp, userid, ctntid, ctnttp, contnr, pqtime, pqexpl]

    if convert_answers:
        arrays.append(df['user_answer'].to_numpy())
        arrays.append(df['answered_correctly'].to_numpy())

    if filter_lectures:
        keep = ctnttp == question_type
        arrays = [column[keep] for column in arrays]

    return tuple(arrays)

## initialization functions - run once:

In [5]:
def question_maps():
    """Load questions.csv and publish per-question lookup tables as globals.

    Sets ``qestn_tagsmap`` (list of uint8 tag lists, one per row),
    ``qestn_partmap`` (uint8, 'part' minus one), ``qestn_bndlmap`` (uint16
    'bundle_id') and ``qestn_cansmap`` (uint8 'correct_answer'). All tables are
    in CSV row order; presumably row position equals question id - confirm.
    """
    global qestn_tagsmap, qestn_partmap, qestn_bndlmap, qestn_cansmap

    questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')

    # Rows without tags are given the single tag 188.
    raw_tags = questions['tags'].replace(np.nan, 188).to_list()

    tag_lists = []
    for entry in raw_tags:
        # Tags are stored as one whitespace-separated string per question.
        tag_lists.append([np.uint8(token) for token in str(entry).split()])
    qestn_tagsmap = tag_lists

    part_numbers = questions['part'].to_numpy().astype(np.uint8)
    qestn_partmap = part_numbers-uint8_1  # shift parts to start at zero

    qestn_bndlmap = questions['bundle_id'].to_numpy().astype(np.uint16)
    qestn_cansmap = questions['correct_answer'].to_numpy().astype(np.uint8)

In [6]:
def lecture_maps():
    """Load lectures.csv and publish lecture-id indexed lookup tables as globals.

    Sets ``lectr_tag_map``, ``lectr_partmap`` and ``lectr_typemap``: uint8
    arrays of length ``np.iinfo(np.int16).max`` indexed directly by lecture id;
    ids that do not occur in the CSV keep the value 0.
    """
    global lectr_tag_map, lectr_partmap, lectr_typemap

    lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

    lecture_ids = lectures.lecture_id.to_numpy(dtype = np.int16)
    tags        = lectures.tag.to_numpy(dtype=np.uint8)
    parts       = lectures.part.to_numpy(dtype=np.uint8)

    # Lecture kinds are encoded as small integers.
    kinds = lectures.type_of\
    .replace({'starter':0,'concept':1,'solving question':2,'intention':3})\
    .to_numpy(dtype = np.uint8)

    table_size = np.iinfo(np.int16).max

    lectr_tag_map = np.zeros(table_size, dtype = np.uint8)
    lectr_partmap = np.zeros(table_size, dtype = np.uint8)
    lectr_typemap = np.zeros(table_size, dtype = np.uint8)

    for lecture_id, tag, part, kind in zip(lecture_ids, tags, parts, kinds):
        lectr_tag_map[lecture_id] = tag
        lectr_partmap[lecture_id] = part
        lectr_typemap[lecture_id] = kind

In [7]:
def get_cor_table():
    """Build the global ``cor_table`` of nearest-neighbour content ids.

    Two precomputed pairwise arrays are re-indexed through their content maps
    (first ``max_content`` entries), summed, turned into a score per content
    pair and, for every content row, the column indices of the 1000 highest
    scores are stored in ascending score order as int32.
    """
    global cor_table

    neighbour_count = 1000

    map1 = np.load('../input/content-correlation-100to300/ctnt_map.npy')
    map2 = np.load('../input/content-correlation/ctnt_map.npy')

    cor1 = np.load('../input/content-correlation-100to300/result.npy')
    cor2 = np.load('../input/content-correlation/result.npy')

    order1 = map1[:max_content]
    order2 = map2[:max_content]

    # Re-index both leading axes through the content map, then widen to uint32
    # before adding the two sources together.
    aligned2 = cor2[order2,:,:][:,order2,:].astype(np.uint32)
    aligned1 = cor1[order1,:,:][:,order1,:].astype(np.uint32)
    cor = aligned2 + aligned1

    # NOTE(review): channel 0 / channel 1 look like two outcome counts per
    # content pair (e.g. agreeing vs. disagreeing answers) - confirm against the
    # notebook that produced result.npy.
    chan0 = cor[:,:,0]
    chan1 = cor[:,:,1]

    #correction for correlated content size,
    #giving more weight to questions with a lot of answers
    size = np.log(np.log(chan0.sum(axis = 1)+1))
    size = size/size.max()/10

    # `size` broadcasts along the last axis, i.e. it is added per column.
    corrs = (chan0+5)/(chan1*1.7+chan0+10)+size

    ranked = corrs.argsort(axis = 1)
    cor_table = ranked[:,-neighbour_count:].astype(np.int32)


In [8]:
def get_content_answer_shares():
    """Compute the global ``ca_shares_all``: per-content answer-option shares.

    Counts, for every content id, how often each 'user_answer' value was given
    on question rows, adds one to every count and normalises each row to sum
    to one (float32). Rows are ordered by content id, columns by answer value.
    """
    global ca_shares_all

    wanted = ['user_id',
              'content_id',
              'content_type_id',
              'user_answer',
              'answered_correctly']

    # t_part=99 presumably matches no part file, so every part is loaded - confirm.
    interactions = get_train_large(t_part=99, columns=wanted)

    # Keep question rows only and drop the now-constant type column.
    question_rows = interactions.content_type_id == 0
    kept_columns = interactions.columns != 'content_type_id'
    interactions = interactions.loc[question_rows, kept_columns]

    answer_counts = pd.pivot_table(interactions,
                                   values='answered_correctly',
                                   index='content_id',
                                   columns='user_answer',
                                   aggfunc='count',
                                   fill_value=0).to_numpy()+1

    row_totals = answer_counts.sum(axis = 1)
    ca_shares_all = (answer_counts.T/row_totals).T.astype(np.float32)

In [9]:
def get_content_first_answer_mean():
    """Compute the global ``ctnt_fam``: mean correctness of first attempts.

    For every (user, content) pair only the row with the lowest 'new_order' is
    kept; the mean of 'answered_correctly' over those rows is then taken per
    content id. The result is a float32 array ordered by content id
    (presumably every content id occurs, so position equals id - confirm).
    """
    global ctnt_fam

    wanted = ['user_id',
              'new_order',
              'answered_correctly',
              'content_id',
              'content_type_id']

    interactions = get_train_large(t_part = 99, columns = wanted)

    is_question = interactions.content_type_id==0
    without_type = interactions.columns!='content_type_id'
    questions = interactions.loc[is_question, without_type]

    # Sorting first makes `.first()` pick the earliest interaction of each pair.
    ordered = questions.sort_values(by = 'new_order')
    first_attempts = ordered.groupby(['user_id','content_id']).first()

    per_content = first_attempts.groupby('content_id').answered_correctly.mean()
    per_content = per_content.sort_index()

    ctnt_fam = per_content.to_numpy(dtype = np.float32)

In [10]:
def get_ctnt_enc():
    """Compute the global ``ctnt_enc``: a one-dimensional encoding per content.

    Steps:
      1. one-hot encode the tag lists in ``qestn_tagsmap`` (189 columns),
      2. reduce them to 3 standardised principal components,
      3. reduce the rows of ``cor_table`` to 9 standardised principal components,
      4. concatenate both and project onto a single principal component.

    Requires ``question_maps()`` and ``get_cor_table()`` to have run first,
    because it reads ``qestn_tagsmap`` and ``cor_table``. The result is a flat
    float32 array with one value per row of ``qestn_tagsmap``.
    """
    global ctnt_enc

    # Builtin `bool` instead of the `np.bool` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
    qestn_tagsmap_ohe = np.zeros((len(qestn_tagsmap), 189), dtype = bool)

    for question, tags in enumerate(qestn_tagsmap):
        for tag in tags:
            qestn_tagsmap_ohe[question,tag] = True

    tags_comps = StandardScaler().fit_transform(
        PCA(n_components=3, random_state=0).fit_transform(qestn_tagsmap_ohe)
    )

    # NOTE(review): cor_table holds neighbour indices, not scores; PCA is applied
    # to those indices as-is, exactly like the original.
    corr_comps = StandardScaler().fit_transform(
        PCA(n_components=9, random_state=0).fit_transform(cor_table)
    )

    comb_comps = np.concatenate([tags_comps,corr_comps], axis = 1)

    ctnt_enc = PCA(n_components=1, random_state=0)\
    .fit_transform(comb_comps).astype(np.float32).ravel()

In [11]:
def get_ac_pqexpl():
    """Fill the global ``ac_pqexpl`` with per-content answer counts.

    ``ac_pqexpl[content, flag, 0]`` accumulates the number of correct answers
    and ``ac_pqexpl[content, flag, 1]`` the number of incorrect answers, where
    ``flag`` is the 'prior_question_had_explanation' value (0 or 1) of the row.
    Only question rows (content_type_id == 0) are counted.
    """
    global ac_pqexpl

    ac_pqexpl = np.zeros((max_content, 2, 2), dtype = np.int32)

    wanted = ['content_id',
              'prior_question_had_explanation',
              'content_type_id',
              'answered_correctly']

    interactions = get_train_large(t_part = 99, columns = wanted)
    questions = interactions.loc[interactions.content_type_id==0]

    # NOTE(review): groupby drops rows whose key is missing by default, so the
    # trailing fillna(1) presumably never sees a missing explanation flag,
    # whereas df_to_np maps missing flags to 1 - confirm which is intended.
    stats = questions\
    .groupby(['content_id','prior_question_had_explanation'])\
    .agg({'answered_correctly':['sum','count']})
    stats = stats.droplevel(0, axis  =1).reset_index().fillna(1)

    contents  = stats['content_id'].to_numpy()
    flags     = stats['prior_question_had_explanation'].to_numpy(dtype = np.int8)
    n_correct = stats['sum'].to_numpy(dtype = np.int32)
    n_total   = stats['count'].to_numpy(dtype = np.int32)
    n_wrong   = n_total - n_correct

    for content, flag, right, wrong in zip(contents, flags, n_correct, n_wrong):
        ac_pqexpl[content,flag,1] += wrong
        ac_pqexpl[content,flag,0] += right

## pretrain functions, run every part:

In [12]:
def get_train_small(t_part:int):
    """Read the single parquet part whose file name ends with ``_<t_part>``.

    Returns the first matching file as a DataFrame; raises ``IndexError`` when
    no file matches.
    """
    suffix = '_'+str(t_part)

    matches = []
    for path in glob.glob('../input/riiid-parquets-v5/df_*'):
        if path.endswith(suffix):
            matches.append(path)

    return pd.read_parquet(matches[0])

In [13]:
def get_train_large(t_part:int, columns:list):
    """Read every parquet part except the one ending with ``_<t_part>``.

    Only ``columns`` are loaded from each file; the parts are concatenated in
    the order returned by ``glob``.
    """
    excluded_suffix = '_'+str(t_part)

    frames = []
    for path in glob.glob('../input/riiid-parquets-v5/df_*'):
        if path.endswith(excluded_suffix):
            continue
        frames.append(pd.read_parquet(path, columns = columns))

    return pd.concat(frames)

In [14]:
def get_train_groups(t_part:int):
    """Split one training part into 10000 frames keyed by 'new_order'.

    Parameters
    ----------
    t_part : int
        Part number passed on to ``get_train_small``.

    Returns
    -------
    list of pandas.DataFrame
        Exactly 10000 frames; element ``i`` holds the rows whose 'new_order'
        equals ``i`` (original row order, fresh 0..n-1 index). Values of
        'new_order' outside 0..9999 are ignored and absent values yield an
        empty frame with the same columns.
    """
    df = get_train_small(t_part)

    # One grouping pass instead of 10000 boolean scans over the whole frame.
    # groupby keeps the original row order inside every group.
    by_order = dict(tuple(df.groupby('new_order')))
    no_rows = df.iloc[0:0]

    groups = []
    for i in range(10000):
        # pop() releases the intermediate group as soon as it has been copied.
        group = by_order.pop(i, no_rows).reset_index(drop = True)
        groups.append(group)

    return groups

In [15]:
def get_arrays_and_lists():
    """(Re)allocate the global per-user state, all zeroed / empty.

    Row 0 of every per-user array is never assigned to a real user, because
    ``next_uplace`` starts at 1 and 0 in ``user_map`` means "not mapped yet".
    """
    global next_uplace,\
    au_ctntid,\
    a_userid,\
    lu_seq,\
    lu_seq_part,\
    au_anshar,\
    au_ctshar,\
    user_map,\
    au_tmstmp_prv

    # raw user id -> compact row index; the next free row starts at 1
    user_map      = np.zeros(np.iinfo(np.int32).max,dtype = np.int32)
    next_uplace   = np.int32(1)

    # per (user, content): [last correctness, attempt count, last answer]
    au_ctntid     = np.zeros((max_users, max_content, 3), dtype = np.int8)

    # per user: [correct count, incorrect count]
    a_userid      = np.zeros((max_users, 2), dtype = np.int16)

    # per user: running sums split by [correct, incorrect] outcome
    au_anshar     = np.zeros((max_users, 2), dtype = np.float32)
    au_ctshar     = np.zeros((max_users, 2), dtype = np.float32)

    # per user: the three most recent distinct timestamps, newest first
    au_tmstmp_prv = np.zeros((max_users,3), dtype = np.float32)

    # per user: recent outcomes overall, and one list for each of the 7 parts
    lu_seq        = [[] for _ in range(max_users)]
    lu_seq_part   = [[[] for _ in range(7)] for _ in range(max_users)]


## per-iteration functions, run on every batch:

In [16]:
def update_user_map(unique_users):
    """Assign the next free row index to every user id not yet in ``user_map``.

    ``unique_users`` is an iterable of raw user ids. A stored value of 0 means
    "not mapped yet"; the global ``next_uplace`` is advanced for each new user.
    """
    global next_uplace

    for raw_id in unique_users:
        if user_map[raw_id] != int32_0:
            continue  # already mapped
        user_map[raw_id] = next_uplace
        next_uplace += int32_1

In [17]:
def update_arrays(df):
    """Fold the answered interactions of ``df`` into the global per-user state.

    ``df`` must carry the answer columns ('user_answer', 'answered_correctly').
    Every row may advance the user's timestamp history; only question rows
    (content_type_id == 0) update the answer statistics.
    """
    tmstmp,userid,ctntid,ctnttp,contnr,pqtime,pqexpl,usrans,anscor = df_to_np(df,False,True)

    for row in range(len(df)):

        slot = user_map[userid[row]]

        # Shift the timestamp history only when time has actually advanced.
        if tmstmp[row] > au_tmstmp_prv[slot,0]:
            au_tmstmp_prv[slot,2] = au_tmstmp_prv[slot,1]
            au_tmstmp_prv[slot,1] = au_tmstmp_prv[slot,0]
            au_tmstmp_prv[slot,0] = tmstmp[row]

        if ctnttp[row] != int8_0:
            continue  # not a question row: nothing else to record

        question    = ctntid[row]
        answer      = usrans[row]
        recent      = lu_seq[slot]
        recent_part = lu_seq_part[slot][qestn_partmap[question]]

        # Looked up as in the original; the value itself is not used.
        _bundle = qestn_bndlmap[question]

        # Trim before appending: a list longer than m loses its oldest entry.
        if len(recent) > m:
            recent.pop(0)
        if len(recent_part) > m:
            recent_part.pop(0)

        au_ctntid[slot,question,1] += int8_1
        au_ctntid[slot,question,2]  = answer

        was_correct = bool(anscor[row] == int8_1)
        outcome = 0 if was_correct else 1  # column 0: correct, column 1: incorrect

        a_userid[slot,outcome]     += int16_1
        au_ctntid[slot,question,0]  = int8_1 if was_correct else int8_0
        recent.append(was_correct)
        recent_part.append(was_correct)
        au_anshar[slot,outcome]    += ca_shares_all[question,answer]
        au_ctshar[slot,outcome]    += ctnt_fam[question]


In [18]:
def get_features(df, is_test:bool):
    """Build the model feature frame for the question rows of ``df``.

    Rows with content_type_id != 0 are filtered out, so the result can have
    fewer rows than ``df``. All user-level features are read from the global
    state as it is at call time (nothing is updated here).

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction rows; must include the answer columns unless ``is_test``.
    is_test : bool
        When true only the feature frame is returned, otherwise the tuple
        ``(features, answered_correctly)``.
    """
    unpacked = df_to_np(df, True, not is_test)
    tmstmp, userid, ctntid, ctnttp, contnr, pqtime, pqexpl = unpacked[:7]
    if not is_test:
        usrans, anscor = unpacked[7:]

    def share_correct(histories, window = None):
        # Fraction of True entries in each history (optionally only the last
        # `window` entries), damped by `e` in the denominator.
        shares = []
        for history in histories:
            recent = history if window is None else history[-window:]
            shares.append(recent.count(True)/(len(recent)+e))
        return np.array(shares, dtype = np.float32)

    slots = user_map[userid]
    part = qestn_partmap[ctntid]
    per_user_content = au_ctntid[slots,ctntid]

    totals  = a_userid[slots,:]
    n_right = totals[:,0]
    n_wrong = totals[:,1]
    # Computed as in the original although no feature below uses it.
    overall_avg = (n_right/(n_right+n_wrong+int16_1)).astype(np.float32)

    expl_counts = ac_pqexpl[ctntid,pqexpl,:]
    content_expl_avg = (expl_counts[:,0]/
                        (expl_counts[:,0]+expl_counts[:,1]+int32_1)).astype(np.float32)


    #based on answer and content solve probability

    answer_mass = au_anshar[slots, :]
    false_answer_avg = (answer_mass[:,1]/(n_wrong+e)).astype(np.float32)

    content_mass = au_ctshar[slots, :]
    relative_content_avg = (content_mass[:,0]/
                            (content_mass[:,0]+content_mass[:,1]+e)).astype(np.float32)
    true_content_avg  = (content_mass[:,0]/(n_right+e)).astype(np.float32)
    false_content_avg = (content_mass[:,1]/(n_wrong+e)).astype(np.float32)


    #user features based on neighboring questions:

    neighbours = cor_table[ctntid]
    neigh = au_ctntid[slots.reshape(-1,1),neighbours,:]
    neigh_last   = neigh[:,:,0]  # last correctness per neighbour
    neigh_tries  = neigh[:,:,1]  # attempt count per neighbour
    neigh_answer = neigh[:,:,2]  # last answer per neighbour

    n_seen       = np.count_nonzero(neigh_tries,axis = 1).astype(np.int16)
    n_seen_right = np.count_nonzero(neigh_last,axis = 1).astype(np.int16)
    n_seen_wrong = n_seen - n_seen_right

    # Answer shares of the neighbours, zeroed where the user never attempted them.
    neigh_shares = ca_shares_all[neighbours,neigh_answer]*(neigh_tries!=int8_0)

    true_shares = ((neigh_shares*(neigh_last==int8_1)).sum(axis = 1)/
                   (n_seen_right+e)).astype(np.float32)

    false_shares = ((neigh_shares*(neigh_last==int8_0)).sum(axis = 1)/
                    (n_seen_wrong+e)).astype(np.float32)


    #user features based on last n questions:

    histories = [lu_seq[slot] for slot in slots]
    last_m_avg = share_correct(histories)
    last_s_avg = share_correct(histories, s)

    part_histories = [lu_seq_part[slot][p] for slot, p in zip(slots, part)]
    part_last_m_avg = share_correct(part_histories)
    part_last_s_avg = share_correct(part_histories, s)


    prev_1 = au_tmstmp_prv[slots,0]
    prev_2 = au_tmstmp_prv[slots,1]
    prev_3 = au_tmstmp_prv[slots,2]

    X = pd.DataFrame({
        'part':part,
        'prior_explanation':pqexpl,
        'prior_elapsed_time':pqtime,
        'content':ctntid,
        'ctntent_encoded':ctnt_enc[ctntid],
        'task_container':contnr,

        'time_to_cont_1':tmstmp - prev_1,
        'time_to_cont_3':tmstmp - prev_3,
        'time_cont1_to_cont2':prev_1 - prev_2,
        'time_cont2_to_cont3':prev_2 - prev_3,

        'user_content_attempts':per_user_content[:,1],
        'user_content_last_1':per_user_content[:,0],

        'user_part_last_m_avg':part_last_m_avg,
        'user_part_last_s_avg':part_last_s_avg,
        'user_last_m_avg':last_m_avg,
        'user_last_s_avg':last_s_avg,

        'content_explanation_avg':content_expl_avg,
        'content_first_answer_avg':ctnt_fam[ctntid],
        'content_avg_time':ctnt_mtime[ctntid],

        'user_relative_content_avg':relative_content_avg,
        'user_true_content_avg':true_content_avg,
        'user_false_content_avg':false_content_avg,
        'user_false_answer_avg':false_answer_avg,

        'neighbor_content_true_shares':true_shares,
        'neighbor_content_false_shares':false_shares,
    })

    if is_test:
        return X
    return X, anscor


## execution:

In [19]:
%%time

# Typed scalar constants referenced by the functions defined above
# (presumably so arithmetic on the typed state arrays stays in their dtype).
uint8_0 = np.uint8(0)
uint8_1 = np.uint8(1)

uint16_0 = np.uint16(0)
uint16_1 = np.uint16(1)

int8_0  = np.int8(0)
int8_1  = np.int8(1)

int16_0 = np.int16(0)
int16_1 = np.int16(1)

int32_0 = np.int32(0)
int32_1 = np.int32(1)

# Replacement value for missing prior_question_elapsed_time (see df_to_np).
float32m1 = np.float32(-1)

# Sizes of the per-user / per-content state arrays.
max_users   = 450000
max_content = 13523

# m: length above which the per-user outcome lists are trimmed (update_arrays)
# s: short window used for the "last s" averages (get_features)
# e: constant added to denominators in get_features
m = 100
s = 20
e = 0.1

# One-off table construction. Order matters: get_ctnt_enc reads qestn_tagsmap
# and cor_table, so it must follow question_maps and get_cor_table.
question_maps()
lecture_maps()
get_cor_table()
get_content_answer_shares()
get_ac_pqexpl()
get_content_first_answer_mean()
get_ctnt_enc()

# Indexed by content id in get_features ('content_avg_time').
ctnt_mtime = np.load('../input/question-duration/question_mean_time.npy')

CPU times: user 3min 25s, sys: 38.7 s, total: 4min 3s
Wall time: 4min 30s


In [20]:
%%time

# Mode 1: generate training features part by part. The user state is reset for
# every part, so each part's features only reflect that part's own history.
if global_test == 1:

    for i in tqdm(range(10)):

        X = []
        y = []

        get_arrays_and_lists()
        groups = get_train_groups(i)

        for df in groups:

            update_user_map(df.user_id.unique())
            # Features are taken BEFORE the state is updated with this group's
            # answers, so a row never sees its own outcome.
            X_, y_ = get_features(df,False)
            X.append(X_)
            y.append(y_)
            update_arrays(df)
        
        del(groups)

        X = pd.concat(X)
        y = np.concatenate(y)

        # Written to the working directory as X_<part> (parquet) and y_<part>.npy.
        X.to_parquet('X_'+str(i))
        np.save('y_'+str(i), y)

CPU times: user 20 µs, sys: 4 µs, total: 24 µs
Wall time: 8.11 µs


In [21]:
%%time

# Mode 2: rebuild the per-user state from all ten training parts. The state is
# allocated once and carried across parts, unlike in feature generation.
if global_test == 2:
    
    get_arrays_and_lists()
    
    for i in tqdm(range(10)):

        groups = get_train_groups(i)
        

        for df in groups:

            update_user_map(df.user_id.unique())
            update_arrays(df)

        del(groups)


100%|██████████| 10/10 [52:10<00:00, 313.09s/it]

CPU times: user 51min 22s, sys: 43.8 s, total: 52min 6s
Wall time: 52min 14s





In [22]:
# Mode 2: load the two saved CatBoost models whose predictions are averaged
# in the inference loop.
if global_test == 2:
    
    model1 = CatBoostClassifier()
    model1.load_model(fname='../input/riiid-model/cb1')
    
    model2 = CatBoostClassifier()
    model2.load_model(fname='../input/riiid-model/cb2')

In [23]:
# Mode 2: create the competition environment and its test-batch iterator.
if global_test == 2:
    
    env = riiideducation.make_env()
    iter_test = env.iter_test()

In [24]:
%%time

# Mode 2: inference loop. Each batch first delivers the answers of the previous
# batch, which are folded into the user state before predicting the new batch.
if global_test == 2:
    
    old_df = None

    for (new_df, sample) in iter_test:
        
        
        if old_df is not None:
            # The first row carries the previous batch's results as bracketed,
            # ', '-separated strings, read here by column position.
            # NOTE(review): positions 9 and 8 are presumably the prior-group
            # responses and prior-group correctness columns - confirm against
            # the API's column order.
            old_df['user_answer'] = np.array(
                [int(x) for x in new_df.iloc[0,9][1:-1].split(', ')], dtype = np.int8)
            old_df['answered_correctly'] = np.array(
                [int(x) for x in new_df.iloc[0,8][1:-1].split(', ')], dtype = np.int8)
            
            update_arrays(old_df)
            
            
        # Keep the first 8 columns of this batch for the next iteration's update.
        old_df = new_df.iloc[:,:8].copy()
        update_user_map(new_df.user_id.unique())
        # NOTE(review): get_features drops non-question rows; the assignment
        # below assumes `sample` has exactly one row per question row - confirm.
        X = get_features(new_df, True)

        # Equal-weight average of the two models' positive-class probabilities.
        sample['answered_correctly'] =  (
            model1.predict_proba(X)[:,1]/2 + model2.predict_proba(X)[:,1]/2
        )
        
        env.predict(sample)

CPU times: user 276 ms, sys: 55.1 ms, total: 331 ms
Wall time: 595 ms
