# About this notebook
- This is the notebook for the final submission.
- In order to meet memory and time constraints, we used relatively few features.
- The features are described in detail in the following notebook:  
https://www.kaggle.com/tkyiws/riiid-feature-engineering

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import lightgbm as lgb
from collections import defaultdict
import gc
import pickle
import joblib
import riiideducation

In [None]:
# Single source of truth for the model inputs: (column name, dtype) pairs.
# The order of this table is the column order handed to the model.
_FEATURE_SPECS = (
    ('content_id', 'int16'),
    ('prior_question_elapsed_time', 'float64'),
    ('prior_question_had_explanation', 'bool'),
    ('user_correctness', 'float32'),
    ('content_count', 'int32'),
    ('part', 'int8'),
    ('content_mean', 'float32'),
    ('cumcount_u', 'uint16'),
    ('cumcount_p', 'uint16'),
    ('attempt', 'uint16'),
    ('part_avg', 'float32'),
    ('timestamp_diff1', 'float64'),
    ('timestamp_diff2', 'float64'),
    ('cluster_id', 'int8'),
    ('cumcount_cl', 'uint16'),
    ('target_lag', 'int8'),
    ('cluster0_avg', 'float32'),
    ('cluster1_avg', 'float32'),
    ('cluster2_avg', 'float32'),
    ('prior_tag', 'int16'),
    ('task_num', 'int8'),
    ('user_rating', 'float32'),
    ('time_mean_diff', 'float32'),
)

# Ordered list of feature columns passed to ``model.predict``.
features = [name for name, _ in _FEATURE_SPECS]

# Name of the label column.
target = 'answered_correctly'

# Column -> dtype mapping applied with ``DataFrame.astype`` before prediction.
features_dtypes = dict(_FEATURE_SPECS)

# Data Loading

In [None]:
# Per-user timestamp history consumed by get_timestamp_diff: time_dict1 holds
# the most recent timestamp seen for a user, time_dict2 the one before it and
# time_dict3 the one before that.
time_dict1 = joblib.load("../input/riiid-inf-data/time_dict1.pkl.zip")
time_dict2 = joblib.load("../input/riiid-inf-data/time_dict2.pkl.zip")
time_dict3 = joblib.load("../input/riiid-inf-data/time_dict3.pkl.zip")

# Lookup tables merged onto every test batch (questions on
# content_id == question_id, lectures on content_id and content_type_id).
questions_df = pd.read_pickle('../input/riiid-inf-data/questions_df.pickle')
lectures_df = pd.read_pickle('../input/riiid-inf-data/lectures_df.pickle')

# Fallback averages used by get_user_data when a user has no history for a
# part / cluster. Presumably pandas Series (they are indexed and support
# .mean()) -- verify against the feature-engineering notebook.
part_null_data = pd.read_pickle('../input/riiid-inf-data/part_null_data.pickle')
cluster_null_data = pd.read_pickle('../input/riiid-inf-data/cluster_null_data.pickle')

In [None]:
# Running statistics that are updated after every batch in the submission loop.
# NOTE(review): the loop applies ``+=`` to keys that may be missing, so these
# are presumably defaultdicts -- confirm against how they were saved.

# Per-user sum of answered_correctly and number of answered questions.
user_dict_sum = joblib.load("../input/riiid-dict-data/user_dict_sum.pkl.zip")
user_dict_count = joblib.load("../input/riiid-dict-data/user_dict_count.pkl.zip")

# Same statistics keyed by (user_id, part).
part_dict_sum = joblib.load("../input/riiid-dict-data/part_dict_sum.pkl.zip")
part_dict_count = joblib.load("../input/riiid-dict-data/part_dict_count.pkl.zip")

# Same statistics keyed by (user_id, cluster_id).
cluster_dict_sum = joblib.load("../input/riiid-dict-data/cluster_dict_sum.pkl.zip")
cluster_dict_count = joblib.load("../input/riiid-dict-data/cluster_dict_count.pkl.zip")

# lag_dict: last answered_correctly value per user (the target_lag feature).
lag_dict = joblib.load("../input/riiid-dict-data/lag_dict.pkl.zip")
# last_lecture_dict: most recent lecture tag per user (-1 when there is none).
last_lecture_dict = joblib.load("../input/riiid-dict-data/last_lecture_dict.pkl.zip")
# content_mean_sum_dict: per-user running sum of content_mean (for user_rating).
content_mean_sum_dict = joblib.load("../input/riiid-dict-data/content_mean_sum_dict.pkl.zip")
# time_adm_dict: per-user running sum of the capped timestamp_diff1 values.
time_adm_dict = joblib.load("../input/riiid-dict-data/time_adm_dict.pkl.zip")

# Preprocess

In [None]:
def get_state(path='../input/riiid-inf-data/state_data.pickle'):
    """Build the per-user content-attempt history used by ``get_attempt``.

    Reads a pickled DataFrame with ``user_id`` and ``content_id`` columns and
    counts how often each user has already seen each content item.

    Args:
        path: Location of the pickled interaction DataFrame. Defaults to the
            path that used to be hard-coded, so existing calls are unchanged.

    Returns:
        dict mapping each ``user_id`` to
        ``{'user_content_attempts': {content_id: times_seen - 1}}``.
    """
    data = pd.read_pickle(path)

    # Sorted, de-duplicated content ids per user (np.unique already sorts).
    user_content = data.groupby('user_id')['content_id'].apply(np.array).apply(np.unique)
    # Occurrence count per (user, content), regrouped into one array per user.
    # The multi-key groupby sorts by content_id within a user, so every array
    # lines up element-wise with the matching ``user_content`` entry.
    # NOTE(review): the uint8 cast wraps for counts above 255; kept as-is so
    # the feature matches what the model was trained on.
    user_attempts = (
        data.groupby(['user_id', 'content_id'])['content_id']
        .count()
        .astype(np.uint8)
        .groupby('user_id')
        .apply(np.array)
        .values
    )
    # Store "previous attempts": the first sighting counts as attempt 0.
    user_attempts -= 1

    del data
    gc.collect()

    # Take the user ids from the groupby index rather than from
    # ``data['user_id'].unique()``: the groupby results are sorted by user_id
    # while ``unique()`` keeps order of appearance, so zipping against the
    # latter attaches histories to the wrong users on unsorted input.
    state = {}
    for user_id, content, attempt in zip(user_content.index, user_content, user_attempts):
        state[user_id] = {'user_content_attempts': dict(zip(content, attempt))}

    del user_content, user_attempts
    gc.collect()

    return state

# Module-level attempt history; get_attempt reads it and updates it in place.
state = get_state()

In [None]:
def get_attempt(test):
    """Return the ``attempt`` feature for every row of ``test``.

    For each (user_id, content_id) pair the global ``state`` is updated in
    place: a pair seen for the first time gets 0, a repeated pair is
    incremented by one and capped at 6.

    Args:
        test: DataFrame with ``user_id`` and ``content_id`` columns.

    Returns:
        list with one attempt value per row, in row order.
    """
    attempts = []

    for user_id, content_id in test[['user_id', 'content_id']].values:
        user_state = state.get(user_id)

        if user_state is None:
            # Unknown user: start a fresh history with this content at 0.
            state[user_id] = {'user_content_attempts': {content_id: 0}}
            attempts.append(state[user_id]['user_content_attempts'][content_id])
            continue

        history = user_state['user_content_attempts']
        if content_id in history:
            history[content_id] = min(6, history[content_id] + 1)
        else:
            history[content_id] = 0
        attempts.append(history[content_id])

    return attempts

In [None]:
def get_timestamp_diff(test):
    """Compute the time elapsed since a user's previous two timestamps.

    Uses and updates the global ``time_dict1`` / ``time_dict2`` /
    ``time_dict3``, which hold each user's latest, second-latest and
    third-latest distinct timestamps.

    * New user: both differences are NaN and the history is initialised.
    * ``timestamp`` later than the latest stored one: differences are taken
      against the latest and second-latest timestamps, then the history is
      shifted by one.
    * Otherwise (row does not advance the user's clock): differences are
      taken against the second- and third-latest timestamps and the history
      is left untouched.

    Missing history entries are NaN, and NaN propagates through the
    subtraction. The previous ``x is np.nan`` identity checks are therefore
    unnecessary, and they were unreliable too: a NaN restored from a pickle
    or stored as ``np.float64`` is not the ``np.nan`` singleton.

    Args:
        test: DataFrame with ``user_id`` and ``timestamp`` columns.

    Returns:
        Tuple ``(timestamp_diff1, timestamp_diff2)`` of lists, one value per
        row in row order.
    """
    timestamp_diff1 = []
    timestamp_diff2 = []

    for user_id, timestamp in test[['user_id', 'timestamp']].values:
        if user_id not in time_dict1:
            timestamp_diff1.append(np.nan)
            timestamp_diff2.append(np.nan)
            time_dict1[user_id] = timestamp
            time_dict2[user_id] = np.nan
            time_dict3[user_id] = np.nan
            continue

        if timestamp > time_dict1[user_id]:
            timestamp_diff1.append(timestamp - time_dict1[user_id])
            timestamp_diff2.append(timestamp - time_dict2[user_id])
            # Shift the history: latest -> second -> third.
            time_dict3[user_id] = time_dict2[user_id]
            time_dict2[user_id] = time_dict1[user_id]
            time_dict1[user_id] = timestamp
        else:
            timestamp_diff1.append(timestamp - time_dict2[user_id])
            timestamp_diff2.append(timestamp - time_dict3[user_id])

    return timestamp_diff1, timestamp_diff2

In [None]:
def _null_or_mean(null_data, key):
    """Return ``null_data[key]``, or ``null_data.mean()`` when the lookup fails."""
    try:
        return null_data[key]
    except (LookupError, TypeError, ValueError):
        return null_data.mean()


def get_user_data(test):
    """Build the per-user history features for every row of ``test``.

    Reads (never writes) the global running-statistic dictionaries
    ``user_dict_*``, ``part_dict_*``, ``cluster_dict_*``, ``lag_dict`` and
    ``content_mean_sum_dict``, plus the fallback tables ``part_null_data``
    and ``cluster_null_data``.

    Args:
        test: DataFrame with ``user_id``, ``part``, ``cluster_id`` and
            ``content_mean`` columns.

    Returns:
        Tuple of ten lists, one value per row, in this order:
        ``user_correctness, cumcount_u, part_avg, cumcount_p, cluster0_avg,
        cluster1_avg, cluster2_avg, cumcount_cl, target_lag, user_rating``.
    """
    user_correctness = []
    cumcount_u = []
    part_avg = []
    cumcount_p = []
    cluster0_avg = []
    cluster1_avg = []
    cluster2_avg = []
    cumcount_cl = []
    target_lag = []
    user_rating = []

    # The cluster fallbacks do not depend on the row, so look them up once.
    cluster_nulls = [_null_or_mean(cluster_null_data, c) for c in (0, 1, 2)]
    cluster_avgs = (cluster0_avg, cluster1_avg, cluster2_avg)

    for user_id, part, cluster_id, content_mean in test[['user_id', 'part', 'cluster_id', 'content_mean']].values:
        if user_id in user_dict_sum:
            user_correctness.append(user_dict_sum[user_id] / user_dict_count[user_id])
            cumcount_u.append(min(7500, user_dict_count[user_id]))
            user_rating.append((user_dict_sum[user_id] - content_mean_sum_dict[user_id]) / user_dict_count[user_id])
        else:
            # Defaults for users without any history.
            user_correctness.append(0.68)
            cumcount_u.append(0)
            user_rating.append(0)

        k = (user_id, part)
        if k in part_dict_sum:
            part_avg.append(part_dict_sum[k] / part_dict_count[k])
            cumcount_p.append(min(7500, part_dict_count[k]))
        else:
            # The fallback is only looked up when it is actually needed.
            part_avg.append(_null_or_mean(part_null_data, part))
            cumcount_p.append(0)

        k = (user_id, cluster_id)
        if k in cluster_dict_sum:
            cumcount_cl.append(min(7500, cluster_dict_count[k]))
        else:
            cumcount_cl.append(0)

        # The user's accuracy on clusters 0, 1 and 2, whatever the row's cluster.
        for cluster, (avg_list, null_value) in enumerate(zip(cluster_avgs, cluster_nulls)):
            k = (user_id, cluster)
            if k in cluster_dict_sum:
                avg_list.append(cluster_dict_sum[k] / cluster_dict_count[k])
            else:
                avg_list.append(null_value)

        if user_id in lag_dict:
            target_lag.append(lag_dict[user_id])
        else:
            target_lag.append(1)

    return user_correctness, cumcount_u, part_avg, cumcount_p, cluster0_avg, cluster1_avg, cluster2_avg, cumcount_cl, target_lag, user_rating

In [None]:
def get_prior_tag(test):
    """Resolve the ``prior_tag`` feature for every row of ``test``.

    A row whose ``prior_tag`` is -1 takes the tag remembered for that user in
    the global ``last_lecture_dict`` (an unknown user is registered with -1).
    Any other value is kept and also stored as the user's remembered tag.

    Args:
        test: DataFrame with ``user_id`` and ``prior_tag`` columns.

    Returns:
        list with the resolved tag for each row, in row order.
    """
    resolved = []

    for user_id, tag in test[['user_id', 'prior_tag']].values:
        if tag != -1:
            # An explicit tag on the row wins and becomes the remembered one.
            last_lecture_dict[user_id] = tag
            resolved.append(tag)
            continue

        # No tag on this row: reuse the remembered one, registering -1 for
        # users that have not been seen yet.
        resolved.append(last_lecture_dict.setdefault(user_id, -1))

    return resolved

In [None]:
def update_last_lecture_dict(df):
    """Remember, per user, the lecture tag of that user's last row in ``df``.

    The last row of each user is taken; if its ``lecture_tag`` is missing,
    -1 is stored instead. Results are written to the global
    ``last_lecture_dict``. ``df`` itself is not modified.

    Args:
        df: DataFrame with ``user_id`` and ``lecture_tag`` columns.
    """
    last_rows = df.groupby('user_id').tail(1)[['user_id', 'lecture_tag']]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form is a no-op under
    # pandas Copy-on-Write and triggers SettingWithCopy warnings before it.
    last_rows = last_rows.assign(lecture_tag=last_rows['lecture_tag'].fillna(-1))

    for user_id, lecture_tag in last_rows.values:
        last_lecture_dict[user_id] = lecture_tag

In [None]:
def get_time_mean_diff(test):
    """Return, per row, the capped time gap minus the user's mean capped gap.

    ``timestamp_diff1`` is capped at 100000. For a user already present in
    the global ``time_adm_dict`` the feature is the capped value minus
    ``time_adm_dict[user] / user_dict_count[user]``; for other users it is 0.
    ``time_adm_dict`` is only updated after all rows have been evaluated, so
    every row of the batch is compared against the pre-batch statistics.

    Args:
        test: DataFrame with ``user_id`` and ``timestamp_diff1`` columns.

    Returns:
        list of feature values, one per row in row order.
    """
    # (user, capped gap) pairs, reused for the feature and for the update.
    capped_rows = [
        (user_id, min(100000, timestamp_diff1))
        for user_id, timestamp_diff1 in test[['user_id', 'timestamp_diff1']].values
    ]

    time_mean_diff = []
    for user_id, time_adm in capped_rows:
        if user_id not in time_adm_dict:
            time_mean_diff.append(0)
        else:
            time_mean_diff.append(time_adm - time_adm_dict[user_id] / user_dict_count[user_id])

    # dict update (deferred until every row has been scored)
    for user_id, time_adm in capped_rows:
        time_adm_dict[user_id] += time_adm

    return time_mean_diff

# Submission

In [None]:
# Create the competition environment and the iterator that yields test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Previous batch; its labels arrive with the next batch and are then used to
# update the running statistics.
prior_test_df = None
# Load the LightGBM booster from its saved model file.
model = lgb.Booster(model_file='../input/riiid-model-42v1/model42v1.txt')

In [None]:
# Run a garbage-collection pass before entering the prediction loop.
gc.collect()

In [None]:
%%time
# Main inference loop: for every test batch, (1) fold the labels of the
# previous batch into the running statistics, (2) build the features for the
# current batch, (3) predict and submit.
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # The first row of each batch carries the labels of the previous batch
        # as a string.
        # NOTE(security): eval() executes arbitrary code contained in that
        # string; it is kept because the data comes from the competition
        # environment, but a literal parser (ast.literal_eval / json.loads)
        # would be the safer choice.
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        # Rows labelled -1 carry no answer and are dropped.
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)

        # dict update
        for user_id, content_id, part, cluster_id, answered_correctly, content_mean in prior_test_df[['user_id', 'content_id', 'part', 'cluster_id', 'answered_correctly', 'content_mean']].values:
            user_dict_sum[user_id] += answered_correctly
            user_dict_count[user_id] += 1

            part_dict_sum[(user_id, part)] += answered_correctly
            part_dict_count[(user_id, part)] += 1

            cluster_dict_sum[(user_id, cluster_id)] += answered_correctly
            cluster_dict_count[(user_id, cluster_id)] += 1

            lag_dict[user_id] = answered_correctly

            content_mean_sum_dict[user_id] += content_mean

    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df = pd.merge(test_df, lectures_df, on=['content_id', 'content_type_id'], how='left')
    # Keep the full batch (lectures included) for the next iteration's update.
    prior_test_df = test_df.copy()
    # Each row gets the lecture_tag of the same user's preceding row in this
    # batch; -1 marks "no preceding lecture tag".
    # Assign fillna() results back instead of using inplace=True on a column
    # selection, which is a no-op under pandas Copy-on-Write.
    test_df['prior_tag'] = test_df.groupby('user_id')['lecture_tag'].shift()
    test_df['prior_tag'] = test_df['prior_tag'].fillna(-1)
    # Only question rows (content_type_id == 0) are predicted.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)

    # make data
    test_df['user_correctness'], test_df['cumcount_u'], \
    test_df['part_avg'], test_df['cumcount_p'], \
    test_df['cluster0_avg'], test_df['cluster1_avg'], test_df['cluster2_avg'], test_df['cumcount_cl'], \
    test_df['target_lag'], test_df['user_rating'] = get_user_data(test_df)

    test_df['timestamp_diff1'], test_df['timestamp_diff2'] = get_timestamp_diff(test_df)
    test_df['attempt'] = get_attempt(test_df)
    test_df['prior_tag'] = get_prior_tag(test_df)
    # Must run after get_prior_tag: it overwrites the remembered lecture tag
    # with the one on each user's last row of this batch.
    update_last_lecture_dict(prior_test_df)

    # Normalise the time gaps by task_num (a column expected to come from the
    # questions_df merge).
    test_df['timestamp_diff1'] = test_df['timestamp_diff1'] / test_df['task_num']
    test_df['timestamp_diff2'] = test_df['timestamp_diff2'] / test_df['task_num']

    # missing value
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(22000.)
    test_df['timestamp_diff1'] = test_df['timestamp_diff1'].fillna(25572.)
    test_df['timestamp_diff2'] = test_df['timestamp_diff2'].fillna(53309.)

    # Needs the filled timestamp_diff1, so it runs after the imputation above.
    test_df['time_mean_diff'] = get_time_mean_diff(test_df)

    # dtype
    test_df = test_df.astype(features_dtypes)

    # predict
    test_df['answered_correctly'] = model.predict(test_df[features].values)

    # test_df already contains only question rows, so no further filtering.
    env.predict(test_df[['row_id', 'answered_correctly']])

In [None]:
# Show the feature values of the first rows of the last processed batch.
test_df[features].head(10)