In [19]:
import os
import sys
import gc
import glob
import time
from os import listdir

import tqdm
from typing import Dict

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd


In [20]:
folder_path = '/Users/hesu/Documents/KT/riiid/'
train_csv = folder_path + 'train.csv'
test_csv =  folder_path + 'example_test.csv'
lec_csv  =  folder_path + 'lectures.csv'
que_csv =   folder_path + 'questions.csv'
sample_csv =    folder_path + 'example_sample_submission.csv'

data_types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int32',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
     'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float',
    'prior_question_had_explanation': 'boolean'
}

In [21]:
train_df = pd.read_csv(train_csv, 
                       nrows=1000000)

question_df = pd.read_csv(que_csv)


In [22]:
train_df = pd.merge(train_df, question_df, left_on='content_id', right_on='question_id',how='left')

In [23]:
train_df.drop(['question_id'], axis=1)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,correct_answer,part,tags
0,0,0,115,5692,0,1,3,1,,,5692.0,3.0,5.0,151
1,1,56943,115,5716,0,2,2,1,37000.0,False,5716.0,2.0,5.0,168
2,2,118363,115,128,0,0,0,1,55000.0,False,128.0,0.0,1.0,131 149 92
3,3,131167,115,7860,0,3,0,1,19000.0,False,7860.0,0.0,1.0,131 104 81
4,4,137965,115,7922,0,4,1,1,11000.0,False,7922.0,1.0,1.0,131 149 92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,26482248,20949024,8803,0,29,1,1,14000.0,True,8803.0,1.0,5.0,170
999996,999996,26516686,20949024,4664,0,30,3,1,17000.0,True,4664.0,3.0,5.0,34
999997,999997,26537967,20949024,4108,0,31,1,0,18000.0,True,4108.0,0.0,5.0,23
999998,999998,26590240,20949024,5014,0,32,3,0,6000.0,True,5014.0,0.0,5.0,181


In [24]:
train_df = train_df[train_df.content_type_id==False]
train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop=True)
train_df.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,0,0,115,5692,0,1,3,1,,,5692.0,5692.0,3.0,5.0,151
1,924336,0,19437777,7900,0,0,0,1,,,7900.0,7900.0,0.0,1.0,131 93 81
2,722435,0,15244238,4356,0,0,1,1,,,4356.0,4356.0,1.0,5.0,14
3,722405,0,15240250,7900,0,0,2,0,,,7900.0,7900.0,0.0,1.0,131 93 81
4,32725,0,786789,7900,0,0,0,1,,,7900.0,7900.0,0.0,1.0,131 93 81
5,403313,0,7872136,7900,0,0,0,1,,,7900.0,7900.0,0.0,1.0,131 93 81
6,403383,0,7876172,7900,0,0,0,1,,,7900.0,7900.0,0.0,1.0,131 93 81
7,32777,0,800824,5346,0,0,3,0,,,5346.0,5346.0,1.0,5.0,55
8,722064,0,15238770,7900,0,0,1,0,,,7900.0,7900.0,0.0,1.0,131 93 81
9,171161,0,3646751,8494,0,0,2,0,,,8494.0,8494.0,1.0,5.0,108


In [25]:
len(train_df)

980093

In [26]:
train_df["content_id"] = train_df["content_id"].astype('str')
train_df["prior_question_elapsed_time"] = train_df["prior_question_elapsed_time"].astype('str')
train_df["prior_question_had_explanation"] = train_df["prior_question_had_explanation"].astype('str')
train_df["part"] = train_df["part"].astype('str')
train_df["answered_correctly"] = train_df["answered_correctly"].astype('str')

In [27]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980093 entries, 0 to 980092
Data columns (total 15 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   row_id                          980093 non-null  int64  
 1   timestamp                       980093 non-null  int64  
 2   user_id                         980093 non-null  int64  
 3   content_id                      980093 non-null  object 
 4   content_type_id                 980093 non-null  int64  
 5   task_container_id               980093 non-null  int64  
 6   user_answer                     980093 non-null  int64  
 7   answered_correctly              980093 non-null  object 
 8   prior_question_elapsed_time     980093 non-null  object 
 9   prior_question_had_explanation  980093 non-null  object 
 10  question_id                     980093 non-null  float64
 11  bundle_id                       980093 non-null  float64
 12  correct_answer  

In [64]:
result_user3 =  train_df.groupby('user_id')
combine = result_user3.agg({'content_id':','.join, 'answered_correctly':','.join,'part':','.join,
                            'prior_question_elapsed_time':','.join}).reset_index()

In [65]:
type(combine)

pandas.core.frame.DataFrame

In [66]:
combine.columns = ['all_content_id','all_answer','all_part','all_prior_elapsed']

ValueError: Length mismatch: Expected axis has 5 elements, new values have 4 elements

In [67]:
combine.head(10)

Unnamed: 0,user_id,content_id,answered_correctly,part,prior_question_elapsed_time
0,115,"5692,5716,128,7860,7922,156,51,50,7896,7863,15...","1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,...","5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1....","nan,37000.0,55000.0,19000.0,11000.0,5000.0,170..."
1,124,"7900,7876,175,1278,2063,2064,2065,3364,3365,33...","1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...","1.0,1.0,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4....","nan,26000.0,29000.0,26000.0,18000.0,18000.0,18..."
2,2746,"5273,758,5976,236,404,382,405,873,531,775,294,...",0001011110110101101,"5.0,2.0,5.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2....","nan,28000.0,17000.0,24000.0,20000.0,16000.0,16..."
3,5382,"5000,3944,217,5844,5965,4990,5235,6050,5721,55...","1,0,1,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,...","5.0,5.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1....","nan,24000.0,35000.0,88000.0,18000.0,12000.0,50..."
4,8623,"3915,4750,6456,3968,6104,5738,6435,5498,6102,4...","1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,...","5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2....","nan,16000.0,33000.0,30000.0,40000.0,35000.0,30..."
5,8701,"3901,6671,4963,6143,8279,3964,4002,754,1110,77...",11101000100111011,"5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2....","nan,13000.0,15000.0,24000.0,25000.0,44000.0,17..."
6,12741,"5145,9691,9697,5202,4787,5695,7858,5653,5889,4...","0,1,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,1,...","5.0,5.0,5.0,5.0,5.0,5.0,1.0,5.0,5.0,5.0,5.0,6....","nan,13000.0,18000.0,29000.0,35000.0,15000.0,21..."
7,13134,"3926,564,3865,4231,3684,3988,3968,5219,4447,61...","1,0,0,1,1,0,0,1,1,0,1,1,1,0,1,1,0,1,0,1,0,1,1,...","5.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5....","nan,22000.0,18000.0,19000.0,13000.0,43000.0,65..."
8,24418,"7900,7876,175,1278,2063,2064,2065,3363,3365,33...","0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,1,...","1.0,1.0,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4....","nan,30000.0,20000.0,24000.0,17000.0,17000.0,17..."
9,24600,"7900,7876,175,1278,2063,2064,2065,3365,3363,33...","1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...","1.0,1.0,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4....","nan,24000.0,23000.0,22000.0,18000.0,18000.0,18..."


In [68]:
len(combine)

3824

In [69]:
train_df.nunique(axis=0)

row_id                            980093
timestamp                         749458
user_id                             3824
content_id                         13076
content_type_id                        1
task_container_id                   7740
user_answer                            4
answered_correctly                     2
prior_question_elapsed_time         1660
prior_question_had_explanation         3
question_id                        13076
bundle_id                           9527
correct_answer                         4
part                                   7
tags                                1501
dtype: int64

In [70]:
print(type(combine))

<class 'pandas.core.frame.DataFrame'>


In [71]:
combine.head(10)

Unnamed: 0,user_id,content_id,answered_correctly,part,prior_question_elapsed_time
0,115,"5692,5716,128,7860,7922,156,51,50,7896,7863,15...","1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,...","5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1....","nan,37000.0,55000.0,19000.0,11000.0,5000.0,170..."
1,124,"7900,7876,175,1278,2063,2064,2065,3364,3365,33...","1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...","1.0,1.0,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4....","nan,26000.0,29000.0,26000.0,18000.0,18000.0,18..."
2,2746,"5273,758,5976,236,404,382,405,873,531,775,294,...",0001011110110101101,"5.0,2.0,5.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2....","nan,28000.0,17000.0,24000.0,20000.0,16000.0,16..."
3,5382,"5000,3944,217,5844,5965,4990,5235,6050,5721,55...","1,0,1,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,...","5.0,5.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1....","nan,24000.0,35000.0,88000.0,18000.0,12000.0,50..."
4,8623,"3915,4750,6456,3968,6104,5738,6435,5498,6102,4...","1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,1,...","5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2....","nan,16000.0,33000.0,30000.0,40000.0,35000.0,30..."
5,8701,"3901,6671,4963,6143,8279,3964,4002,754,1110,77...",11101000100111011,"5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2....","nan,13000.0,15000.0,24000.0,25000.0,44000.0,17..."
6,12741,"5145,9691,9697,5202,4787,5695,7858,5653,5889,4...","0,1,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,1,...","5.0,5.0,5.0,5.0,5.0,5.0,1.0,5.0,5.0,5.0,5.0,6....","nan,13000.0,18000.0,29000.0,35000.0,15000.0,21..."
7,13134,"3926,564,3865,4231,3684,3988,3968,5219,4447,61...","1,0,0,1,1,0,0,1,1,0,1,1,1,0,1,1,0,1,0,1,0,1,1,...","5.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5....","nan,22000.0,18000.0,19000.0,13000.0,43000.0,65..."
8,24418,"7900,7876,175,1278,2063,2064,2065,3363,3365,33...","0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,1,...","1.0,1.0,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4....","nan,30000.0,20000.0,24000.0,17000.0,17000.0,17..."
9,24600,"7900,7876,175,1278,2063,2064,2065,3365,3363,33...","1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...","1.0,1.0,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4....","nan,24000.0,23000.0,22000.0,18000.0,18000.0,18..."


In [81]:
d = dict()
index = 0
for row in combine.itertuples():
    cid = getattr(row, 'prior_question_elapsed_time')
#     print(cid)
    splits = str(cid).strip().split(',')
    for split in splits:
        if split in d.keys():
            continue
        d[split] = index
        index += 1
d['unknown'] = index+1


In [82]:
d

{'nan': 0,
 '37000.0': 1,
 '55000.0': 2,
 '19000.0': 3,
 '11000.0': 4,
 '5000.0': 5,
 '17000.0': 6,
 '16000.0': 7,
 '22000.0': 8,
 '23000.0': 9,
 '21000.0': 10,
 '24000.0': 11,
 '20000.0': 12,
 '18000.0': 13,
 '29000.0': 14,
 '15000.0': 15,
 '14333.0': 16,
 '26000.0': 17,
 '33333.0': 18,
 '21666.0': 19,
 '32000.0': 20,
 '27000.0': 21,
 '14000.0': 22,
 '7000.0': 23,
 '6500.0': 24,
 '28000.0': 25,
 '13000.0': 26,
 '35000.0': 27,
 '88000.0': 28,
 '12000.0': 29,
 '92000.0': 30,
 '70000.0': 31,
 '79000.0': 32,
 '66000.0': 33,
 '30000.0': 34,
 '34000.0': 35,
 '25000.0': 36,
 '78000.0': 37,
 '36000.0': 38,
 '43000.0': 39,
 '38000.0': 40,
 '64000.0': 41,
 '61000.0': 42,
 '82000.0': 43,
 '33000.0': 44,
 '50000.0': 45,
 '9000.0': 46,
 '57000.0': 47,
 '85000.0': 48,
 '81000.0': 49,
 '53000.0': 50,
 '49000.0': 51,
 '90000.0': 52,
 '48000.0': 53,
 '62000.0': 54,
 '51000.0': 55,
 '93000.0': 56,
 '201000.0': 57,
 '41000.0': 58,
 '72000.0': 59,
 '39000.0': 60,
 '54000.0': 61,
 '40000.0': 62,
 '46000.0

In [83]:
len(d)

1661

In [42]:
a = train_df.groupby('user_id').reset_index()

AttributeError: 'DataFrameGroupBy' object has no attribute 'reset_index'

Unnamed: 0,user_id,count
0,115,46
1,124,30
2,2746,19
3,5382,125
4,8623,109
5,8701,17
6,12741,265
7,13134,1243
8,24418,6283
9,24600,50


In [15]:
x = pd.DataFrame()

In [16]:
for i in range(15):
    last_records = train_df.drop_duplicates('user_id',keep='last')
    train_df = train_df[~train_df.index.isin(last_records.index)]
    x = x.append(last_records)

In [17]:
x.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
1433,965193,0,19887248,6368,0,0,3,0,,,6368.0,6368.0,2.0,5.0,134
2989,87415,0,1880240,192,0,0,1,1,,,192.0,192.0,1.0,1.0,131 104 38
4957,103084,22153,2148001,9320,0,1,0,0,21000.0,False,9320.0,9320.0,3.0,5.0,55
6989,783332,32301,15960740,7924,0,1,0,0,77000.0,False,7924.0,7924.0,2.0,1.0,131 40 38
8830,87902,48795,1946295,175,0,2,2,1,20000.0,False,175.0,175.0,2.0,1.0,9 10 92
10004,585088,55747,11814782,10395,0,10,0,0,3000.0,False,10395.0,10395.0,2.0,1.0,51 131 81
10243,24955,57373,457531,7216,0,15,0,0,750.0,False,7216.0,7216.0,3.0,7.0,97 50 21
11193,842626,64507,17465601,2595,0,7,3,0,1333.0,False,2595.0,2593.0,0.0,4.0,82 103 102
16243,158836,107800,3258925,880,0,11,0,1,0.0,False,880.0,880.0,0.0,2.0,69 137 88 162 38 81
16295,664658,108242,13483292,7219,0,15,1,0,2000.0,False,7219.0,7216.0,2.0,7.0,50 19 21


In [18]:
len(x)

56693