In [3]:
import pandas as pd
import numpy as np 
import os
from riiid_feature_maker import *

# Relative path two directory levels above the current working directory.
# The data-loading cells join "input/riiid-test-answer-prediction/<file>" onto it.
base_dir = os.path.join("..","..")

In [None]:
# Read train.csv from the competition input directory under base_dir.
# nrows caps the load at the first 50,000,000 rows instead of the whole file.
train_df = pd.read_csv(os.path.join(base_dir,"input","riiid-test-answer-prediction","train.csv"),nrows=50000000)

In [3]:
from collections import defaultdict
from tqdm import tqdm

def init_user_dict(train_df):
    """Build per-user answer statistics from a training frame.

    Rows whose ``answered_correctly`` equals -1 are excluded before the
    statistics are aggregated.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``user_id`` and ``answered_correctly`` columns.

    Returns
    -------
    tuple of collections.defaultdict
        ``(user_sum_dict, user_count_dict, user_avg_dict)``, each keyed by
        ``user_id``. Looking up a user that was not in ``train_df`` yields 0.
    """
    scored = train_df.loc[train_df["answered_correctly"] != -1,
                          ["user_id", "answered_correctly"]]

    # One row per user, indexed by user_id, with the three aggregates as columns.
    per_user = scored.groupby("user_id")["answered_correctly"].agg(["sum", "count", "mean"])
    user_ids = per_user.index.values

    def as_lookup(stat):
        # defaultdict(int): unseen users read as 0 instead of raising KeyError.
        return defaultdict(int, zip(user_ids, per_user[stat].values))

    return (as_lookup("sum"), as_lookup("count"), as_lookup("mean"))

def update_user_dict(train_df, user_sum_dict, user_count_dict, user_avg_dict):
    """Fold newly observed answers into the running per-user statistics.

    Rows whose ``answered_correctly`` equals -1 are skipped. The three
    dictionaries are updated in place and also returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``user_id`` and ``answered_correctly`` columns.
    user_sum_dict, user_count_dict, user_avg_dict : dict-like
        Running sum, count and mean of ``answered_correctly`` keyed by
        ``user_id``. The sum and count lookups must supply a default for
        unseen users (e.g. ``defaultdict(int)``), because they are updated
        with ``+=``.

    Returns
    -------
    tuple
        ``(user_sum_dict, user_count_dict, user_avg_dict)`` - the same
        objects that were passed in.
    """
    answered = train_df.loc[train_df["answered_correctly"] != -1,
                            ["user_id", "answered_correctly"]]

    # Iterate over plain Python lists instead of DataFrame.iterrows():
    # iterrows builds a Series for every row, which dominates the runtime.
    # tolist() also converts to Python scalars, so a narrow integer column
    # (e.g. int8) cannot overflow the running sum for a new user.
    user_ids = answered["user_id"].tolist()
    outcomes = answered["answered_correctly"].tolist()

    for user_id, outcome in tqdm(zip(user_ids, outcomes), total=len(user_ids)):
        user_sum_dict[user_id] += outcome                                          # sum
        user_count_dict[user_id] += 1                                              # count
        user_avg_dict[user_id] = user_sum_dict[user_id] / user_count_dict[user_id]  # average

    return (user_sum_dict, user_count_dict, user_avg_dict)



def add_UserFeatures(df, user_dict):
    """Attach per-user answer statistics to ``df`` as three feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user_id`` column. It is not modified; the result is a
        new frame produced by ``pd.merge``, so the original index is not kept.
    user_dict : sequence
        ``user_dict[0]``, ``user_dict[1]`` and ``user_dict[2]`` are the sum,
        count and mean lookups keyed by ``user_id`` (the order returned by
        ``init_user_dict`` / ``update_user_dict``).

    Returns
    -------
    tuple
        ``(new_features, df)`` where ``new_features`` lists the added column
        names (mean, sum, count) and ``df`` is the frame with the columns
        appended in sum, count, mean order. Users missing from a lookup get
        that column's mean over ``df``.
    """
    cols = ["answered_correctly_users_sum",
            "answered_correctly_users_count",
            "answered_correctly_users_mean"]

    # If df already carries these columns (e.g. the cell was re-run), merging
    # again would rename them with _x/_y suffixes and the column selection
    # below would raise KeyError. Drop the stale copies and recompute.
    stale = [col for col in cols if col in df.columns]
    if stale:
        df = df.drop(columns=stale)

    for position, col in enumerate(cols):
        # Two-column frame: user_id plus the statistic, ready for a left join.
        stats = pd.Series(user_dict[position], name=col).rename_axis("user_id").reset_index()
        df = pd.merge(df, stats, on="user_id", how="left")

    new_features = ["answered_correctly_users_mean",
                    "answered_correctly_users_sum",
                    "answered_correctly_users_count"]

    # Users absent from the lookups come back as NaN from the left join.
    df[new_features] = df[new_features].fillna(df[new_features].mean())

    return (new_features, df)

In [4]:
from collections import defaultdict
from tqdm import tqdm

def init_content_dict(train_df):
    """Build per-content answer statistics from a training frame.

    Rows whose ``answered_correctly`` equals -1 are excluded before the
    statistics are aggregated.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``content_id`` and ``answered_correctly`` columns.

    Returns
    -------
    tuple of collections.defaultdict
        ``(content_sum_dict, content_count_dict, content_avg_dict)``, each
        keyed by ``content_id``. Looking up a content id that was not in
        ``train_df`` yields 0.
    """
    scored = train_df.loc[train_df["answered_correctly"] != -1,
                          ["content_id", "answered_correctly"]]

    # One row per content_id, indexed by it, with the three aggregates as columns.
    per_content = scored.groupby("content_id")["answered_correctly"].agg(["sum", "count", "mean"])
    content_ids = per_content.index.values

    def as_lookup(stat):
        # defaultdict(int): unseen content ids read as 0 instead of raising KeyError.
        return defaultdict(int, zip(content_ids, per_content[stat].values))

    return (as_lookup("sum"), as_lookup("count"), as_lookup("mean"))
    

def update_content_dict(train_df, content_sum_dict, content_count_dict, content_avg_dict):
    """Fold newly observed answers into the running per-content statistics.

    Rows whose ``answered_correctly`` equals -1 are skipped. The three
    dictionaries are updated in place and also returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``content_id`` and ``answered_correctly`` columns.
    content_sum_dict, content_count_dict, content_avg_dict : dict-like
        Running sum, count and mean of ``answered_correctly`` keyed by
        ``content_id``. The sum and count lookups must supply a default for
        unseen ids (e.g. ``defaultdict(int)``), because they are updated
        with ``+=``.

    Returns
    -------
    tuple
        ``(content_sum_dict, content_count_dict, content_avg_dict)`` - the
        same objects that were passed in.
    """
    answered = train_df.loc[train_df["answered_correctly"] != -1,
                            ["content_id", "answered_correctly"]]

    # Iterate over plain Python lists instead of DataFrame.iterrows():
    # iterrows builds a Series for every row, which dominates the runtime.
    # tolist() also converts to Python scalars, so a narrow integer column
    # (e.g. int8) cannot overflow the running sum for a new content id.
    content_ids = answered["content_id"].tolist()
    outcomes = answered["answered_correctly"].tolist()

    for content_id, outcome in tqdm(zip(content_ids, outcomes), total=len(content_ids)):
        content_sum_dict[content_id] += outcome                                                # sum
        content_count_dict[content_id] += 1                                                    # count
        content_avg_dict[content_id] = content_sum_dict[content_id] / content_count_dict[content_id]  # average

    return (content_sum_dict, content_count_dict, content_avg_dict)


def add_ContentFeatures(df, content_dict):
    """Attach per-content answer statistics to ``df`` as three feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content_id`` column. It is not modified; the result is
        a new frame produced by ``pd.merge``, so the original index is not
        kept.
    content_dict : sequence
        ``content_dict[0]``, ``content_dict[1]`` and ``content_dict[2]`` are
        the sum, count and mean lookups keyed by ``content_id`` (the order
        returned by ``init_content_dict`` / ``update_content_dict``).

    Returns
    -------
    tuple
        ``(new_features, df)`` where ``new_features`` lists the added column
        names (mean, sum, count) and ``df`` is the frame with the columns
        appended in sum, count, mean order. Content ids missing from a lookup
        get that column's mean over ``df``.
    """
    cols = ["answered_correctly_contents_sum",
            "answered_correctly_contents_count",
            "answered_correctly_contents_mean"]

    # If df already carries these columns (e.g. the cell was re-run), merging
    # again would rename them with _x/_y suffixes and the column selection
    # below would raise KeyError. Drop the stale copies and recompute.
    stale = [col for col in cols if col in df.columns]
    if stale:
        df = df.drop(columns=stale)

    for position, col in enumerate(cols):
        # Two-column frame: content_id plus the statistic, ready for a left join.
        stats = pd.Series(content_dict[position], name=col).rename_axis("content_id").reset_index()
        df = pd.merge(df, stats, on="content_id", how="left")

    new_features = ["answered_correctly_contents_mean",
                    "answered_correctly_contents_sum",
                    "answered_correctly_contents_count"]

    # Content ids absent from the lookups come back as NaN from the left join.
    df[new_features] = df[new_features].fillna(df[new_features].mean())

    return (new_features, df)

In [5]:
# make_base_features is not defined in this notebook; presumably it is provided by
# the star import of riiid_feature_maker - TODO confirm.
# Only the second element of its return value is kept (as df); the first is discarded.
_, df = make_base_features(train_df)
df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly
0,0,0,115,5692,0,1,3,,,1
1,1,56943,115,5716,0,2,2,37000.0,0.0,1
2,2,118363,115,128,0,0,0,55000.0,0.0,1
3,3,131167,115,7860,0,3,0,19000.0,0.0,1
4,4,137965,115,7922,0,4,1,11000.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...
49999995,49999995,1445072,1059615240,7217,0,15,1,34500.0,0.0,0
49999996,49999996,1445072,1059615240,7219,0,15,0,34500.0,0.0,0
49999997,49999997,0,1059622238,7900,0,0,1,,,0
49999998,49999998,25373,1059622238,7876,0,1,2,32000.0,0.0,0


In [6]:
# Aggregate sum/count/mean of answered_correctly per content_id over train_df,
# then left-join those statistics onto df by content_id.
# NOTE(review): the statistics are computed from the same train_df that df was
# derived from, so each row's features presumably include that row's own
# answered_correctly value - confirm this is acceptable before using them for
# training or validation.
content_dict = init_content_dict(train_df)
_, df = add_ContentFeatures(df,content_dict)
df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly,answered_correctly_contents_sum,answered_correctly_contents_count,answered_correctly_contents_mean
0,0,0,115,5692,0,1,3,,,1,13273.0,17779.0,0.746555
1,1,56943,115,5716,0,2,2,37000.0,0.0,1,8670.0,11759.0,0.737308
2,2,118363,115,128,0,0,0,55000.0,0.0,1,9358.0,9671.0,0.967635
3,3,131167,115,7860,0,3,0,19000.0,0.0,1,10296.0,10770.0,0.955989
4,4,137965,115,7922,0,4,1,11000.0,0.0,1,9147.0,9566.0,0.956199
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49999995,49999995,1445072,1059615240,7217,0,15,1,34500.0,0.0,0,39306.0,78605.0,0.500045
49999996,49999996,1445072,1059615240,7219,0,15,0,34500.0,0.0,0,21251.0,78605.0,0.270352
49999997,49999997,0,1059622238,7900,0,0,1,,,0,73175.0,88703.0,0.824944
49999998,49999998,25373,1059622238,7876,0,1,2,32000.0,0.0,0,39039.0,93240.0,0.418694


In [7]:
# Aggregate sum/count/mean of answered_correctly per user_id over train_df,
# then left-join those statistics onto df by user_id.
# NOTE(review): the statistics are computed from the same train_df that df was
# derived from, so each row's features presumably include that row's own
# answered_correctly value - confirm this is acceptable before using them for
# training or validation.
user_dict = init_user_dict(train_df)
_, df = add_UserFeatures(df,user_dict)
df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly,answered_correctly_contents_sum,answered_correctly_contents_count,answered_correctly_contents_mean,answered_correctly_users_sum,answered_correctly_users_count,answered_correctly_users_mean
0,0,0,115,5692,0,1,3,,,1,13273.0,17779.0,0.746555,32,46,0.695652
1,1,56943,115,5716,0,2,2,37000.0,0.0,1,8670.0,11759.0,0.737308,32,46,0.695652
2,2,118363,115,128,0,0,0,55000.0,0.0,1,9358.0,9671.0,0.967635,32,46,0.695652
3,3,131167,115,7860,0,3,0,19000.0,0.0,1,10296.0,10770.0,0.955989,32,46,0.695652
4,4,137965,115,7922,0,4,1,11000.0,0.0,1,9147.0,9566.0,0.956199,32,46,0.695652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49999995,49999995,1445072,1059615240,7217,0,15,1,34500.0,0.0,0,39306.0,78605.0,0.500045,12,30,0.400000
49999996,49999996,1445072,1059615240,7219,0,15,0,34500.0,0.0,0,21251.0,78605.0,0.270352,12,30,0.400000
49999997,49999997,0,1059622238,7900,0,0,1,,,0,73175.0,88703.0,0.824944,1,3,0.333333
49999998,49999998,25373,1059622238,7876,0,1,2,32000.0,0.0,0,39039.0,93240.0,0.418694,1,3,0.333333


In [5]:
# Read lectures.csv from the same competition input directory as train.csv,
# then display the frame.
lecture_df = pd.read_csv(os.path.join(base_dir,"input","riiid-test-answer-prediction","lectures.csv"))
lecture_df

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question
...,...,...,...,...
413,32535,8,5,solving question
414,32570,113,3,solving question
415,32604,24,6,concept
416,32625,142,2,concept
