## 📌 1등팀 Feature Engineering

In [None]:
# Python black formatter 적용
%load_ext nb_black
%load_ext lab_black

import pandas as pd
import numpy as np

import math
import time

from tqdm import tqdm

In [None]:
# Explicit (narrow) dtypes for the listed columns of the data frame
dtype = {"userID": "int16", "answerCode": "int8", "KnowledgeTag": "int16"}

# Adjust this path to match your environment!
DATA_PATH = "/opt/ml/input/data/FE_total_data.csv"

# Parse the CSV once and order the rows by user, then by timestamp.
df = pd.read_csv(DATA_PATH, dtype=dtype, parse_dates=["Timestamp"])
df = df.sort_values(by=["userID", "Timestamp"]).reset_index(drop=True)

# `train` starts from exactly the same content as `df`; take an independent
# (deep) copy instead of reading and sorting the same file a second time.
train = df.copy()

In [None]:
# Problem number = the last three characters of the assessment item ID, as an integer
df["problem_number"] = df["assessmentItemID"].str[-3:].astype("int64")

In [None]:
# Mean and sum of `answerCode` per test, knowledge tag, assessment item and
# problem number. Each table is indexed by its grouping key and is merged
# into the feature frame later on.
correct_t = df.groupby(["testId"])["answerCode"].agg(
    test_mean="mean", test_sum="sum"
)
correct_k = df.groupby(["KnowledgeTag"])["answerCode"].agg(
    tag_mean="mean", tag_sum="sum"
)
correct_a = df.groupby(["assessmentItemID"])["answerCode"].agg(
    ass_mean="mean", ass_sum="sum"
)
correct_p = df.groupby(["problem_number"])["answerCode"].agg(
    prb_mean="mean", prb_sum="sum"
)

In [None]:
def add_last_problem(df):
    """Add a ``last_problem`` column marking the end of each ``testId`` run.

    A row gets -1 when the row that follows it (in the frame's current row
    order) has a different ``testId``; every other row gets 0. The final row
    of the frame has no successor and therefore always gets 0.

    Args:
        df: Frame with a ``testId`` column, already in the desired row order.

    Returns:
        The same ``df`` object, with ``last_problem`` added in place.
    """
    # Work positionally: the result must not depend on the index labels
    # (a filtered or re-sorted frame need not contain the label 0), and an
    # empty frame should simply produce an empty column.
    test_ids = df["testId"].to_numpy()
    last_problem = np.zeros(len(test_ids), dtype="int64")
    if len(test_ids) > 1:
        run_ends = test_ids[:-1] != test_ids[1:]
        last_problem[:-1] = np.where(run_ends, -1, 0)
    df["last_problem"] = last_problem
    return df

In [None]:
def is_previous_ordered(row, delta_thres=1):
    """Tell whether this problem number directly follows the previous one.

    Args:
        row: Record exposing ``problem_number``, ``q_num_prev`` and ``delta``
            as attributes (for example a row handed over by
            ``DataFrame.apply(axis=1)``). ``delta`` is compared against a
            ``pd.Timedelta``; presumably it is the time since the previous
            interaction -- confirm against the caller.
        delta_thres: Largest ``delta``, in hours, for which the previous row
            is still taken into account. Defaults to 1.

    Returns:
        -1 if ``delta`` is missing or longer than ``delta_thres`` hours,
        1 if ``problem_number == q_num_prev + 1``, otherwise 0.
    """
    q_num = row.problem_number
    q_num_prev = row.q_num_prev
    delta = row.delta

    if pd.isnull(delta) or delta > pd.Timedelta(hours=delta_thres):
        return -1
    return 1 if q_num == q_num_prev + 1 else 0

In [None]:
def is_previous_decreasing(row, delta_thres=1):
    """Tell whether this problem number is lower than the previous one.

    Args:
        row: Record exposing ``problem_number``, ``q_num_prev`` and ``delta``
            as attributes (for example a row handed over by
            ``DataFrame.apply(axis=1)``). ``delta`` is compared against a
            ``pd.Timedelta``; presumably it is the time since the previous
            interaction -- confirm against the caller.
        delta_thres: Largest ``delta``, in hours, for which the previous row
            is still taken into account. Defaults to 1.

    Returns:
        -1 if ``delta`` is missing or longer than ``delta_thres`` hours,
        1 if ``problem_number < q_num_prev``, otherwise 0.
    """
    q_num = row.problem_number
    q_num_prev = row.q_num_prev
    delta = row.delta

    if pd.isnull(delta) or delta > pd.Timedelta(hours=delta_thres):
        return -1
    return 1 if q_num < q_num_prev else 0

In [None]:
def is_probably_easy(row, delta_thres=1):
    """Classify a row by the ordered/decreasing flags of it and its predecessor.

    Args:
        row: Record exposing ``delta``, ``is_previous_ordered``,
            ``is_previous_decreasing``, ``is_prev_ord_shift`` and
            ``is_prev_dec_shift`` as attributes (for example a row handed
            over by ``DataFrame.apply(axis=1)``). The ``*_shift`` values are
            presumably the same flags taken from the preceding row -- confirm
            against the caller.
        delta_thres: Largest ``delta``, in hours, for which the row is
            classified at all. Defaults to 1.

    Returns:
        -1 if ``delta`` is missing or longer than ``delta_thres`` hours,
        1 if the four flags match one of the known patterns, otherwise 0.
    """
    delta = row.delta

    is_prev_ord = row.is_previous_ordered
    is_prev_dec = row.is_previous_decreasing
    is_prev_ord_shift = row.is_prev_ord_shift
    is_prev_dec_shift = row.is_prev_dec_shift

    if pd.isnull(delta) or delta > pd.Timedelta(hours=delta_thres):
        return -1

    # NaN never compares equal to NaN, so a tuple holding NaN would only match
    # by object identity. Map missing values to None to make the match
    # deterministic regardless of where the NaN came from.
    case = tuple(
        None if pd.isnull(flag) else flag
        for flag in (is_prev_ord_shift, is_prev_dec_shift, is_prev_ord, is_prev_dec)
    )

    # (is_prev_ord_shift, is_prev_dec_shift, is_prev_ord, is_prev_dec)
    probably_easy_patterns = (
        (None, None, -1, -1),
        (-1, -1, 1, 0),
        (1, 0, 1, 0),
        (1, 0, 0, 0),
    )
    return 1 if case in probably_easy_patterns else 0

In [None]:
# ELO-style rating used to estimate difficulty
def ELO_function(df):
    """Attach an ELO-style probability of a correct answer to every row.

    One pass over the rows (in their current order) updates a ``theta`` per
    ``userID`` and a ``beta`` per ``assessmentItemID``. Afterwards each row
    receives ``sigmoid(theta - beta)`` computed from the *final* estimates.

    Args:
        df: Frame with ``userID``, ``assessmentItemID`` and ``answerCode``
            columns.

    Returns:
        The same ``df`` object; a constant ``left_asymptote`` column (0) and
        the ``elo_prob`` column are added in place.
    """

    def _sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def _p_correct(theta, beta, left_asymptote):
        # Logistic curve whose lower bound is `left_asymptote`
        return left_asymptote + (1 - left_asymptote) * _sigmoid(theta - beta)

    def _theta_rate(n_seen):
        # Decays with the number of answers seen, but never below 0.04
        return max(0.3 / (1 + 0.01 * n_seen), 0.04)

    def _beta_rate(n_seen):
        return 1 / (1 + 0.05 * n_seen)

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        # Every student and every item starts at 0 with no answers recorded
        items = {
            key: {"beta": 0, "nb_answers": 0}
            for key in np.unique(answers_df[granularity_feature_name])
        }
        students = {
            uid: {"theta": 0, "nb_answers": 0}
            for uid in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...")

        rows = zip(
            answers_df.userID.values,
            answers_df[granularity_feature_name].values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for uid, key, asymptote, outcome in tqdm(rows):
            student = students[uid]
            item = items[key]
            theta = student["theta"]
            beta = item["beta"]

            # Both updates use the pre-update theta/beta, so the prediction
            # error is the same for the item and the student.
            error = outcome - _p_correct(theta, beta, asymptote)
            item["beta"] = beta - _beta_rate(item["nb_answers"]) * error
            student["theta"] = theta + _theta_rate(student["nb_answers"]) * error

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return students, items

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    # Score every row with the final estimates (no left asymptote here)
    df["elo_prob"] = [
        _sigmoid(student_parameters[uid]["theta"] - item_parameters[item_id]["beta"])
        for uid, item_id in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [None]:
def feature_engineering(df):
    """Build the full feature set for the interaction log ``df``.

    Relies on the sibling functions ``add_last_problem`` and ``ELO_function``
    and on the module-level tables ``correct_t``, ``correct_k``, ``correct_a``
    and ``correct_p``, all of which must exist before this is called.

    Args:
        df: Frame with at least ``userID``, ``Timestamp``, ``testId``,
            ``assessmentItemID``, ``KnowledgeTag`` and ``answerCode``.
            It is sorted in place, and the columns added by
            ``add_last_problem`` / ``ELO_function`` are written to it.

    Returns:
        A new frame with all feature columns; ``testId`` is replaced by an
        integer encoding.
    """
    print("-" * 20, "Feature Engineering Start", "-" * 20)
    start_time = time.time()
    # Order rows per user by time so the sequence features see the
    # interactions chronologically (this sorts the caller's frame in place).
    df.sort_values(by=["userID", "Timestamp"], inplace=True)
    df = add_last_problem(df)
    # Add the ELO probability
    df = ELO_function(df)

    df["hour"] = df["Timestamp"].dt.hour
    df["dow"] = df["Timestamp"].dt.dayofweek

    # Solving time: seconds since the same user's previous row
    # (0 for each user's first row).
    diff = (
        df.loc[:, ["userID", "Timestamp"]]
        .groupby("userID")
        .diff()
        .fillna(pd.Timedelta(seconds=0))
    )
    diff = diff["Timestamp"].apply(lambda x: x.total_seconds())
    df["elapsed"] = diff
    # Gaps outside [0, 650) seconds are replaced by 0
    df["elapsed"] = df["elapsed"].apply(lambda x: x if x < 650 and x >= 0 else 0)

    df["grade"] = df["testId"].apply(lambda x: int(x[1:4]) // 10)
    df["mid"] = df["testId"].apply(lambda x: int(x[-3:]))
    df["problem_number"] = df["assessmentItemID"].apply(lambda x: int(x[-3:]))

    # Mean / sum of answerCode per hour of day and per day of week
    correct_h = df.groupby(["hour"])["answerCode"].agg(["mean", "sum"])
    correct_h.columns = ["hour_mean", "hour_sum"]
    correct_d = df.groupby(["dow"])["answerCode"].agg(["mean", "sum"])
    correct_d.columns = ["dow_mean", "dow_sum"]

    # Attach the module-level and the local correctness tables
    df = pd.merge(df, correct_t, on=["testId"], how="left")
    df = pd.merge(df, correct_k, on=["KnowledgeTag"], how="left")
    df = pd.merge(df, correct_a, on=["assessmentItemID"], how="left")
    df = pd.merge(df, correct_p, on=["problem_number"], how="left")
    df = pd.merge(df, correct_h, on=["hour"], how="left")
    df = pd.merge(df, correct_d, on=["dow"], how="left")

    # Rows answered correctly (answerCode == 1) / incorrectly (answerCode == 0)
    o_df = df[df["answerCode"] == 1]
    x_df = df[df["answerCode"] == 0]

    # Mean elapsed time per knowledge tag: all rows, correct only, wrong only
    elp_k = df.groupby(["KnowledgeTag"])["elapsed"].agg("mean").reset_index()
    elp_k.columns = ["KnowledgeTag", "tag_elp"]
    elp_k_o = o_df.groupby(["KnowledgeTag"])["elapsed"].agg("mean").reset_index()
    elp_k_o.columns = ["KnowledgeTag", "tag_elp_o"]
    elp_k_x = x_df.groupby(["KnowledgeTag"])["elapsed"].agg("mean").reset_index()
    elp_k_x.columns = ["KnowledgeTag", "tag_elp_x"]

    df = pd.merge(df, elp_k, on=["KnowledgeTag"], how="left")
    df = pd.merge(df, elp_k_o, on=["KnowledgeTag"], how="left")
    df = pd.merge(df, elp_k_x, on=["KnowledgeTag"], how="left")

    # Same three averages per assessment item
    ass_k = df.groupby(["assessmentItemID"])["elapsed"].agg("mean").reset_index()
    ass_k.columns = ["assessmentItemID", "ass_elp"]
    ass_k_o = o_df.groupby(["assessmentItemID"])["elapsed"].agg("mean").reset_index()
    ass_k_o.columns = ["assessmentItemID", "ass_elp_o"]
    ass_k_x = x_df.groupby(["assessmentItemID"])["elapsed"].agg("mean").reset_index()
    ass_k_x.columns = ["assessmentItemID", "ass_elp_x"]

    df = pd.merge(df, ass_k, on=["assessmentItemID"], how="left")
    df = pd.merge(df, ass_k_o, on=["assessmentItemID"], how="left")
    df = pd.merge(df, ass_k_x, on=["assessmentItemID"], how="left")

    # Same three averages per problem number
    prb_k = df.groupby(["problem_number"])["elapsed"].agg("mean").reset_index()
    prb_k.columns = ["problem_number", "prb_elp"]
    prb_k_o = o_df.groupby(["problem_number"])["elapsed"].agg("mean").reset_index()
    prb_k_o.columns = ["problem_number", "prb_elp_o"]
    prb_k_x = x_df.groupby(["problem_number"])["elapsed"].agg("mean").reset_index()
    prb_k_x.columns = ["problem_number", "prb_elp_x"]

    df = pd.merge(df, prb_k, on=["problem_number"], how="left")
    df = pd.merge(df, prb_k_o, on=["problem_number"], how="left")
    df = pd.merge(df, prb_k_x, on=["problem_number"], how="left")

    # Running per-user totals that exclude the current row
    df["user_correct_answer"] = (
        df.groupby("userID")["answerCode"]
        .transform(lambda x: x.cumsum().shift(1))
        .fillna(0)
    )
    df["user_total_answer"] = df.groupby("userID")["answerCode"].cumcount()
    df["user_acc"] = (df["user_correct_answer"] / df["user_total_answer"]).fillna(0)
    # The same running totals per (user, grade)
    df["Grade_o"] = (
        df.groupby(["userID", "grade"])["answerCode"]
        .transform(lambda x: x.cumsum().shift(1))
        .fillna(0)
    )
    df["GradeCount"] = df.groupby(["userID", "grade"]).cumcount()
    df["GradeAcc"] = (df["Grade_o"] / df["GradeCount"]).fillna(0)
    # Cumulative elapsed time per (user, grade); this one includes the current row
    df["GradeElp"] = (
        df.groupby(["userID", "grade"])["elapsed"]
        .transform(lambda x: x.cumsum())
        .fillna(0)
    )
    # Divide by the count, using 1 where the count is 0
    df["GradeMElp"] = df["GradeElp"] / [
        v if v != 0 else 1 for v in df["GradeCount"].values
    ]

    # Per test: highest problem number and number of distinct knowledge tags
    f = lambda x: len(set(x))
    test = df.groupby(["testId"]).agg({"problem_number": "max", "KnowledgeTag": f})
    test.reset_index(inplace=True)

    test.columns = ["testId", "problem_count", "tag_count"]

    df = pd.merge(df, test, on="testId", how="left")

    # Time since the previous row of the same user and grade
    gdf = df[["userID", "testId", "problem_number", "grade", "Timestamp"]].sort_values(
        by=["userID", "grade", "Timestamp"]
    )
    gdf["buserID"] = gdf["userID"] != gdf["userID"].shift(1)
    gdf["bgrade"] = gdf["grade"] != gdf["grade"].shift(1)
    # `first` is 0 on the first row of each (user, grade) run and 1 elsewhere,
    # so multiplying by it zeroes the gap measured across a run boundary.
    gdf["first"] = gdf[["buserID", "bgrade"]].any(axis=1).apply(lambda x: 1 - int(x))
    gdf["RepeatedTime"] = gdf["Timestamp"].diff().fillna(pd.Timedelta(seconds=0))
    gdf["RepeatedTime"] = (
        gdf["RepeatedTime"].apply(lambda x: x.total_seconds()) * gdf["first"]
    )
    # Assignment aligns on the index, so gdf's different row order is harmless
    df["RepeatedTime"] = gdf["RepeatedTime"].apply(lambda x: math.log(x + 1))

    # How many earlier rows of this user carry the same knowledge tag
    df["prior_KnowledgeTag_frequency"] = df.groupby(
        ["userID", "KnowledgeTag"]
    ).cumcount()

    df["problem_position"] = df["problem_number"] / df["problem_count"]

    # 0-based position of the row within its (user, test) group. Positions at
    # or beyond `problem_count` are treated as a repeated attempt. The flag is
    # taken from the raw position, before it is wrapped back, so that
    # `solve_order` and `retest` are derived from the same mask.
    attempt_idx = df.groupby(["userID", "testId"]).cumcount()
    is_retest = (attempt_idx >= df["problem_count"]).astype("int64")
    df["solve_order"] = attempt_idx - df["problem_count"] * is_retest + 1
    df["retest"] = is_retest

    # 1 where the solve order starts to deviate from the problem number,
    # i.e. this row deviates while the previous row did not.
    T = df["solve_order"] != df["problem_number"]
    TT = T.shift(1, fill_value=False)
    df["solved_disorder"] = (~TT & T).astype("int64")

    # Integer test id: characters 1-3 followed by the full three-character
    # suffix, so tests that differ only in the last two digits stay distinct.
    df["testId"] = df["testId"].apply(lambda x: int(x[1:4] + x[-3:]))

    print("-" * 20, "Feature Engineering End", "-" * 20)
    print(f"Feature Engineering에 걸린 시간 : {time.time() - start_time}s")
    return df

In [None]:
# Run the feature pipeline on the training frame, then preview the first rows
train = feature_engineering(train)
train.head()

In [None]:
# Null values: for now, simply handle them with fillna (replace by 0)
train = train.fillna(0)

In [None]:
# Write the engineered training frame to CSV without the row index
train.to_csv("/opt/ml/input/data/train_after.csv", index=False)