Versions:

- Version 1: 0.420  
  Always predict `correct` = 1.

- Version 6:  0.659
  Every question is predicted with the fixed per-question default value proposed by @CPMP.

- Version 7:  0.640
  Same as Version 6, except that the prediction for Q13 is replaced by the model "q13_56_logreg_power_pipeline".

- Version 8: 
  Same as Version 6, except that the prediction for Q15 is replaced by the model "q15_9_gaussiannb_pipeline".

## Prepare user script

In [1]:
# import os
# WORK_DIR = os.getcwd()

# %cd "/kaggle/input/kaggle-psp-training-q13-56-logreg-power/kaggle-psp/kaggle_psp/feature_extraction/"

# !python setup.py build_ext --inplace

# os.chdir(WORK_DIR)

In [2]:
import sys
# Dataset directory added to the import path (presumably the one that holds
# the `kaggle_psp` package imported further down — confirm against the dataset).
SCRIPT_DIR = "/kaggle/input/kaggle-psp-training-q13-56-logreg-power/kaggle-psp"
# Guarded so that re-running the cell does not append the same entry twice.
if SCRIPT_DIR not in sys.path:
    sys.path.append(SCRIPT_DIR)

## Import libraries

In [3]:
!dir /kaggle/input/

kaggle-psp-features
kaggle-psp-training-q13-56-logreg-power
kaggle-psp-training-q15-9-gaussiannb
predict-student-performance-from-game-play


In [4]:
%load_ext Cython

In [5]:
import json
import pickle
from typing import Tuple

try:
    import jo_wilder
except ImportError:
    import jo_wilder_310 as jo_wilder
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer

from kaggle_psp.base import get_available_levels_from, get_level_group_from, get_q_from
from kaggle_psp.feature_extraction import (
    TimeDiffFeatureGenerator, RoomMovementCountFeatureGenerator, ElapsedTimeFeatureGenerator,
    LogCountFeatureGenerator, FlagCountFeatureGenerator
)



In [6]:
# Categorical log columns for which a `mapping_<name>.json` file
# (category -> integer) is loaded further down.
CATEGORIES = (
    "text_fqid",
    "room_fqid",
    "event_name",
)

In [7]:
def order_dataframes(tow_dataframes: Tuple[pd.DataFrame, pd.DataFrame]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return the pair as (`sample_submission`, `test`), whatever order it arrives in.

    Parameters
    ----------
    tow_dataframes
        Two data frames: the sample submission and the test frame, in either
        order. (The parameter name is a historical misspelling of "two"; it is
        kept unchanged so existing callers keep working.)

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        `(sample_submission, test)`. The frame that has a `correct` column is
        treated as the sample submission. If the first frame has no such
        column, the pair is simply swapped.
    """
    # Read from the parameter itself. The previous body referenced an
    # undefined local name and only worked when a global of that name existed.
    first, second = tow_dataframes[0], tow_dataframes[1]

    # A column named `correct` marks the sample submission.
    if 'correct' in first:
        return first, second
    return second, first

In [8]:
def get_default_submission(sample_submission: pd.DataFrame) -> pd.DataFrame:
    """Overwrite `correct` with a fixed default value for each question.

    Rows are selected through the `question` column ('q1' ... 'q18'); rows
    carrying any other question label keep their current value. The frame is
    modified in place and the same object is returned.

    Default values from @CPMP
    https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/396970
    """
    # Per-question default answer, applied in the order q1 ... q18.
    default_by_question = {
        'q1': 1, 'q2': 1, 'q3': 1, 'q4': 1, 'q5': 0, 'q6': 1,
        'q7': 1, 'q8': 0, 'q9': 1, 'q10': 0, 'q11': 1, 'q12': 1,
        'q13': 0, 'q14': 1, 'q15': 0, 'q16': 1, 'q17': 1, 'q18': 1,
    }
    for question, default_value in default_by_question.items():
        is_question = sample_submission.question == question
        sample_submission.loc[is_question, 'correct'] = default_value
    return sample_submission

In [9]:
# Load the pickled model that overrides the default prediction for q15.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted dataset.
with open("/kaggle/input/kaggle-psp-training-q15-9-gaussiannb/q15_9_gaussiannb_pipeline.pkl", "rb") as model_file:
    model_q15 = pickle.load(model_file)
model_q15

In [10]:
# Column name -> (category -> integer) mapping, read from one JSON file per column.
mappings = {}
for category in CATEGORIES:
    mapping_path = f"/kaggle/input/kaggle-psp-features/mapping_{category}.json"
    with open(mapping_path, "r") as mapping_file:
        mappings[category] = json.load(mapping_file)

In [11]:
# Initialize the feature engineering utilities. In the inference loop each of
# them receives every test batch through `upsert` and is read through `query`.

# One FlagCountFeatureGenerator per flag column, created in this order.
flag_count_fullscreen, flag_count_hq, flag_count_music = (
    FlagCountFeatureGenerator(col=flag_column)
    for flag_column in ("fullscreen", "hq", "music")
)

# Log-count generators are configured from the `level`, `room_fqid` and
# category columns of a resource CSV.
levels_and_room_fqids_and_event_names = pd.read_csv(
    "/kaggle/input/kaggle-psp-features/resource_levels_and_room_fqids_and_event_names.csv"
)
log_count_event_name = LogCountFeatureGenerator(
    col="event_name",
    levels=levels_and_room_fqids_and_event_names["level"].tolist(),
    room_fqids=levels_and_room_fqids_and_event_names["room_fqid"].tolist(),
    categories=levels_and_room_fqids_and_event_names["event_name"].tolist(),
)

levels_and_room_fqids_and_text_fqids = pd.read_csv(
    "/kaggle/input/kaggle-psp-features/resource_levels_and_room_fqids_and_text_fqids.csv"
)
log_count_text_fqid = LogCountFeatureGenerator(
    col="text_fqid",
    levels=levels_and_room_fqids_and_text_fqids["level"].tolist(),
    room_fqids=levels_and_room_fqids_and_text_fqids["room_fqid"].tolist(),
    categories=levels_and_room_fqids_and_text_fqids["text_fqid"].tolist(),
)

# Generators that take no configuration.
room_movement = RoomMovementCountFeatureGenerator()
elapsed_time = ElapsedTimeFeatureGenerator()
time_diff = TimeDiffFeatureGenerator()

In [12]:
# Create the competition API environment. `iter_test` yields the batches
# consumed by the inference loop below; predictions for each batch are sent
# back through `env.predict`.
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [13]:
# Inference loop: each iteration receives one (sample_submission, test) pair,
# in unspecified order, and must answer it with `env.predict`.
for two_dataframes in iter_test:
    sample_submission, test = order_dataframes(two_dataframes)
    sample_submission["question"] = get_q_from(sample_submission["session_id"])

    # Start from the per-question defaults; model predictions override them below.
    sample_submission = get_default_submission(sample_submission)

    # Preprocess: encode the categorical columns with the stored mappings.
    # Values without a mapping become -1 for room_fqid / event_name and the
    # code stored under the "NaN" key for text_fqid.
    fill_values = {
        "room_fqid": -1,
        "event_name": -1,
        "text_fqid": mappings["text_fqid"]["NaN"],
    }
    for column, fill_value in fill_values.items():
        test[column] = test[column].map(mappings[column]).fillna(fill_value)

    # Upsert every feature engineering utility's db with the given batch.
    # The tuple order is also the order in which features are queried.
    feature_generators = (
        flag_count_fullscreen,
        flag_count_hq,
        flag_count_music,
        log_count_event_name,
        log_count_text_fqid,
        room_movement,
        elapsed_time,
        time_diff,
    )
    for generator in feature_generators:
        generator.upsert(test)

    # Extract features: the batch's session and level group are read from its first row.
    first_row = test.iloc[0]
    session_id = first_row["session_id"]
    level_group = first_row["level_group"]
    X = {}
    for level in get_available_levels_from(level_group):
        for generator in feature_generators:
            X.update(generator.query(session_id, level))
    X = pd.DataFrame(X, index=[0])

    # Prediction
    questions = sample_submission["question"].unique()

    # - q13: the model override is disabled, so the default value is kept.

    # - q15: replace the default with the model's prediction.
    if "q15" in questions:
        prediction_q15 = model_q15.predict(X[model_q15.feature_names_in_])
        sample_submission.loc[sample_submission.question == 'q15', 'correct'] = prediction_q15[0]

    env.predict(sample_submission[["session_id", "correct"]])

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [14]:
sample_submission

Unnamed: 0,session_id,correct,question
0,20090312331414616_q14,1,q14
1,20090312331414616_q15,1,q15
2,20090312331414616_q16,1,q16
3,20090312331414616_q17,1,q17
4,20090312331414616_q18,1,q18


In [15]:
test

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312331414616,489,928004,2,undefined,13,,297.729307,-260.559972,656.0,482.0,,,,13,82.0,0.0,0.0,1.0,13-22
1,20090312331414616,490,928836,2,undefined,13,,459.877554,-262.914382,716.0,474.0,,,,13,82.0,0.0,0.0,1.0,13-22
2,20090312331414616,491,929968,2,undefined,13,,622.914322,-354.953570,731.0,531.0,,,,13,82.0,0.0,0.0,1.0,13-22
3,20090312331414616,492,930868,2,undefined,13,,779.643974,-258.638647,791.0,462.0,,,toentry,13,82.0,0.0,0.0,1.0,13-22
4,20090312331414616,493,931750,8,basic,13,,777.372366,-42.768409,773.0,331.0,,,,13,82.0,0.0,0.0,1.0,13-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,20090312331414616,1001,1581679,7,basic,22,,,,,,484.0,,tunic.wildlife,2,82.0,0.0,0.0,1.0,13-22
513,20090312331414616,1002,1583044,7,basic,22,,,,,,783.0,,tunic.capitol_2,2,82.0,0.0,0.0,1.0,13-22
514,20090312331414616,1003,1583410,8,undefined,22,,483.726363,-3.880047,456.0,332.0,,,tunic.capitol_2,2,82.0,0.0,0.0,1.0,13-22
515,20090312331414616,1004,1585841,2,undefined,22,,192.372139,38.216178,383.0,272.0,,,chap4_finale_c,18,82.0,0.0,0.0,1.0,13-22
