In [None]:
import os
import gc
import sys

import numpy as np
import pandas as pd
import polars as pl

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, f1_score

### DataLoader & Preprocessing

In [None]:
# Column dtypes handed to read_csv. String-like fields (and `page`) are loaded
# as pandas categoricals, coordinates and hover time as 32-bit floats, and the
# two counters as compact integers.
dtypes = {
    **dict.fromkeys(
        [
            'event_name', 'name', 'fqid', 'room_fqid', 'text_fqid',
            'page', 'text', 'fullscreen', 'hq', 'music', 'level_group',
        ],
        'category',
    ),
    **dict.fromkeys(
        [
            'room_coor_x', 'room_coor_y',
            'screen_coor_x', 'screen_coor_y',
            'hover_duration',
        ],
        np.float32,
    ),
    'elapsed_time': np.int32,
    'level': np.uint8,
}

train = pd.read_csv(
    '/kaggle/input/predict-student-performance-from-game-play/train.csv',
    dtype=dtypes,
)

In [None]:
# Row-level preprocessing expressions, applied to the raw events right after
# loading. Every difference is evaluated inside a (session_id, level) window.
window_keys = ['session_id', 'level']

columns = [
    # Time since the previous event in the window: a missing predecessor
    # becomes 0 and the value is limited to the range [0, 1e9].
    (pl.col('elapsed_time') - pl.col('elapsed_time').shift(1))
    .fill_null(0)
    .clip(0, 1e9)
    .over(window_keys)
    .alias('elapsed_time_diff'),

    # Absolute change of each coordinate relative to the previous event.
    # No alias is given, so each result is stored under its source column name.
    *[
        (pl.col(coord) - pl.col(coord).shift(1)).abs().over(window_keys)
        for coord in ('screen_coor_x', 'screen_coor_y', 'room_coor_x', 'room_coor_y')
    ],

    # Give missing identifiers an explicit placeholder value.
    pl.col('fqid').fill_null('fqid_None'),
    pl.col('text_fqid').fill_null('text_fqid_None'),
]

train = (
    pl.from_pandas(train)
    .drop(['fullscreen', 'hq', 'music'])
    .with_columns(columns)
)

# Distinct values seen in the training data; feature_engineer derives one
# family of aggregate features per value.
fqid_lists = list(train['fqid'].unique())
text_fqid_lists = list(train['text_fqid'].unique())
room_fqid_lists = list(train['room_fqid'].unique())
name_feature_lists = list(train['name'].unique())
event_name_feature_lists = list(train['event_name'].unique())
levels = list(train['level'].unique())
level_groups = list(train['level_group'].unique())

print(levels)


In [None]:
# One frame per question checkpoint, selected by its level_group label.
df1, df2, df3 = (
    train.filter(pl.col("level_group") == group)
    for group in ('0-4', '5-12', '13-22')
)

print(df1.shape, df2.shape, df3.shape)

# The combined frame is no longer needed once it has been split; release it.
del train
gc.collect()

print(df1.columns)

In [None]:
# Categorical event attributes: feature_engineer counts their distinct
# non-null values per session.
category = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# Columns summarised per session with std / mean / min / max / sum.
numeric = [
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
    'elapsed_time_diff',
]

### Feature Engineering

In [None]:
def elapsed_diff_agg(column, column_value_list, feature_suffix):
    """Build per-value aggregation expressions for one column.

    For every value in ``column_value_list`` the result contains a count of
    the rows where ``column`` equals that value, followed by the std, mean,
    max, min and sum of ``elapsed_time_diff`` over those rows.

    Parameters:
    column: Name of the column whose values select the rows
    column_value_list: The values of ``column`` to build features for
    feature_suffix: Suffix appended to every generated feature name

    Returns:
    A list of polars expressions, grouped by statistic (all counts first,
    then all std, mean, max, min and sum expressions).
    """
    expressions = [
        pl.col(column)
        .filter(pl.col(column) == value)
        .count()
        .alias(f"{value}_{column}_counts_{feature_suffix}")
        for value in column_value_list
    ]

    # The statistic name doubles as the expression method and the name fragment.
    for stat in ("std", "mean", "max", "min", "sum"):
        for value in column_value_list:
            selected = pl.col("elapsed_time_diff").filter(pl.col(column) == value)
            expressions.append(
                getattr(selected, stat)().alias(
                    f"{value}_{column}_elapsed_{stat}_{feature_suffix}"
                )
            )

    return expressions

def feature_engineer(x, grp, feature_suffix):
    """
    Aggregate event rows into one feature row per session and return a pandas DataFrame.

    The base features are the number of rows per session, the number of distinct
    non-null values of every column in `category`, and the std / mean / min / max / sum
    of every column in `numeric`. On top of that, elapsed_diff_agg adds per-value
    features for fqid, text_fqid, room_fqid, name, event_name, level and level_group,
    using the module-level value lists. For the '5-12' and '13-22' groups a few extra
    features measure the span (max - min) of elapsed_time and index over specific events.

    Parameters:
    x: The input events as a polars DataFrame
    grp: The level group of the data ('0-4', '5-12' or '13-22'); selects the extra group-specific features
    feature_suffix: Suffix appended to the generic feature names (the group-specific features are not suffixed)

    Returns:
    A pandas DataFrame with one row per session_id, sorted by session_id.
    """
    # Generic per-session statistics.
    aggs = [
        pl.col('index').count().alias(f"session_number_{feature_suffix}"),
        *[pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{feature_suffix}") for c in category],
        *[pl.col(c).std().alias(f"{c}_std_{feature_suffix}") for c in numeric],
        *[pl.col(c).mean().alias(f"{c}_mean_{feature_suffix}") for c in numeric],
        *[pl.col(c).min().alias(f"{c}_min_{feature_suffix}") for c in numeric],
        *[pl.col(c).max().alias(f"{c}_max_{feature_suffix}") for c in numeric],
        *[pl.col(c).sum().alias(f"{c}_sum_{feature_suffix}") for c in numeric],
    ]

    # Count and elapsed-time statistics for every known value of each column.
    aggs.extend(elapsed_diff_agg('fqid', fqid_lists, feature_suffix))
    aggs.extend(elapsed_diff_agg('text_fqid', text_fqid_lists, feature_suffix))
    aggs.extend(elapsed_diff_agg('room_fqid', room_fqid_lists, feature_suffix))
    aggs.extend(elapsed_diff_agg('name', name_feature_lists, feature_suffix))
    aggs.extend(elapsed_diff_agg('event_name', event_name_feature_lists, feature_suffix))
    aggs.extend(elapsed_diff_agg('level', levels, feature_suffix))
    aggs.extend(elapsed_diff_agg('level_group', level_groups, feature_suffix))

    df = x.groupby(['session_id'], maintain_order=True).agg(aggs).sort('session_id')
    

    if grp == '5-12':
        # Span of elapsed_time / index between the selected events of each session.
        # NOTE(review): unlike the '13-22' branch below, these lambdas have no guard for
        # an empty selection - confirm how a session without matching events behaves.
        aggs = [
            pl.col("elapsed_time").filter((pl.col("text")=="Here's the log book.") |(pl.col("fqid")=='logbook.page.bingo')).apply(lambda s: s.max()-s.min()).alias("logbook_bingo_duration"),

            pl.col("index").filter((pl.col("text") == "Here's the log book.") | (pl.col("fqid") == 'logbook.page.bingo')).apply(lambda s: s.max() - s.min()).alias("logbook_bingo_indexCount"),

            pl.col("elapsed_time").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'reader')) | (pl.col("fqid") == "reader.paper2.bingo")).apply(lambda s: s.max() - s.min()).alias("reader_bingo_duration"),

            pl.col("index").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'reader')) | (pl.col("fqid") == "reader.paper2.bingo")).apply(lambda s: s.max() - s.min()).alias("reader_bingo_indexCount"),

            pl.col("elapsed_time").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'journals')) | (pl.col("fqid") == "journals.pic_2.bingo")).apply(lambda s: s.max() - s.min()).alias("journals_bingo_duration"),

            pl.col("index").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'journals')) | (pl.col("fqid") == "journals.pic_2.bingo")).apply(lambda s: s.max() - s.min()).alias("journals_bingo_indexCount"),
        ]
        tmp = x.groupby(["session_id"], maintain_order=True).agg(aggs).sort("session_id")
        df = df.join(tmp, on="session_id", how='left')

    if grp == '13-22':
        # Same kind of span features for this group's events; the lambdas return 0
        # when the selection is empty.
        aggs = [
            pl.col("elapsed_time").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'reader_flag')) | (pl.col("fqid") == "tunic.library.microfiche.reader_flag.paper2.bingo")).apply(lambda s: s.max() - s.min() if s.len() > 0 else 0).alias("reader_flag_duration"),

            pl.col("index").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'reader_flag')) | (pl.col("fqid") == "tunic.library.microfiche.reader_flag.paper2.bingo")).apply(lambda s: s.max() - s.min() if s.len() > 0 else 0).alias("reader_flag_indexCount"),

            pl.col("elapsed_time").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'journals_flag')) | (pl.col("fqid") == "journals_flag.pic_0.bingo")).apply(lambda s: s.max() - s.min() if s.len() > 0 else 0).alias("journalsFlag_bingo_duration"),

            pl.col("index").filter(((pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == 'journals_flag')) | (pl.col("fqid") == "journals_flag.pic_0.bingo")).apply(lambda s: s.max() - s.min() if s.len() > 0 else 0).alias("journalsFlag_bingo_indexCount")
        ]
        tmp = x.groupby(["session_id"], maintain_order=True).agg(aggs).sort("session_id")
        df = df.join(tmp, on="session_id", how='left')

    return df.to_pandas()

In [None]:
# Aggregate each checkpoint's events into one feature row per session.
#
# `session_id` stays a regular column on purpose: the training loop reads it
# with df['session_id'] to align the labels. The former
# `dfN.set_index('session_id')` statements discarded their result (set_index
# returns a new frame), so they had no effect and have been removed.
df1 = feature_engineer(df1, grp='0-4', feature_suffix='xgboost')
print(df1.shape)

df2 = feature_engineer(df2, grp='5-12', feature_suffix='xgboost')
print(df2.shape)

df3 = feature_engineer(df3, grp='13-22', feature_suffix='xgboost')
print(df3.shape)

In [None]:
def _null_ratio(frame):
    """Return the fraction of missing values per column of ``frame``, largest first."""
    return frame.isnull().sum().sort_values(ascending=False) / len(frame)


def _constant_columns(frame):
    """Return the columns of ``frame`` that hold exactly one distinct non-null value."""
    return [col for col in frame.columns if frame[col].nunique() == 1]


# Ratio of missing values in each column, computed against each frame's OWN
# row count (previously df2 and df3 were divided by len(df1)).
null1 = _null_ratio(df1)
null2 = _null_ratio(df2)
null3 = _null_ratio(df3)

# Columns that are missing in more than 90% of the sessions.
drop1 = list(null1[null1 > 0.9].index)
drop2 = list(null2[null2 > 0.9].index)
drop3 = list(null3[null3 > 0.9].index)
print(len(drop1), len(drop2), len(drop3))

# Columns with a single distinct value carry no signal; drop them as well.
for label, frame, drop in (('df1', df1, drop1), ('df2', df2, drop2), ('df3', df3, drop3)):
    for col in _constant_columns(frame):
        print(col)
        drop.append(col)
    print(f"*********{label} DONE*********")

# Final feature lists: every remaining column except level_group.
# NOTE(review): session_id is a column of these frames and is not excluded
# here, so it ends up in the feature lists - confirm that this is intended.
excluded1 = set(drop1) | {'level_group'}
excluded2 = set(drop2) | {'level_group'}
excluded3 = set(drop3) | {'level_group'}

features1 = [c for c in df1.columns if c not in excluded1]
features2 = [c for c in df2.columns if c not in excluded2]
features3 = [c for c in df3.columns if c not in excluded3]

### Hyperparameters

In [None]:
# Number of boosting rounds per question; entry k belongs to question k + 1.
estimators_xgb = [
    498, 448, 378, 364, 405, 495,
    456, 249, 384, 405, 356, 262,
    484, 381, 392, 248, 248, 345,
]

# Settings shared by every per-question classifier; n_estimators is filled in
# separately for each question.
xgb_params = dict(
    booster='gbtree',
    tree_method='hist',
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.02,
    alpha=8,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.5,
    seed=2023,
)


def _session_of(label_id):
    """Return the numeric session part that precedes the first underscore."""
    return int(label_id.split('_')[0])


def _question_of(label_id):
    """Return the question number from the last underscore-separated part ('q<N>')."""
    return int(label_id.split('_')[-1][1:])


targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

# Split the combined label id into a numeric session key and a question number.
targets['session'] = targets.session_id.apply(_session_of)
targets['q'] = targets.session_id.apply(_question_of)

In [None]:
# Train one XGBoost classifier per question (1..18). Each question belongs to a
# checkpoint, and its model also receives the predictions already produced for
# the earlier questions of that checkpoint (the `pre<k>` columns).
for t in range(1, 19):
    # Checkpoint label, feature frame, base features and first question number.
    if t <= 3:
        grp, df, base_features, first_question = '0-4', df1, features1, 1
    elif t <= 13:
        grp, df, base_features, first_question = '5-12', df2, features2, 4
    else:
        grp, df, base_features, first_question = '13-22', df3, features3, 14
    features = base_features + [f'pre{i}' for i in range(first_question, t)]

    # Put the labels of question t in the same row order as the feature frame.
    train_users = df['session_id']
    labels_for_question = targets.loc[targets.q == t].set_index('session')
    train_y = labels_for_question.loc[train_users]

    xgb_params['n_estimators'] = estimators_xgb[t-1]
    clf = XGBClassifier(**xgb_params)

    # The same matrix is used for fitting and for the in-sample prediction.
    design_matrix = df[features].astype('float32')
    clf.fit(design_matrix, train_y['correct'], verbose=0)

    # Stored on the frame so that later questions can use it as a feature.
    df[f'pre{t}'] = clf.predict_proba(design_matrix)[:, 1]
    clf.save_model(f'XGB_question{t}.xgb')

    print(f'model XGB saved for question {t}')

In [None]:
# Inference environment: `iter_test` is iterated below and yields
# (test, sample_submission) pairs; predictions are handed back via env.predict.
# NOTE(review): jo_wilder is presumably the competition's Kaggle API module,
# available only inside the competition runtime - confirm.
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
# Half-open question range [first, last) answered at each checkpoint.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}
# Base feature list of each checkpoint, keyed like `limits`.
features_by_group = {'0-4': features1, '5-12': features2, '13-22': features3}
# A question is predicted as correct when its probability exceeds this value.
threshold = 0.625
# Saved models are read from disk once and then reused for every session.
loaded_models = {}

for (test, sample_submission) in iter_test:
    test = test.sort_values(by = 'index')
    session_id = test.session_id.values[0]
    grp = test['level_group'].values[0]

    a, b = limits[grp]
    features = features_by_group[grp]

    # Same preprocessing and feature engineering as for the training data.
    test = pl.from_pandas(test).drop(['fullscreen', 'hq', 'music']).with_columns(columns)
    test = feature_engineer(test, grp, feature_suffix='xgboost')
    features_pre = []

    for t in range(a, b):
        clf = loaded_models.get(t)
        if clf is None:
            clf = XGBClassifier()
            clf.load_model(f"/kaggle/working/XGB_question{t}.xgb")
            loaded_models[t] = clf

        # Exact suffix match: a substring test for 'q1' would also select
        # 'q10'..'q18' if those rows were ever part of the same batch.
        mask = sample_submission.session_id.str.endswith(f'_q{t}')
        p = clf.predict_proba(test[features+features_pre].astype('float32'))[:, 1]

        # Feed this prediction to the following questions of the checkpoint,
        # mirroring the `pre<k>` features used during training.
        test[f'pre{t}'] = p
        sample_submission.loc[mask, 'correct'] = int(p.item() > threshold)
        features_pre.append(f'pre{t}')

    env.predict(sample_submission)

In [None]:
# Show the last batch handled by the inference loop as a quick sanity check;
# `sample_submission` is only defined if that loop ran at least once.
print(sample_submission)