<div style="text-align: center; background-color: #0A6EBD; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
 Project - Predict Student Performance from Game Play @ FIT-HCMUS, VNU-HCM 📌
</div>

***

# Author

ID: 21127743 \
Name: Tran Thai Toan

***

<div style="text-align: center; background-color: #5A96E3; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
 Stage 01 - Data Discovery 📌
</div>

# Importing libraries

Because the input is large, we declare explicit dtypes for its columns to reduce memory usage.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import tensorflow as tf
import matplotlib.patches as mpatches
from itertools import cycle
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Explicit column dtypes handed to pd.read_csv: compact numeric types for the
# measurements and pandas categoricals for the remaining listed columns.
dtypes = {
    'elapsed_time': np.int32,
    'level': np.uint8,
    **dict.fromkeys(
        ('room_coor_x', 'room_coor_y',
         'screen_coor_x', 'screen_coor_y',
         'hover_duration'),
        np.float32,
    ),
    **dict.fromkeys(
        ('event_name', 'name', 'text',
         'fqid', 'room_fqid', 'text_fqid',
         'fullscreen', 'hq', 'music',
         'level_group'),
        'category',
    ),
}

# Reading files

Read the training file in chunks, then concatenate the chunks into a single DataFrame.

In [None]:
file_path = "/kaggle/input/predict-student-performance-from-game-play/train.csv"
chunksize = 100000

# Parse the CSV `chunksize` rows at a time, keep every piece, then stitch the
# pieces together under one fresh 0..n-1 row index.
chunk_list = list(pd.read_csv(file_path, dtype=dtypes, chunksize=chunksize))
full_data = pd.concat(chunk_list, ignore_index=True)

In [None]:
# Preview the first three rows of the concatenated training frame.
full_data.head(3)

Reading labels

In [None]:
# Load the training labels; later cells read its `session_id` and `correct` columns.
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [None]:
# Preview the first three label rows.
labels.head(3)

In [None]:
# `session_id` has the form "<session>_q<question number>": split it once and
# store both halves as integer columns.
id_parts = labels.session_id.str.split('_')
labels['session'] = id_parts.str[0].astype('int64')
labels['q'] = id_parts.str[-1].str[1:].astype('int64')  # drop the leading "q"
labels.head()

# Discovering datasets

Train data

In [None]:
# Column names, dtypes, non-null counts and memory footprint of the event data.
full_data.info()

Label data

In [None]:
# Same summary for the label table (including the derived `session` and `q` columns).
labels.info()

In [None]:
# One panel per question (3 x 6 grid): how many label rows are marked correct
# versus incorrect.
fig, axes = plt.subplots(3, 6, figsize=(30, 20))
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.suptitle("\"Correct\" column values for each question", fontsize=30, y=0.94)

max_y = 25000  # shared y-limit so all 18 panels use the same scale
color_cycle = plt.cm.tab20.colors[:2]

for n, ax in enumerate(axes.flat, start=1):
    # value_counts() sorts by frequency, which would swap the bar order (and
    # therefore the colours) between questions. Reindexing pins "Correct" to
    # the first bar/colour and "Incorrect" to the second in every panel.
    plot_df = (
        labels.loc[labels.q == n, "correct"]
        .value_counts()
        .reindex([1, 0], fill_value=0)
    )
    plot_df.index = plot_df.index.map({1: "Correct", 0: "Incorrect"})

    bars = plot_df.plot(ax=ax, kind="bar", color=color_cycle)

    # Write the exact count just above each bar.
    for bar in bars.patches:
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                int(bar.get_height()), ha='center', va='bottom', fontsize=10)

    ax.set_title("Question " + str(n))
    ax.set_xlabel("")
    ax.set_ylabel("Count")
    ax.set_ylim(0, max_y)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

plt.show()


In [None]:
# Bar chart of how often each event name occurs in the training data.
event_counts = full_data['event_name'].value_counts()

# Walk the tab10 palette (wrapping around if needed) to colour each bar.
color_cycle = cycle(plt.cm.tab10.colors)
colors = [color for _, color in zip(event_counts.index, color_cycle)]

fig, ax = plt.subplots(figsize=(12, 6))
bars = event_counts.plot(kind='bar', color=colors, edgecolor='black', ax=ax)

# Annotate every bar with its count.
for bar in bars.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height + 1,
            int(height), ha='center', va='bottom', fontsize=10)

ax.set_title("Frequency of Event Names", fontsize=16)
ax.set_xlabel("Event Name", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Legend entries mirror the bars: same colour, same order.
patches = [mpatches.Patch(color=color, label=event)
           for color, event in zip(colors, event_counts.index)]
ax.legend(handles=patches, title="Event Names", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Side-by-side distributions of the three flag columns.
columns = ['fullscreen', 'hq', 'music']
flag_names = {'0': 'No', '1': 'Yes'}

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, col in zip(axes, columns):
    # sort_index() gives every panel the same bar (and colour) order;
    # value_counts() alone orders by frequency, which differs per column.
    value_counts = full_data[col].value_counts().sort_index()

    # Translate the flag values to No/Yes. Anything unexpected keeps its own
    # text instead of turning into a NaN tick label.
    unique_values = [flag_names.get(str(value), str(value)) for value in value_counts.index]

    # plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
    # pyplot.get_cmap accepts the same (name, lut) arguments.
    color_map = plt.get_cmap('tab20', len(unique_values))
    bar_colors = [color_map(j) for j in range(len(unique_values))]

    bars = ax.bar(unique_values, value_counts.values,
                  color=bar_colors,
                  edgecolor='black')

    # Annotate every bar with its count.
    for bar in bars:
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                int(bar.get_height()), ha='center', va='bottom', fontsize=10)

    ax.set_title(f"Distribution of {col}", fontsize=14)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel("Count", fontsize=12)
    ax.set_xticks(range(len(unique_values)))
    # Unrotated labels are centred under their bars.
    ax.set_xticklabels(unique_values, rotation=0, ha='center')

    patches = [mpatches.Patch(color=bar_colors[j], label=unique_values[j]) for j in range(len(unique_values))]
    ax.legend(handles=patches, title=f"{col} Values", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

fig.tight_layout()
plt.show()


In [None]:
# Number of event rows recorded in each level group, shown in game order
# rather than in value_counts()' frequency order.
level_group = full_data['level_group'].value_counts()
custom_order = ['0-4', '5-12', '13-22']
level_group_sorted = level_group[custom_order]

color_cycle = cycle(plt.cm.tab10.colors)
colors = [next(color_cycle) for _ in range(len(level_group_sorted))]

plt.figure(figsize=(12, 6))
bars = level_group_sorted.plot(kind='bar', color=colors, edgecolor='black')

# Annotate every bar with its count.
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
             int(bar.get_height()), ha='center', va='bottom', fontsize=10)

plt.title("Frequency of Level group", fontsize=16)
# The axis label and legend title previously said "Event Name(s)", carried over
# from the event-name chart; this chart shows level groups.
plt.xlabel("Level Group", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(rotation=45, ha='right')

patches = [mpatches.Patch(color=colors[i], label=level_group_sorted.index[i]) for i in range(len(level_group_sorted))]
plt.legend(handles=patches, title="Level Groups", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

plt.tight_layout()
plt.show()

<div style="text-align: center; background-color: #5A96E3; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
 Stage 02 - Data Preprocessing  📌
</div>

In [None]:
def feature_engineering(data, labels, CATEGORICAL, NUMERICAL, events=None):
    """Aggregate event rows into one feature row per (session_id, level_group).

    Parameters
    ----------
    data : pandas.DataFrame
        Event-level rows. Must contain ``session_id``, ``level_group``,
        ``event_name``, ``elapsed_time`` and every column listed in
        ``CATEGORICAL`` / ``NUMERICAL``. The frame is left unmodified.
    labels : object
        Unused; kept so existing callers do not have to change.
    CATEGORICAL : list of str
        Columns summarised as ``<col>_nunique`` (distinct values per group).
    NUMERICAL : list of str
        Columns summarised as ``<col>_mean``, ``<col>_sum`` and ``<col>_std``.
    events : list of str, optional
        Event names counted per group as ``<event>_sum``. Defaults to the
        module-level ``EVENTS`` list.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id`` with a ``level_group`` column, the features
        above, ``elapsed_time_sum``, and the timing columns ``max``, ``min``,
        ``count``, ``time`` (max - min) and ``avg_time`` (time / count).
        Missing values (e.g. the std of a single-row group) are filled with -1.
        Every column name appears exactly once.
    """
    if events is None:
        events = EVENTS

    keys = ['session_id', 'level_group']
    # Group once and reuse the grouper for every feature. observed=True keeps
    # only the (session, level_group) pairs that actually occur when
    # level_group is categorical, instead of adding empty rows for unused
    # categories.
    grouped = data.groupby(keys, observed=True)

    dfs = [grouped[c].nunique().rename(c + '_nunique') for c in CATEGORICAL]
    for stat in ('mean', 'sum', 'std'):
        dfs.extend(grouped[c].agg(stat).rename(c + '_' + stat) for c in NUMERICAL)

    if events:
        # Build the 0/1 event indicators in a separate frame so the caller's
        # DataFrame does not gain extra columns as a side effect.
        event_flags = pd.DataFrame(
            {c: (data['event_name'] == c).astype('int8') for c in events},
            index=data.index,
        )
        event_counts = event_flags.groupby(
            [data[key] for key in keys], observed=True
        ).sum()
        dfs.append(event_counts.add_suffix('_sum'))

    # elapsed_time_sum already exists when elapsed_time is in NUMERICAL; adding
    # it again would create a duplicate column name.
    if 'elapsed_time' not in NUMERICAL:
        dfs.append(grouped['elapsed_time'].sum().rename('elapsed_time_sum'))

    timing = grouped['elapsed_time'].agg(['max', 'min', 'count'])
    timing['time'] = timing['max'] - timing['min']
    timing['avg_time'] = timing['time'] / timing['count']
    dfs.append(timing)

    data_df = pd.concat(dfs, axis=1).fillna(-1)
    return data_df.reset_index().set_index('session_id')

# Columns summarised by their number of distinct values per group.
CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']

# Columns summarised by mean, sum and standard deviation per group.
NUMERICAL = ['elapsed_time','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

# Event names counted per group. feature_engineering reads this list as a
# module-level name, so it must be defined before the call below.
EVENTS = ['navigate_click', 'person_click', 'cutscene_click', 'object_click',
          'map_hover', 'notification_click', 'map_click', 'observation_click',
          'checkpoint']

# Build one feature row per (session_id, level_group) from the training events.
dataset_df = feature_engineering(full_data, labels, CATEGORICAL, NUMERICAL)

In [None]:
# Rows = (session, level_group) pairs; columns = level_group plus the features.
print("Full prepared dataset shape is {}".format(dataset_df.shape))

In [None]:
# Keep only the first occurrence of any repeated column name.
dataset_df = dataset_df.loc[:, ~dataset_df.columns.duplicated()]

In [None]:
# Inspect the dtype of every column in the prepared dataset.
dataset_df.dtypes

<div style="text-align: center; background-color: #5A96E3; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
 Stage 03 - Data Modeling 📌
</div>

In [None]:
def split_dataset(dataset, test_ratio=0.20):
    """Split `dataset` into two frames by index label, without shuffling.

    The distinct index labels are taken in order of first appearance; the
    leading ``1 - test_ratio`` share of them (rounded down) goes to the first
    frame and the rest to the second, so all rows sharing a label stay
    together.

    Returns a ``(first_part, second_part)`` tuple of DataFrames.
    """
    users = dataset.index.unique()
    n_first = int(len(users) * (1 - test_ratio))
    first_users, second_users = users[:n_first], users[n_first:]
    return dataset.loc[first_users], dataset.loc[second_users]

# Hold out the last 20% of sessions (by order of appearance) for validation.
train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(len(train_x), len(valid_x)))

# Zero-filled frame with one row per held-out session and one column per
# question. NOTE(review): not referenced again in the visible cells.
VALID_USER_LIST = valid_x.index.unique()
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST), 18)), index=VALID_USER_LIST)

In [None]:
# Every column except the level_group key is a model input.
FEATURES = [c for c in dataset_df.columns if c != 'level_group']
print('We will train with', len(FEATURES), 'features')
# Distinct session ids in the prepared dataset (the frame's index).
ALL_USERS = dataset_df.index.unique()
print('We will train with', len(ALL_USERS), 'users info')

In [None]:
# Grid-search a SelectKBest -> XGBoost pipeline separately for each of the 18
# questions and record the winning hyper-parameters in `find_models`.
find_models = {}
pipeline = Pipeline([
    ('select_k_best', SelectKBest(score_func=f_classif)),
    ('xgb', XGBClassifier(eval_metric='logloss'))
])

param_grid = {
    # The last entry keeps every column except level_group.
    'select_k_best__k': [5, 15, 25, 35, len(dataset_df.dtypes)-1],
    'xgb__learning_rate': [0.01, 0.03],
    'xgb__max_depth': [5, 6],
    'xgb__n_estimators': [500],
    'xgb__subsample': [0.8, 1],
    'xgb__colsample_bytree': [0.8],
    'xgb__booster': ['gbtree'],
    'xgb__tree_method': ['hist']
}

for q_no in range(1, 19):
    # Questions 1-3 use level group 0-4, 4-13 use 5-12, 14-18 use 13-22.
    if q_no <= 3:
        grp = '0-4'
    elif q_no <= 13:
        grp = '5-12'
    else:
        grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Feature rows of this level group, and the matching labels looked up by
    # session id so both are in the same row order.
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    train_labels = labels.loc[labels.q == q_no].set_index('session').loc[train_users]

    # Build X/y directly instead of writing a `correct` column into the
    # filtered frame and dropping it again. The hold-out split is not needed
    # here: GridSearchCV runs its own 3-fold cross-validation.
    X_train = train_df.drop(columns=["level_group"])
    y_train = train_labels["correct"]

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        verbose=1,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    print("Best parameters found: ", grid_search.best_params_)
    find_models[q_no] = grid_search.best_params_
    print("Best accuracy: ", grid_search.best_score_)


print("Found Dictionary:", find_models)

<div style="text-align: center; background-color: #5A96E3; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
 Stage 04 - Experiment 📌
</div>

In [None]:
# 5-fold cross-validation grouped by session: for every fold and question,
# refit SelectKBest + XGBoost with the tuned parameters, record accuracy, and
# store out-of-fold probabilities in `oof`.
warnings.filterwarnings("ignore", message="Features .* are constant")
warnings.filterwarnings("ignore", message="invalid value encountered in divide")

gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
models = {}           # q_no -> {"model", "selector"}; overwritten each fold, so the last fold's fit is kept
evaluation_dict = {}  # q_no -> accuracy summed over the folds

# Labels of each question indexed by session, built once instead of twice per
# (fold, question) iteration.
labels_by_question = {
    q_no: labels.loc[labels.q == q_no].set_index('session')
    for q_no in range(1, 19)
}

for i, (train_index, test_index) in enumerate(gkf.split(X=dataset_df, groups=dataset_df.index)):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)

    # Slice the fold once per fold. Fold-local names are used so the
    # `train_x` / `valid_x` hold-out split from Stage 03 is not overwritten.
    fold_train = dataset_df.iloc[train_index]
    fold_valid = dataset_df.iloc[test_index]

    for q_no in range(1,19):

        # Questions 1-3 use level group 0-4, 4-13 use 5-12, 14-18 use 13-22.
        if q_no <= 3:
            grp = '0-4'
        elif q_no <= 13:
            grp = '5-12'
        else:
            grp = '13-22'

        print("### q_no", q_no, "grp", grp)

        question_labels = labels_by_question[q_no]
        params = find_models[q_no]

        train_part = fold_train.loc[fold_train.level_group == grp]
        train_users = train_part.index.values
        train_y = question_labels.loc[train_users]

        valid_part = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_part.index.values
        valid_y = question_labels.loc[valid_users]

        final_model = XGBClassifier(
            learning_rate=params['xgb__learning_rate'],
            max_depth=params['xgb__max_depth'],
            n_estimators=params['xgb__n_estimators'],
            subsample=params['xgb__subsample'],
            colsample_bytree=params['xgb__colsample_bytree'],
            eval_metric='logloss',
            booster=params['xgb__booster'],
            tree_method=params['xgb__tree_method']
        )

        # Fit the selector on the training part only, then apply the same
        # column choice to the validation part.
        k_best = params['select_k_best__k']
        selector = SelectKBest(score_func=f_classif, k=k_best)
        X_train_selected = selector.fit_transform(train_part[FEATURES], train_y['correct'])
        X_valid_selected = selector.transform(valid_part[FEATURES])

        clf = final_model
        clf.fit(X_train_selected, train_y['correct'],
                eval_set=[ (X_valid_selected, valid_y['correct']) ],
                verbose=0)

        models[q_no] = {
            "model": clf,
            "selector": selector
        }

        y_pred = clf.predict(X_valid_selected)
        accuracy = accuracy_score(valid_y['correct'], y_pred)

        if q_no not in evaluation_dict:
            evaluation_dict[q_no] = 0
        evaluation_dict[q_no] += accuracy
        print('Accuracy: ', accuracy)

        # Column q_no-1 of `oof` holds the probability of a correct answer.
        oof.loc[valid_users, q_no-1] = clf.predict_proba(X_valid_selected)[:,1]

    print()

In [None]:
# evaluation_dict holds, per question, the accuracy summed over all folds, so
# divide by (questions x folds). The fold count is read from the splitter
# rather than hard-coded, so it cannot drift from GroupKFold(n_splits=...).
n_folds = gkf.get_n_splits()
average_accuracy = sum(evaluation_dict.values()) / (len(evaluation_dict) * n_folds)

print(f"Average Accuracy: {average_accuracy:.4f}")

In [None]:
# Ground-truth matrix with the same shape and index as `oof`:
# rows = sessions (ALL_USERS), column k = `correct` for question k+1.
true = oof.copy()
for k in range(18):
    # Reorder this question's labels to match the ALL_USERS row order.
    tmp = labels.loc[labels.q == k+1].set_index('session').loc[ALL_USERS]
    true[k] = tmp.correct.values

In [None]:
# Sweep decision thresholds from 0.40 to 0.80 in steps of 0.01 and keep the one
# with the highest macro F1 over all (session, question) pairs.
scores = []; thresholds = []
best_score = 0; best_threshold = 0

# Flatten both matrices once; they do not change during the sweep.
flat_true = true.values.reshape((-1))
flat_oof = oof.values.reshape((-1))

for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end='')
    preds = (flat_oof > threshold).astype('int')
    m = f1_score(flat_true, preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    if m > best_score:
        best_score, best_threshold = m, threshold


print('\tbest threshold = ',best_threshold)

In [None]:
# Macro F1 as a function of the threshold, with the best point enlarged.
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(thresholds, scores, '-o', color='blue')
ax.scatter([best_threshold], [best_score], color='blue', s=300, alpha=1)
ax.set_xlabel('Threshold', size=14)
ax.set_ylabel('Validation F1 Score', size=14)
ax.set_title(f'Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}', size=18)
plt.show()

In [None]:
# Macro F1 per question at the single best threshold, then over all questions.
print('Applying the best threshold for each question...')
for k in range(18):

    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    # Column k holds question k+1, so print the 1-based question number
    # (previously this printed Q0..Q17).
    print(f'Q{k+1}: F1 =',m)

m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('==> Overall F1 =',m)

<div style="text-align: center; background-color: #5A96E3; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
 Stage 05 - Submission 📌
</div>

In [None]:
# Competition-provided module, not defined in this notebook.
# NOTE(review): presumably only importable inside the Kaggle runtime — confirm.
import jo_wilder_310
env = jo_wilder_310.make_env()
# The submission loop unpacks each item of this iterator as a
# (test, sample_submission) pair.
iter_test = env.iter_test()

In [None]:
# Level group -> half-open range [first, last) of the question numbers it covers.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}
for (test, sample_submission) in iter_test:

    # Same feature pipeline as for training.
    test_df = feature_engineering(test, labels, CATEGORICAL, NUMERICAL)
    test_df = test_df.loc[:, ~test_df.columns.duplicated()]

    # NOTE(review): assumes each batch holds a single level group (and the rows
    # of sample_submission match the prediction order) — confirm against the API.
    grp = test_df["level_group"].iloc[0]
    a,b = limits[grp]

    # FEATURES excludes level_group, so selecting it also drops that column.
    X_all = test_df[FEATURES]

    for t in range(a, b):
        model = models[t]['model']
        selector = models[t]['selector']
        X_test = selector.transform(X_all)

        y_pred_probs = model.predict_proba(X_test)
        y_pred = (y_pred_probs[:, 1] > best_threshold).astype(int)

        # Match the exact "_q<t>" suffix. A substring test on f'q{t}' would
        # also match q10..q18 when t == 1.
        mask = sample_submission.session_id.str.endswith(f'_q{t}')
        sample_submission.loc[mask,'correct'] = y_pred

    env.predict(sample_submission)

In [None]:
# Show the first lines of submission.csv (presumably written by env.predict — confirm).
! head submission.csv