In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predict-student-performance-from-game-play/sample_submission.csv
/kaggle/input/predict-student-performance-from-game-play/train_labels.csv
/kaggle/input/predict-student-performance-from-game-play/train.csv
/kaggle/input/predict-student-performance-from-game-play/test.csv
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/__init__.py


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

dtypes = {
    'elapsed_time': int,
    'event_name': 'category',
    'name': 'category',
    'level': int,
    'room_coor_x': float,
    'room_coor_y': float,
    'screen_coor_x': float,
    'screen_coor_y': float,
    'hover_duration': float,
    'text': 'category',
    'fqid': 'category',
    'room_fqid': 'category',
    'text_fqid': 'category',
    'fullscreen': 'category',
    'hq': 'category',
    'music': 'category',
    'level_group': 'category'
}

In [3]:
dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)

CATEGORICAL = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time', 'level', 'page', 'room_coor_x', 'room_coor_y','screen_coor_x', 'screen_coor_y', 'hover_duration']


In [4]:
def feature_engineer(dataset_df):
    agg_dict = {}
    for c in CATEGORICAL:
        agg_dict[c] = ['nunique']
    for c in NUMERICAL:
        agg_dict[c] = ['mean', 'std']
    dfs = []
    for group in dataset_df['level_group'].unique():
        group_df = dataset_df[dataset_df['level_group'] == group]
        agg_df = group_df.groupby(['session_id'])[NUMERICAL+CATEGORICAL].agg(agg_dict)
        agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
        agg_df = agg_df.fillna(-1)
        agg_df['level_group'] = group
        dfs.append(agg_df)
    dataset_df = pd.concat(dfs).reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [5]:
dataset_df = feature_engineer(dataset_df)

In [6]:
ALL_USERS = dataset_df.index.unique()
FEATURES = [c for c in dataset_df.columns if c != 'level_group']

targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]) )
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)

In [9]:
for train_index, test_index in kf.split(X=dataset_df):
    for t in range(1, 19):
        if t <= 3:
            grp = '0-4'
        elif t <= 13:
            grp = '5-12'
        elif t <= 22:
            grp = '13-22'
        train_x = dataset_df.iloc[train_index]
        train_x = train_x.loc[train_x.level_group == grp]
        train_users = train_x.index.values
        train_y = targets.loc[targets.q==t].set_index('session').loc[train_users]
        valid_x = dataset_df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = targets.loc[targets.q==t].set_index('session').loc[valid_users]
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(train_x[FEATURES], train_y['correct'])
        preds = clf.predict(valid_x[FEATURES])
        oof.loc[valid_users, t-1] = preds

In [10]:
true = oof.copy()
for k in range(18):
    tmp = targets.loc[targets.q == k+1].set_index('session').loc[ALL_USERS]
    true[k] = tmp.correct.values

In [11]:
thresholds = np.arange(0.2, 0.8, 0.01)
f1_scores = []

for threshold in thresholds:
    preds = (oof.values.reshape(-1) > threshold).astype(int)
    f1_score = precision_recall_fscore_support(true.values.reshape(-1), preds, average='macro')[2]
    f1_scores.append(f1_score)
    
best_threshold = thresholds[np.argmax(f1_scores)]
best_f1_score = np.max(f1_scores)

print('Best threshold:', best_threshold)
print('Best macro F1 score:', best_f1_score)

for k in range(18):
    f1_score = precision_recall_fscore_support(true[k], (oof[k] > best_threshold).astype(int), average='macro')[2]
    print(f'Question {k+1} macro F1 score: {f1_score}')
    
macro_f1_score = precision_recall_fscore_support(true.values.reshape(-1), (oof.values.reshape(-1) > best_threshold).astype(int), average='macro')[2]
print('Overall macro F1 score:', macro_f1_score)

Best threshold: 0.2
Best macro F1 score: 0.637642386219949
Question 1 macro F1 score: 0.4735615124630912
Question 2 macro F1 score: 0.49463795470144134
Question 3 macro F1 score: 0.4829152676278886
Question 4 macro F1 score: 0.5190971744610987
Question 5 macro F1 score: 0.5921725797432611
Question 6 macro F1 score: 0.524356228864876
Question 7 macro F1 score: 0.4944352264066215
Question 8 macro F1 score: 0.4862944391019002
Question 9 macro F1 score: 0.5140098328306957
Question 10 macro F1 score: 0.5875832094180031
Question 11 macro F1 score: 0.5146469329166573
Question 12 macro F1 score: 0.46867401700153594
Question 13 macro F1 score: 0.4855489740986948
Question 14 macro F1 score: 0.5155125406991153
Question 15 macro F1 score: 0.5965232480925716
Question 16 macro F1 score: 0.43501994070365213
Question 17 macro F1 score: 0.4354989350988148
Question 18 macro F1 score: 0.48733681462140993
Overall macro F1 score: 0.637642386219949
