In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and echo each one on its own line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

# Column -> dtype mapping handed to pd.read_csv for the training CSV.
# Counters are parsed as int, coordinates and durations as float, and the
# remaining listed columns as pandas categoricals.
dtypes = dict(
    elapsed_time=int,
    event_name='category',
    name='category',
    level=int,
    room_coor_x=float,
    room_coor_y=float,
    screen_coor_x=float,
    screen_coor_y=float,
    hover_duration=float,
    text='category',
    fqid='category',
    room_fqid='category',
    text_fqid='category',
    fullscreen='category',
    hq='category',
    music='category',
    level_group='category',
)

In [None]:
# Read the training CSV, parsing columns with the dtypes declared in `dtypes`.
dataset_df = pd.read_csv(
    '/kaggle/input/predict-student-performance-from-game-play/train.csv',
    dtype=dtypes,
)

# Columns that feature_engineer reduces to a distinct-value count per session.
CATEGORICAL = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# Columns that feature_engineer reduces to a mean and standard deviation per session.
NUMERICAL = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

## Process data

In [None]:
def feature_engineer(dataset_df):
    """Collapse the event-level frame into one feature row per session and level group.

    For every distinct ``level_group`` value the matching rows are grouped by
    ``session_id``. Each CATEGORICAL column is reduced to its number of
    distinct values and each NUMERICAL column to its mean and standard
    deviation. Result columns are named ``<column>_<aggregate>``.

    Args:
        dataset_df: Frame containing ``session_id``, ``level_group`` and every
            column listed in CATEGORICAL and NUMERICAL.

    Returns:
        DataFrame indexed by ``session_id`` with the aggregated columns
        (missing aggregates replaced by -1) plus a ``level_group`` column.
        The per-group tables are stacked in the order the groups first
        appear in the input.
    """
    # Categorical entries are inserted before the numerical ones.
    aggregations = {
        **{column: ['nunique'] for column in CATEGORICAL},
        **{column: ['mean', 'std'] for column in NUMERICAL},
    }
    value_columns = NUMERICAL + CATEGORICAL

    per_group = []
    for group in dataset_df['level_group'].unique():
        # Rows belonging to the current level group only.
        events = dataset_df[dataset_df['level_group'] == group]

        summary = events.groupby(['session_id'])[value_columns].agg(aggregations)

        # Flatten the (column, aggregate) pairs into single "column_aggregate" labels.
        summary.columns = ['_'.join(parts).strip() for parts in summary.columns]

        # An undefined aggregate (e.g. std of a single row) becomes the sentinel -1.
        summary = summary.fillna(-1)
        summary['level_group'] = group

        per_group.append(summary)

    # Stack the per-group tables; the result stays indexed by session_id.
    return pd.concat(per_group).reset_index().set_index('session_id')

In [None]:
# NOTE(review): this rebinds dataset_df from the raw event log to the aggregated
# feature table, so the cell is not idempotent. Re-running it without reloading
# the CSV passes the aggregated table back into feature_engineer.
dataset_df = feature_engineer(dataset_df)

In [None]:
# Session ids present in the feature table, de-duplicated.
ALL_USERS = dataset_df.index.unique()

# Model inputs: every column of the feature table apart from 'level_group'.
FEATURES = [column for column in dataset_df.columns if column != 'level_group']

# Labels file; its session_id strings are split on '_' below.
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

# The text before the first '_' is the numeric session id.
targets['session'] = [int(raw_id.split('_')[0]) for raw_id in targets['session_id']]

# The text after the last '_' is the question tag; drop its first character
# and keep the remaining number.
targets['q'] = [int(raw_id.split('_')[-1][1:]) for raw_id in targets['session_id']]

## Model Training

In [None]:
def get_level_group(t):
    """Return the level-group label that question number ``t`` belongs to.

    Args:
        t: Question number (the training loop passes 1 through 18).

    Returns:
        '0-4' when ``t`` is at most 3, '5-12' when ``t`` is at most 13,
        otherwise '13-22'.
    """
    # (last question of the group, group label), checked in ascending order.
    upper_bounds = ((3, '0-4'), (13, '5-12'))
    for last_question, label in upper_bounds:
        if t <= last_question:
            return label
    return '13-22'

def prepare_data(df, targets, level_group, q):
    """Build the feature matrix and label vector for one question.

    Args:
        df: Feature table indexed by session id, with a ``level_group`` column.
        targets: Label table with ``q``, ``session`` and ``correct`` columns.
        level_group: Level-group label whose rows of ``df`` are kept.
        q: Question number whose labels are selected from ``targets``.

    Returns:
        Tuple ``(X, y)``: the FEATURES columns of the kept rows, and the
        ``correct`` labels looked up by those rows' session ids, in the same
        order.
    """
    in_group = df[df['level_group'] == level_group]
    session_ids = in_group.index.values

    # Re-index the question's labels by session so they can be aligned to the rows above.
    labels_by_session = targets[targets['q'] == q].set_index('session')
    labels = labels_by_session.loc[session_ids, 'correct']

    return in_group[FEATURES], labels

def train_and_predict(clf, train_x, train_y, valid_x):
    """Fit ``clf`` on the training split and score the validation split.

    The result is the probability of the positive class (label 1) for each
    validation row. Returning hard 0/1 labels would make the threshold sweep
    in the evaluation step meaningless, because every threshold between 0.2
    and 0.8 maps a 0/1 value to the same prediction.

    Args:
        clf: Estimator with ``fit`` and ``predict``; ``predict_proba`` and
            ``classes_`` are used when present.
        train_x: Training features.
        train_y: Training labels.
        valid_x: Validation features to score.

    Returns:
        1-D array with one score per row of ``valid_x``: P(label == 1) when
        ``clf`` exposes ``predict_proba``, otherwise the output of
        ``clf.predict``.
    """
    clf.fit(train_x, train_y)

    # Estimators without probability output fall back to hard labels.
    if not hasattr(clf, 'predict_proba'):
        return clf.predict(valid_x)

    proba = np.asarray(clf.predict_proba(valid_x))

    # predict_proba columns follow clf.classes_; assume [0, 1] if the estimator
    # does not expose it -- TODO confirm for non-sklearn estimators.
    classes = list(getattr(clf, 'classes_', [0, 1]))
    if 1 in classes:
        return proba[:, classes.index(1)]

    # The training split contained no positive label, so the model never
    # predicts class 1.
    return np.zeros(proba.shape[0])

In [None]:
# 5-fold splitter over the rows of dataset_df (one row per session per level group).
# Shuffling is seeded so the fold assignment is identical on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold prediction tables: one row per session, one column per question
# (column t-1 holds the predictions for question t).
oof_rfc = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)
oof_lr = oof_rfc.copy()

for train_index, test_index in kf.split(X=dataset_df):
    # Slice the fold once instead of once per question.
    train_fold = dataset_df.iloc[train_index]
    valid_fold = dataset_df.iloc[test_index]

    # One pair of models per question, trained on the rows of that question's level group.
    for t in range(1, 19):
        level_group = get_level_group(t)

        train_x, train_y = prepare_data(train_fold, targets, level_group, t)
        valid_x, valid_y = prepare_data(valid_fold, targets, level_group, t)
        valid_users = valid_x.index.values

        # Random forests are stochastic; a fixed seed makes the out-of-fold
        # scores reproducible across runs, matching the seeded fold split.
        clf_rfc = RandomForestClassifier(n_estimators=100, random_state=42)
        preds_rfc = train_and_predict(clf_rfc, train_x, train_y, valid_x)
        oof_rfc.loc[valid_users, t-1] = preds_rfc

        # L2-regularised logistic regression on the same split.
        clf_lr = LogisticRegression(C=1, max_iter=1000, penalty='l2')
        preds_lr = train_and_predict(clf_lr, train_x, train_y, valid_x)
        oof_lr.loc[valid_users, t-1] = preds_lr

## Model evaluation

In [None]:
def get_true_values(true, targets):
    """Fill columns 0-17 of ``true`` with the ground-truth labels of questions 1-18.

    Args:
        true: DataFrame whose columns 0..17 are overwritten in place; its
            rows are expected to line up with ALL_USERS.
        targets: Label table with ``q``, ``session`` and ``correct`` columns.

    Returns:
        The same ``true`` object, after modification.
    """
    for question in range(1, 19):
        answers = targets[targets['q'] == question].set_index('session')
        # Order the labels by ALL_USERS so they align row-for-row with `true`.
        true[question - 1] = answers.loc[ALL_USERS, 'correct'].values
    return true

def get_best_threshold_and_f1_score(true, oof):
    """Find the single cut-off that maximises macro F1 over all questions pooled.

    Args:
        true: DataFrame of ground-truth labels.
        oof: DataFrame of out-of-fold predictions with the same layout.

    Returns:
        Tuple ``(best_threshold, best_f1_score)`` over the candidate
        thresholds 0.20, 0.21, ... (step 0.01, upper bound 0.8 excluded).
        Ties resolve to the lowest threshold.
    """
    # Flatten once; these do not depend on the threshold.
    flat_true = true.values.reshape(-1)
    flat_scores = oof.values.reshape(-1)

    candidates = np.arange(0.2, 0.8, 0.01)
    macro_f1 = [
        precision_recall_fscore_support(
            flat_true, (flat_scores > cutoff).astype(int), average='macro'
        )[2]
        for cutoff in candidates
    ]

    winner = np.argmax(macro_f1)
    return candidates[winner], np.max(macro_f1)

def print_f1_scores_per_question(true, oof, best_threshold):
    """Print the macro F1 of each of the 18 questions at a fixed threshold.

    Args:
        true: DataFrame of ground-truth labels, columns 0..17.
        oof: DataFrame of out-of-fold predictions, columns 0..17.
        best_threshold: Predictions strictly above this value count as 1.
    """
    for k in range(18):
        predicted = (oof[k] > best_threshold).astype(int)
        # The third element of the returned tuple is the F-score.
        _, _, f1_score, _ = precision_recall_fscore_support(true[k], predicted, average='macro')
        print(f'Question {k+1} macro F1 score: {f1_score}')

def evaluate_model(oof, targets):
    """Print a threshold-tuned macro F1 report for a table of out-of-fold predictions.

    Prints the best pooled threshold and its macro F1, then the macro F1 of
    each question at that threshold, then the pooled macro F1 again.

    Args:
        oof: DataFrame of out-of-fold predictions (not modified).
        targets: Label table with ``q``, ``session`` and ``correct`` columns.
    """
    # Work on a copy so filling in the labels leaves `oof` untouched.
    true = get_true_values(oof.copy(), targets)

    best_threshold, best_f1_score = get_best_threshold_and_f1_score(true, oof)
    print('Best threshold:', best_threshold)
    print('Best macro F1 score:', best_f1_score)

    print_f1_scores_per_question(true, oof, best_threshold)

    flat_true = true.values.reshape(-1)
    flat_pred = (oof.values.reshape(-1) > best_threshold).astype(int)
    macro_f1_score = precision_recall_fscore_support(flat_true, flat_pred, average='macro')[2]
    print('Overall macro F1 score:', macro_f1_score)

In [None]:
# Evaluate RandomForest: print the best pooled threshold, its macro F1 and the
# per-question macro F1 for the random-forest out-of-fold predictions.
evaluate_model(oof_rfc, targets)

In [None]:
# Evaluate logistic regression: same report as above, for the
# logistic-regression out-of-fold predictions.

evaluate_model(oof_lr, targets)