In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
# Importing Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense, Flatten
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print
# its full path, one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Notebook-wide configuration.
# Silence pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Seed TensorFlow's and NumPy's global random generators with the same fixed value.
tf.random.set_seed(42)
np.random.seed(42)
# Reset Keras' global state before any model is built.
keras.backend.clear_session()




In [3]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
# Column -> dtype mapping for train.csv, assembled per target dtype.
_FLOAT32_COLUMNS = [
    'room_coor_x', 'room_coor_y',
    'screen_coor_x', 'screen_coor_y',
    'hover_duration',
]
_CATEGORY_COLUMNS = [
    'event_name', 'name', 'text',
    'fqid', 'room_fqid', 'text_fqid',
    'fullscreen', 'hq', 'music',
    'level_group',
]

dtypes = {'elapsed_time': np.int32, 'level': np.uint8}
dtypes.update(dict.fromkeys(_FLOAT32_COLUMNS, np.float32))
dtypes.update(dict.fromkeys(_CATEGORY_COLUMNS, 'category'))

# Read the full training event log with the dtypes above.
dataset_df = pd.read_csv('train.csv', dtype=dtypes)
print(f"Full train dataset shape is {dataset_df.shape}")

Full train dataset shape is (26296946, 20)


In [None]:
def _session_from_label_id(label_id):
    """Return the part of ``label_id`` before the first underscore, as an int."""
    return int(label_id.split('_')[0])


def _question_from_label_id(label_id):
    """Return the part of ``label_id`` after the last underscore, minus its first character, as an int."""
    return int(label_id.split('_')[-1][1:])


# Load the labels and derive the numeric session and question number
# from each composite session_id string.
labels = pd.read_csv('train_labels.csv')
labels['session'] = labels['session_id'].map(_session_from_label_id)
labels['q'] = labels['session_id'].map(_question_from_label_id)

In [None]:
# Categorizing Features on the Type
# Columns summarised by their number of distinct values.
CATEGORICAL = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# Columns summarised by mean and standard deviation.
NUMERICAL = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

In [None]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook
def feature_engineer(dataset_df, categorical=None, numerical=None):
    """Aggregate event rows into one feature row per (session_id, level_group).

    For every categorical column the number of distinct values is computed
    (output column ``<name>_nunique``). For every numerical column the mean
    (output column ``<name>``) and the standard deviation (output column
    ``<name>_std``) are computed.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Event-level frame containing ``session_id``, ``level_group`` and the
        columns named in ``categorical`` and ``numerical``.
    categorical : list of str, optional
        Columns to summarise with ``nunique``. Defaults to the module-level
        ``CATEGORICAL`` list.
    numerical : list of str, optional
        Columns to summarise with ``mean`` and ``std``. Defaults to the
        module-level ``NUMERICAL`` list.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``session_id`` whose first column is ``level_group``,
        followed by the nunique, mean and std features in that order.
        Missing aggregates (for example the std of a single-row group) are
        replaced by -1.
    """
    if categorical is None:
        categorical = CATEGORICAL
    if numerical is None:
        numerical = NUMERICAL

    # Build the grouping once; every aggregate below reuses it instead of
    # re-grouping the whole frame for each column.
    grouped = dataset_df.groupby(['session_id', 'level_group'])

    aggregates = [grouped[c].agg('nunique').rename(c + '_nunique') for c in categorical]
    aggregates += [grouped[c].agg('mean') for c in numerical]
    aggregates += [grouped[c].agg('std').rename(c + '_std') for c in numerical]

    features = pd.concat(aggregates, axis=1)
    features = features.fillna(-1)
    # Move level_group out of the index so that session_id alone identifies rows.
    features = features.reset_index().set_index('session_id')
    return features

In [None]:
# Replace the raw event log with the aggregated feature table and report its size.
dataset_df = feature_engineer(dataset_df)
print(f"Dataset shape is {dataset_df.shape}")

In [None]:
# Splitting DataSet into Train and Valid
def split_dataset(dataset, test_ratio=0.20):
    """Split ``dataset`` in two by index value, preserving the original order.

    The unique index values are taken in order of first appearance; the
    first ``1 - test_ratio`` share of them goes to the first frame and the
    rest to the second. All rows that share an index value end up in the
    same frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose index identifies the unit to split on.
    test_ratio : float, default 0.20
        Fraction of unique index values assigned to the second frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(first_part, held_out_part)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be between 0 and 1, got {!r}".format(test_ratio))
    USER_LIST = dataset.index.unique()
    # Honour the caller's ratio (previously 0.20 was hard-coded here).
    split = int(len(USER_LIST) * (1 - test_ratio))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

# Hold out the tail of the sessions as the test set. The first part still
# contains both training and validation data at this point.
train_x, test_x = split_dataset(dataset_df)
print("{} examples in training+validation, {} examples in testing.".format(
    len(train_x), len(test_x)))

In [None]:
# Splitting the remaining data into Train and Valid
# NOTE: train_x is reassigned from itself, so re-running this cell on its
# own shrinks the training set again.
train_x, valid_x = split_dataset(train_x)
print("{} examples in training, {} examples in validation.".format(
    len(train_x), len(valid_x)))

In [None]:
# Function to Split DataSet into Parts
def save_to_multiple_csv_files(data, name_prefix, question, header=None, n_parts=5,
                               base_dir="/kaggle/working"):
    """Write the rows of ``data`` into ``n_parts`` CSV files and return their paths.

    Files are created in
    ``<base_dir>/datasets_<question>/student_performance_data`` and named
    ``my_<name_prefix>_<NN>.csv``. Rows are distributed over the files in
    order, in contiguous chunks as produced by ``numpy.array_split``.

    Parameters
    ----------
    data : indexable of rows
        ``data[i]`` must be an iterable of field values (for example a 2-D
        NumPy array).
    name_prefix : str
        Inserted into every file name.
    question : object
        Converted with ``str`` and used in the directory name.
    header : str, optional
        If given, written as the first line of every part file.
    n_parts : int, default 5
        Number of part files to create.
    base_dir : str, default "/kaggle/working"
        Root directory under which the dataset directory is created.

    Returns
    -------
    list of str
        Paths of the part files, in order.
    """
    # Creating Directory for Each Question
    game_prediction_dir = os.path.join(
        base_dir, "datasets_" + str(question), "student_performance_data")
    os.makedirs(game_prediction_dir, exist_ok=True)
    path_format = os.path.join(game_prediction_dir, "my_{}_{:02d}.csv")

    def _format_field(value):
        # NumPy >= 2 renders scalars as e.g. "np.float64(1.0)", which is not a
        # valid CSV number; convert to the plain Python value before repr().
        if isinstance(value, np.generic):
            value = value.item()
        return repr(value)

    filepaths = []
    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(row_chunks):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            f.writelines(
                ",".join(_format_field(col) for col in data[row_idx]) + "\n"
                for row_idx in row_indices
            )
    return filepaths

In [None]:
# Generating CSV files for Each Question and Saving them
train_file_paths_for_questions = []
valid_file_paths_for_questions = []
test_file_paths_for_questions = []

# Header matching the columns that are actually written: the feature
# columns (level_group is dropped before saving) followed by the label.
header_cols = [col for col in train_x.columns if col != 'level_group'] + ['correct']
header = ",".join(header_cols)

for q_no in range(1,19):

    # Selecting the Group based on Question Number
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("##### Generating CSV for q_no", q_no, "grp", grp)

    # Labels of this question, indexed by session. Filtered once and reused
    # for all three splits.
    question_labels = labels.loc[labels.q==q_no].set_index('session')

    filepaths_by_split = {}
    for split_name, split_x in (("train", train_x), ("valid", valid_x), ("test", test_x)):
        # Filter the rows of the split based on the selected level group.
        # .copy() makes the frame independent so adding a column is safe.
        split_df = split_x.loc[split_x.level_group == grp].copy()
        split_users = split_df.index.values

        # Add the label; .loc raises KeyError if a session has no label.
        split_df["correct"] = question_labels.loc[split_users, "correct"]

        # Dropping Column Level Group. The session_id index is deliberately
        # not written: each CSV row holds only the features and the label.
        split_ds_data = split_df.drop(columns=['level_group'])

        # Calling function to generate CSVs
        filepaths_by_split[split_name] = save_to_multiple_csv_files(
            split_ds_data.to_numpy(), split_name, "q_no_"+str(q_no), header, n_parts=5)

    # Saving File Paths
    train_file_paths_for_questions.append(filepaths_by_split["train"])
    valid_file_paths_for_questions.append(filepaths_by_split["valid"])
    test_file_paths_for_questions.append(filepaths_by_split["test"])

In [None]:
# Pre Process Function
# Number of feature columns in each CSV row; the label is one extra trailing column.
n_inputs = 21
def preprocess(line):
    """Parse one CSV text line into a ``(features, label)`` pair of tensors.

    The first ``n_inputs`` fields are decoded as floats with a default of 0.
    The last field is decoded as float32 with an empty default, which
    ``tf.io.decode_csv`` documents as marking the column required.

    Returns the stacked feature fields and the label stacked into a
    one-element tensor.
    """
    feature_defaults = [0.] * n_inputs
    label_default = tf.constant([], dtype=tf.float32)
    record_defaults = feature_defaults + [label_default]

    *feature_fields, label_field = tf.io.decode_csv(line, record_defaults=record_defaults)
    return tf.stack(feature_fields), tf.stack([label_field])

In [None]:
# CSV Reader for Train
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,  # number of files or filepaths
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a batched ``tf.data`` pipeline over a set of CSV part files.

    The file list is repeated ``repeat`` times, ``n_readers`` files are read
    in an interleaved fashion with their header line skipped, the lines are
    shuffled through a buffer of ``shuffle_buffer_size``, parsed with
    ``preprocess`` and grouped into batches of ``batch_size``. One batch is
    prefetched.

    Note that shuffling is always applied, whichever split the files
    belong to.
    """
    def _lines_without_header(filepath):
        # Each part file starts with a header row that must not be parsed.
        return tf.data.TextLineDataset(filepath).skip(1)

    file_ds = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    line_ds = file_ds.interleave(
        _lines_without_header,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads)
    example_ds = line_ds.shuffle(shuffle_buffer_size).map(
        preprocess, num_parallel_calls=n_parse_threads)
    return example_ds.batch(batch_size).prefetch(1)

In [None]:
# Build one train / valid / test input pipeline per question from the
# CSV part files saved earlier, and keep them in per-split lists.
train_set_list = []
valid_set_list = []
test_set_list = []

for q_no in range(1, 19):

    # Level group of the question; only used for the progress message.
    grp = '0-4' if q_no <= 3 else ('5-12' if q_no <= 13 else '13-22')
    print("##### Loading CSV for q_no", q_no, "grp", grp)

    question_idx = q_no - 1
    train_set, valid_set, test_set = (
        csv_reader_dataset(paths_per_question[question_idx])
        for paths_per_question in (
            train_file_paths_for_questions,
            valid_file_paths_for_questions,
            test_file_paths_for_questions,
        )
    )

    train_set_list.append(train_set)
    valid_set_list.append(valid_set)
    test_set_list.append(test_set)

In [None]:
# Training Models for Each Question
test_loss_and_accuracy_list = []
history_models = []
models = {}
f1_score_list = []

for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("##### Training for q_no", q_no, "grp", grp)

    train_set = train_set_list[q_no - 1]
    valid_set = valid_set_list[q_no - 1]
    test_set = test_set_list[q_no - 1]

    # The 21 features are fed as 21 time steps with a single channel each.
    # The output layer uses a sigmoid so that the model emits probabilities,
    # which is what binary_crossentropy expects by default.
    model = keras.models.Sequential([
        keras.layers.LSTM(20, return_sequences=True, input_shape=(21, 1)),
        keras.layers.LSTM(20, return_sequences=True),
        keras.layers.TimeDistributed(keras.layers.Dense(1, activation='sigmoid'))
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # NOTE: with epochs=1 the patience of 3 can never trigger; the callback
    # only matters once the number of epochs is raised.
    early_stopping = EarlyStopping(monitor='val_accuracy', mode='max', patience=3,  restore_best_weights=True)
    history = model.fit(train_set, epochs=1, validation_data=valid_set, callbacks=[early_stopping])

    # Store the model
    models[f'{grp}_{q_no}'] = model

    # Saving Accuracies
    results = model.evaluate(test_set)
    test_loss_and_accuracy_list.append(results)

    # Saving History of Models
    history_models.append(history)

    # F1 Score
    # The test pipeline shuffles its rows on every pass, so predictions and
    # labels must be collected in the SAME pass to stay aligned.
    y_true_numpy_list = []
    predictions_list = []

    for x_batch, y_batch in test_set:

        # Add the channel axis explicitly to match input_shape=(21, 1),
        # then keep the output of the last time step only.
        batch_outputs = model.predict_on_batch(tf.expand_dims(x_batch, axis=-1))
        batch_predictions = batch_outputs[:, -1].round().astype(int).flatten()
        predictions_list.append(batch_predictions)

        y_batch = y_batch.numpy()
        y_batch = y_batch.round().astype(int).flatten()
        y_true_numpy_list.append(y_batch)

    y_true_numpy = np.concatenate(y_true_numpy_list)
    predictions = np.concatenate(predictions_list)
    f1_score_list.append(f1_score(y_true_numpy, predictions, average='weighted'))

In [None]:
# Set up the test-serving environment used by the inference loop.
# NOTE(review): jo_wilder is presumably the competition-provided API module,
# only importable inside the competition environment — confirm.
import jo_wilder
# NOTE(review): presumably make_env() may only be called once per kernel
# session — confirm before re-running this cell.
env = jo_wilder.make_env()
# Iterator consumed by the inference loop, which unpacks (test, sample_submission) pairs.
iter_test = env.iter_test()

In [None]:
# Model output above which a question is predicted as answered correctly.
best_threshold = 0.63

# Half-open question-number range [start, stop) handled by each level group.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:

    # FEATURE ENGINEER TEST DATA
    df = feature_engineer(test)

    # Model inputs: every engineered column except level_group. Built once
    # per batch because it does not depend on the question.
    p = df.loc[:, df.columns != 'level_group']

    # INFER TEST DATA
    grp = test.level_group.values[0]
    a, b = limits[grp]
    for t in range(a, b):
        clf = models[f'{grp}_{t}']

        # Make predictions using the model of this question
        predictions = clf.predict(p)

        # Keep one value per row: the output of the LAST time step, which is
        # the one the training cell evaluates (the first element of the
        # flattened output would be the first time step).
        last_step_outputs = np.asarray(predictions).reshape(len(p), -1)[:, -1]

        # Select exactly the rows of this question; a substring match on
        # 'q1' would also hit 'q10' ... 'q18'.
        mask = sample_submission.session_id.str.endswith(f'_q{t}')

        # Update the 'correct' column in sample_submission with the
        # thresholded prediction of the first (only expected) row.
        n_predictions = (last_step_outputs > best_threshold).astype(int)
        sample_submission.loc[mask, 'correct'] = n_predictions[0]

    env.predict(sample_submission[['session_id', 'correct']])

In [None]:
# Print the contents of submission.csv from the working directory for a quick visual check.
! cat submission.csv