https://www.kaggle.com/code/dungdore1312/session-info-as-sequence-use-lstm-to-predict

# Predict Student Performance from Game Play
- Session_id 별 특정 시간대에 발생한 이벤트와 클릭 좌표, 게임 설정 등을 통해 해당 문제를 맞추었는지, 틀렸는지를 예측하는 문제
- 해당 커널에서는 Session의 각 행을 Sequential data로 해석해 RNN으로 결과를 예측하고자 했다.

## Read the DataFrame

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [2]:
pwd

'c:\\Users\\JonghyunShin\\OneDrive - 고려대학교\\KUCC\\머신러닝 스터디(3-2)\\Week5'

In [3]:
# Load the dataset
dtypes = {
    'elapsed_time': np.int32,
    'event_name': 'category', 
    'name': 'category',
    'level': 'category',
    'room_coor_x': np.float32,
    'room_coor_y': np.float32,
    'screen_coor_x': np.float32,
    'screen_coor_y': np.float32,
    'hover_duration': np.float32,
    'text': 'category',
    'fqid': 'category',
    'room_fqid': 'category',
    'text_fqid': 'category',
    'fullscreen': 'category',
    'hq': 'category',
    'music': 'category',
    'level_group': 'category'
}

df = pd.read_csv('./predict-student-performance-from-game-play/train.csv', dtype = dtypes)

# Print the first 5 rows
df.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [4]:
df.shape

(26296946, 20)

In [5]:
session_1_df = df[df['session_id'] == 20090312431273200]
session_1_df

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876,20090312431273200,927,1267357,navigate_click,undefined,22,,927.307251,-10.355928,838.0,335.0,,,tomap,tunic.historicalsociety.entry,,0,0,1,13-22
877,20090312431273200,928,1268292,map_hover,basic,22,,,,,,366.0,,tomap,tunic.historicalsociety.entry,,0,0,1,13-22
878,20090312431273200,929,1269474,map_click,undefined,22,,457.523010,22.141338,443.0,316.0,,,tunic.capitol_2,tunic.historicalsociety.entry,,0,0,1,13-22
879,20090312431273200,930,1270708,navigate_click,undefined,22,,224.190323,-60.268669,404.0,337.0,,,chap4_finale_c,tunic.capitol_2.hall,,0,0,1,13-22


- 해당 session_id 는 881개의 action을 가지고, 우리는 이를 881개의 단어로 이루어진 문서로 바라본다.

## Data Preprocessing

In [6]:
df.set_index(['session_id', 'index'], inplace = True)

In [7]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
session_id,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [8]:
df = df[['event_name', 'name','level', 'room_coor_x' ,'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']] #적절한 column 선택
for col in ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']:
    df[col] = (df[col] - df[col].min()) / (df[col].max()-df[col].min()) #scaling numeric data
    df[col] = df[col].fillna(0)

### Custom OneHotEncoding 구현 
pd.get_dummies를 이용해 one-hot encoding을 구현한다.
1. 데이터의 규모가 큰데, sklearn의 OneHotEncoder은 속도가 느리다
2. 해당 데이터셋의 unique categorical value의 수에 맞춰 encoding을 시행하여 메모리 낭비를 줄일 수 있다. 

In [9]:
import sklearn
# custom one-hot encoding
class GetDummies(sklearn.base.TransformerMixin):

    def __init__(self, dtypes = None):
        self.input_columns = None
        self.final_columns = None
        if dtypes is None:
            dtypes = [object, 'category']
        self.dtypes = dtypes

    def fit(self, X, y=None, **kwargs):
        self.input_columns = list(X.select_dtypes(self.dtypes).columns)
        X = pd.get_dummies(X, columns = self.input_columns)
        self.final_columns = X.columns
        return self

    def transform(self, X, y = None, **kwargs):
        X = pd.get_dummies(X, columns = self.input_columns)
        X_columns = X.columns
        missing = set(self.final_columns) - set(X_columns)
        for c in missing:
            X[c] = 0

        return X[self.final_columns]

    def get_feature_names(self):
        return tuple(self.final_columns)


In [10]:
get_dummies = GetDummies()
df = get_dummies.fit_transform(df)
df.shape

(26296946, 45)

In [11]:
grouped_data = df.groupby('session_id').apply(lambda x:np.array(x))
grouped_data

session_id
20090312431273200    [[0.4850341, 0.5191262, 0.1980198, 0.34305555,...
20090312433251036    [[0.49087286, 0.6860461, 0.20792079, 0.1736111...
20090312455206810    [[0.37456927, 0.5028902, 0.014590933, 0.490277...
20090313091715820    [[0.55998695, 0.48434407, 0.37832204, 0.440277...
20090313571836404    [[0.50470144, 0.62516135, 0.23137051, 0.235416...
                                           ...                        
22100215342220508    [[0.5649326, 0.46508244, 0.33350703, 0.3979166...
22100215460321130    [[0.48160797, 0.7014456, 0.20531526, 0.16875, ...
22100217104993650    [[0.4819611, 0.6579981, 0.19280875, 0.20208333...
22100219442786200    [[0.48564872, 0.5033919, 0.199062, 0.35902777,...
22100221145014656    [[0.42572483, 0.657314, 0.09744658, 0.20277777...
Length: 23562, dtype: object

## Convert to Pytorch Dataloader

In [12]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return torch.from_numpy(self.data[idx]).float()

In [13]:
def collate_fn_padd(batch): # 각 sequence length는 다르므로, batch에서 가장 긴길이를 가진 seq에 맞춘다.
    lengths = [t.shape[0] for t in batch]
    try:
        n_features = batch[0].shape[1] # 각 batch의 두 번째 차원이 없으면 1로 만들어줌
    except:
        n_features = 1
    max_length = max(lengths)
    if max_length == 0:
        max_length+=1
    batch_size = len(lengths)

    padded_tensor = torch.zeros(batch_size, max_length, n_features, dtype = torch.float32)
    for i, val in enumerate(batch):
        l = lengths[i]
        if n_features == 1:
            padded_tensor[i, :l] = val.reshape(-1, 1)
        else:
            padded_tensor[i, :l] = val
            
    return padded_tensor


In [14]:
dataset = MyDataset(grouped_data.values)

dataloader = DataLoader(dataset, batch_size = 32, shuffle = True, collate_fn = collate_fn_padd)

## Processing the labels

In [15]:
label_df = pd.read_csv('./predict-student-performance-from-game-play/train_labels.csv')
# label_df의 session_id column의 데이터는 'session_id'_'question_idx' 로 구성됨
label_df['session'] = label_df.session_id.apply(lambda x: int(x.split('_')[0]))
label_df['question_idx'] = label_df.session_id.apply(lambda x:int(x.split('_')[-1][1:]))
label_df.drop('session_id', axis = 1, inplace = True)

pivoted_questions = label_df.pivot(columns = 'question_idx', values = 'correct', index = 'session')
pivoted_questions['total_score'] = pivoted_questions.iloc[:, 0:18].sum(axis = 1)
pivoted_questions.columns = [f'q_{i}' for i in range(1, 19)] + ['total_score']
pivoted_questions

Unnamed: 0_level_0,q_1,q_2,q_3,q_4,q_5,q_6,q_7,q_8,q_9,q_10,q_11,q_12,q_13,q_14,q_15,q_16,q_17,q_18,total_score
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20090312431273200,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,16
20090312433251036,0,1,1,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1,10
20090312455206810,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,17
20090313091715820,0,1,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,1,12
20090313571836404,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100215342220508,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,16
22100215460321130,0,1,1,1,0,1,1,0,1,0,1,1,0,1,0,1,1,1,12
22100217104993650,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,15
22100219442786200,0,1,1,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,13


# LSTM Modeling

In [16]:
class StackedLSTM(nn.Module):
    def __init__(self, n_layers, n_hidden, n_features, n_embeddings):
        super(StackedLSTM, self).__init__()
        self.embedding = nn.Linear(n_features, n_embeddings)
        self.lstm = nn.LSTM(n_embeddings, n_hidden, n_layers, batch_first=True)
        self.linear = nn.Linear(n_hidden, 18)
        
    def forward(self, x):

        embed_out = self.embedding(x)
        lstm_out, _ = self.lstm(embed_out)
        out = lstm_out[:, -1, :]
        out = self.linear(out)

        out = torch.sigmoid(out)
        
        return out

n_layers = 3  
n_hidden = 16  
n_embeddings = 16 
n_features = 45 

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = StackedLSTM(n_layers, n_hidden, n_features, n_embeddings).to(device)

In [17]:
from tqdm import tqdm
n_out = 18
batch_size = 32
n_epochs = 3
n_samples = len(grouped_data)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

model.train()
for epoch in range(n_epochs):
    for i, sample in tqdm(enumerate(dataloader)):
        model.zero_grad()
        labels = torch.from_numpy(pivoted_questions.iloc[i*batch_size:(i+1)*batch_size, :18].values).float()
        sample = sample.to(device)
        labels = labels.to(device)

        outputs = model(sample)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        sample = sample.to('cpu')
        labels = labels.to('cpu')
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}')


737it [02:15,  5.46it/s]


Epoch 1/3, Loss: 0.5185


737it [02:09,  5.69it/s]


Epoch 2/3, Loss: 0.5189


737it [02:05,  5.88it/s]

Epoch 3/3, Loss: 0.5191





In [18]:
dataloader = DataLoader(dataset, batch_size = 32, shuffle = False, collate_fn = collate_fn_padd)

In [19]:
model = StackedLSTM(n_layers, n_hidden, n_features, n_embeddings).to(device)
n_out = 18
batch_size = 32
n_epochs = 10
n_samples = len(grouped_data)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

model.train()
for epoch in range(n_epochs):
    for i, sample in tqdm(enumerate(dataloader)):
        model.zero_grad()
        labels = torch.from_numpy(pivoted_questions.iloc[i*batch_size:(i+1)*batch_size, :18].values).float()
        sample = sample.to(device)
        labels = labels.to(device)

        outputs = model(sample)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        sample = sample.to('cpu')
        labels = labels.to('cpu')
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}')

737it [02:04,  5.91it/s]


Epoch 1/10, Loss: 0.4753


737it [02:05,  5.87it/s]


Epoch 2/10, Loss: 0.5182


737it [02:06,  5.84it/s]


Epoch 3/10, Loss: 0.4937


737it [02:05,  5.87it/s]


Epoch 4/10, Loss: 0.4680


737it [02:06,  5.82it/s]


Epoch 5/10, Loss: 0.4682


737it [02:04,  5.90it/s]


Epoch 6/10, Loss: 0.4688


737it [01:59,  6.15it/s]


Epoch 7/10, Loss: 0.5196


737it [02:00,  6.12it/s]


Epoch 8/10, Loss: 0.4711


737it [02:00,  6.11it/s]


Epoch 9/10, Loss: 0.5187


737it [02:02,  6.04it/s]

Epoch 10/10, Loss: 0.4697



