In [1]:
import torch
import torch.nn as nn

class EventLSTM(nn.Module):
    """Sequence classifier that fuses an LSTM summary of event ids with tabular features.

    The event-id sequence is embedded and run through a single-layer,
    batch-first LSTM. The final hidden state is concatenated with the
    per-sample feature vector and passed through a two-layer MLP head whose
    scalar output is squashed with a sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each event embedding.
        hidden_dim: LSTM hidden-state width.
        cat_feat_dim: width of the extra feature vector appended to the LSTM state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, cat_feat_dim):
        super().__init__()

        # Attribute names and creation order are kept stable so that saved
        # state dicts (embedding.*, lstm.*, classifier.0.*, classifier.3.*) still load.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)

        head_layers = [
            nn.Linear(hidden_dim + cat_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(128, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, event_seq, cat_feats):
        """Return one sigmoid score per sample.

        Args:
            event_seq: integer tensor [batch_size, seq_len] of event indices.
            cat_feats: float tensor [batch_size, cat_feat_dim] of aggregated
                session features.

        Returns:
            Float tensor of shape [batch_size] with values produced by a sigmoid.
        """
        embedded = self.embedding(event_seq)              # [B, T, embed_dim]
        _, (last_hidden, _) = self.lstm(embedded)         # [1, B, hidden_dim]
        last_hidden = last_hidden.squeeze(dim=0)          # [B, hidden_dim]

        fused = torch.cat((last_hidden, cat_feats), dim=1)  # [B, hidden_dim + cat_feat_dim]
        logits = self.classifier(fused)                   # [B, 1]
        return torch.sigmoid(logits).squeeze(dim=1)       # [B]

In [2]:
# Architecture hyper-parameters; they must match the values the checkpoints
# were trained with, otherwise load_state_dict raises a shape mismatch.
vocab_size = 12      # presumably the distinct event names plus the padding id 0 — TODO confirm against training
embed_dim = 64
hidden_dim = 128
cat_feat_dim = 21    # 5 nunique + 8 mean + 8 std columns produced by feature_engineer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One model per checkpoint file; models[i] is loaded from models/model_{i}.pt.
models = []

for i in range(18):
    model = EventLSTM(vocab_size, embed_dim, hidden_dim, cat_feat_dim).to(device)
    # map_location remaps the stored tensors onto `device`, so checkpoints
    # saved from a GPU can still be loaded on a CPU-only machine.
    state_dict = torch.load(f"models/model_{i}.pt", map_location=device)
    model.load_state_dict(state_dict)
    model.eval()  # inference mode: disables dropout in the classifier head
    models.append(model)

In [3]:
import pandas as pd

# Load data/test.csv into a DataFrame; later cells group its rows by
# session_id and level_group.
test = pd.read_csv("data/test.csv")

In [4]:
# Columns summarised by their number of distinct values per group.
categorical_cols = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# Columns summarised by their mean and standard deviation per group.
numerical_cols = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

In [5]:
def feature_engineer(dataset_df, categorical_cols, numerical_cols):
    """Aggregate event rows into one feature row per (session_id, level_group).

    Args:
        dataset_df: DataFrame containing 'session_id', 'level_group' and every
            column named in the two lists below.
        categorical_cols: columns summarised as ``<col>_nunique``.
        numerical_cols: columns summarised as ``<col>_mean`` and ``<col>_std``.

    Returns:
        DataFrame with columns session_id, level_group, all ``*_nunique``
        columns, then all ``*_mean`` columns, then all ``*_std`` columns.
        Missing aggregates (e.g. the std of a single-row group) are set to -1.
    """
    per_group = dataset_df.groupby(['session_id', 'level_group'])

    # Column order matters downstream (features are fed to the model
    # positionally): nunique block first, then means, then stds.
    pieces = [per_group[col].nunique().rename(f'{col}_nunique') for col in categorical_cols]
    pieces += [per_group[col].mean().rename(f'{col}_mean') for col in numerical_cols]
    pieces += [per_group[col].std().rename(f'{col}_std') for col in numerical_cols]

    feature_table = pd.concat(pieces, axis=1).fillna(-1)

    # Move the (session_id, level_group) index back into ordinary columns.
    return feature_table.reset_index()

In [6]:
# Build the per-(session_id, level_group) aggregate feature table from the test data.
features_df = feature_engineer(test, categorical_cols, numerical_cols)

In [7]:
# Assign ids 1..K to event names in order of first appearance in `test`;
# id 0 is reserved for padding.
# NOTE(review): the vocabulary is derived from the test file itself, so the ids
# only line up with the trained embedding if training produced the same
# name -> id mapping — confirm, or load the mapping saved at training time.
event2id = {name: idx for idx, name in enumerate(test['event_name'].unique(), start=1)}
test['event_id'] = test['event_name'].map(event2id)
event2id["padding"] = 0  # added after mapping so it never affects the ids assigned above

# Collect each (session_id, level_group) group's event ids into one list,
# keeping the row order of `test`.
session_keys = ['session_id', 'level_group']
grouped_sessions = test.groupby(session_keys)['event_id'].apply(list)
df_sessions = grouped_sessions.reset_index(name='events')
MAX_LEN = 200  # fixed sequence length fed to the model


def pad_events(ev_list):
    """Return a copy of ``ev_list`` that is exactly MAX_LEN items long.

    Shorter lists are right-padded with 0; lists of MAX_LEN items or more
    keep only their first MAX_LEN items. The input list is not modified.
    """
    shortfall = MAX_LEN - len(ev_list)
    if shortfall > 0:
        return ev_list + [0] * shortfall
    return ev_list[:MAX_LEN]

# Bring every event list to exactly MAX_LEN entries (right-pad with 0 or truncate).
df_sessions['events'] = df_sessions['events'].apply(pad_events)

In [8]:
# Attach the aggregate features to each padded sequence row, matching on
# (session_id, level_group); the left join keeps every row of df_sessions.
merged_df = df_sessions.merge(features_df, on=['session_id', 'level_group'], how='left')

In [9]:
# Display the merged frame as a visual check of the model inputs.
merged_df

Unnamed: 0,session_id,level_group,events,event_name_nunique,name_nunique,fqid_nunique,room_fqid_nunique,text_fqid_nunique,elapsed_time_mean,level_mean,...,screen_coor_y_mean,hover_duration_mean,elapsed_time_std,level_std,page_std,room_coor_x_std,room_coor_y_std,screen_coor_x_std,screen_coor_y_std,hover_duration_std
0,20090109393214576,0-4,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 3, 4, 3, ...",11,4,24,6,13,117119.8,2.05,...,374.597015,2006.8,66619.89,1.348194,0.57735,440.144543,145.126812,253.078561,109.49921,2341.772128
1,20090109393214576,13-22,"[3, 3, 3, 3, 3, 9, 9, 9, 9, 10, 3, 10, 3, 1, 1...",11,4,52,12,35,6165666.0,17.918089,...,393.667883,973.864865,227751.2,2.279627,0.723316,629.713345,274.696739,234.320165,122.118739,1462.983181
2,20090109393214576,5-12,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",11,4,43,12,23,2092368.0,7.686567,...,363.129707,2301.315789,2124842.0,1.898028,0.758395,378.100318,136.966952,241.109236,118.302246,4092.914793
3,20090312143683264,0-4,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, ...",11,4,27,7,15,142418.3,1.803681,...,413.104575,3066.555556,78257.63,1.221509,0.0,440.053605,177.28106,233.244085,139.175966,5097.845209
4,20090312143683264,13-22,"[3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 10, 3, 8, 8, 8,...",11,6,55,15,38,2195108.0,17.335626,...,415.654303,864.096154,307875.7,2.688642,1.147211,576.561514,256.058964,250.247781,137.83206,1755.180693
5,20090312143683264,5-12,"[3, 3, 3, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...",11,4,56,13,28,791150.4,8.445172,...,391.784787,1379.492958,245688.4,2.303566,0.787752,414.653052,170.659545,244.272072,147.151081,2012.309284
6,20090312331414616,0-4,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3, ...",11,4,23,6,13,112832.8,1.861538,...,374.118644,2176.818182,61004.48,1.15975,0.0,431.739627,205.922771,230.848314,149.588791,3449.709374
7,20090312331414616,13-22,"[3, 3, 3, 3, 10, 9, 9, 10, 3, 1, 1, 1, 1, 1, 1...",11,4,62,13,34,1262481.0,18.332689,...,407.912473,794.542373,193471.9,2.731803,0.670982,575.032012,236.920471,259.182135,132.65316,1057.100513
8,20090312331414616,5-12,"[3, 3, 2, 2, 3, 3, 9, 9, 9, 9, 9, 10, 3, 3, 1,...",11,4,46,11,21,558520.5,8.459119,...,389.124528,1561.096154,136600.2,2.121669,0.753778,357.487622,145.548413,221.97351,129.969153,3148.359406


Здесь я организовал предсказание вероятностей для каждой группы уровней и каждого вопроса, прогоняя данные через соответствующие модели. Агрегированные признаки сессии я стандартизировал с помощью StandardScaler, чтобы улучшить качество и стабильность работы модели; скейлер обучается отдельно для каждой группы уровней прямо на тестовых данных. Затем преобразовал данные в тензоры PyTorch, передал их в модель и собрал результаты в итоговый DataFrame, который сохраняется в файл test_predictions.csv.

In [10]:
from sklearn.preprocessing import StandardScaler
import torch
import numpy as np
import pandas as pd

MAX_LEN = 200  # same padding length used when `events` was built; not read in this cell
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# level_group -> half-open range [first, last) of 1-based question ids scored
# with that group's inputs; question q uses models[q - 1].
# NOTE(review): these boundaries assign question 4 to group '0-4'. Verify that
# this matches the grouping used when the models were trained.
limits = {'0-4':(1,5), '5-12':(5,14), '13-22':(14,19)}

results = []

for grp, grp_df in merged_df.groupby('level_group'):
    q_start, q_stop = limits[grp]

    # The inputs depend only on the level group, not on the question, so they
    # are built once per group instead of once per question.
    X_seq = np.stack(grp_df['events'].values)
    X_cat = grp_df.drop(columns=['session_id', 'level_group', 'events']).values

    # NOTE(review): the scaler is fit on the inference batch itself, so the
    # scaled features (and therefore the predictions) depend on which sessions
    # happen to be in `test`. Presumably the scaler fitted during training
    # should be reused here — confirm.
    scaler = StandardScaler()
    X_cat_scaled = scaler.fit_transform(X_cat)

    X_seq_t = torch.tensor(X_seq, dtype=torch.long).to(device)
    X_cat_t = torch.tensor(X_cat_scaled, dtype=torch.float).to(device)

    for q_idx in range(q_start, q_stop):
        model = models[q_idx - 1]
        model.to(device)
        model.eval()

        with torch.no_grad():
            preds = model(X_seq_t, X_cat_t).cpu().numpy()
            print(preds)

        # One output row per (session, question) pair.
        results.extend(
            {
                'session_id': session_id,
                'question_id': q_idx,
                'predicted_proba': pred
            }
            for session_id, pred in zip(grp_df['session_id'], preds)
        )

predictions_df = pd.DataFrame(results)
predictions_df.to_csv('test_predictions.csv', index=False)


[0.8575721  0.7181936  0.78215337]
[0.9887918  0.9811375  0.99200684]
[0.9844007 0.9344813 0.9680034]
[0.8036552  0.8496782  0.87237644]
[0.78482205 0.4982572  0.8138524 ]
[0.53477657 0.22826487 0.6322701 ]
[0.6881648  0.6079809  0.83400756]
[0.66778755 0.6154985  0.8640562 ]
[0.9842127 0.9182452 0.991627 ]
[0.2912431  0.39895302 0.42967018]
[0.85395086 0.52179676 0.8040518 ]
[0.84996855 0.4981818  0.8285451 ]
[0.66579235 0.5480007  0.6409721 ]
[0.7524271  0.55192626 0.7985863 ]
[0.712781   0.41329616 0.5547555 ]
[0.65740645 0.38261735 0.63570815]
[0.9225214 0.6852124 0.8901003]
[0.31929466 0.16041817 0.39855102]
