In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# ——— Data Loading & Filtering ———
df = pd.read_csv(
    "./user_data_qasim_1.csv",
    header=None,
    on_bad_lines='skip',
    encoding='cp1252'
)
df.columns = ["timestamp", "type", "data"]
df['timestamp'] = pd.to_datetime(
    df['timestamp'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)
df = df.dropna(subset=['timestamp'])
df = df[
    (df['type'] != 'memoryUsage') &
    (df['type'] != 'tabDuration') &
    (df['type'] != 'resourceUsage') &
    (df['type'] != 'periodicBrowserStats')
]
df = df.dropna()
df.sort_values("timestamp", inplace=True)

# ——— Compute inter-event deltas ———
df['delta'] = df['timestamp'].diff().dt.total_seconds().fillna(0)

# ——— Encode and scale ———
le = LabelEncoder()
df['event_id'] = le.fit_transform(df['type'])
scaler = MinMaxScaler()
df['delta_scaled'] = scaler.fit_transform(df[['delta']])

# ——— Build sequences for Attention-LSTM ———
LOOKBACK = 15
events = df['event_id'].values
deltas = df['delta_scaled'].values

X_events, X_deltas, y = [], [], []
for i in range(len(events) - LOOKBACK):
    X_events.append(events[i:i+LOOKBACK])
    X_deltas.append(deltas[i:i+LOOKBACK].reshape(-1, 1))
    y.append(events[i+LOOKBACK])
X_events = np.array(X_events)
X_deltas = np.array(X_deltas)
y = np.array(y)

num_classes = len(le.classes_)
print(X_deltas)
print(X_events)
print(y)

df.head()

[[[0.00000000e+00]
  [0.00000000e+00]
  [5.40657439e-05]
  ...
  [7.56920415e-04]
  [0.00000000e+00]
  [1.08131488e-04]]

 [[0.00000000e+00]
  [5.40657439e-05]
  [0.00000000e+00]
  ...
  [0.00000000e+00]
  [1.08131488e-04]
  [0.00000000e+00]]

 [[5.40657439e-05]
  [0.00000000e+00]
  [1.62197232e-03]
  ...
  [1.08131488e-04]
  [0.00000000e+00]
  [3.29801038e-03]]

 ...

 [[1.18944637e-03]
  [7.02854671e-04]
  [4.86591696e-04]
  ...
  [3.29801038e-03]
  [1.38408304e-02]
  [2.70328720e-04]]

 [[7.02854671e-04]
  [4.86591696e-04]
  [0.00000000e+00]
  ...
  [1.38408304e-02]
  [2.70328720e-04]
  [9.73183391e-04]]

 [[4.86591696e-04]
  [0.00000000e+00]
  [0.00000000e+00]
  ...
  [2.70328720e-04]
  [9.73183391e-04]
  [0.00000000e+00]]]
[[ 5  3  5 ...  4  3  3]
 [ 3  5  3 ...  3  3  5]
 [ 5  3  1 ...  3  5  5]
 ...
 [10 10  6 ...  8  8 10]
 [10  6  7 ...  8 10  3]
 [ 6  7  3 ... 10  3  5]]
[ 5  5  3 ...  3  5 10]


Unnamed: 0,timestamp,type,data,delta,event_id,delta_scaled
16,2025-03-10 23:22:26,tabSwitched,"{'type': 'tabSwitched', 'fromTab': None, 'toTa...",0.0,5,0.0
17,2025-03-10 23:22:26,tabHighlighted,"{'type': 'tabHighlighted', 'windowId': 8379253...",0.0,3,0.0
19,2025-03-10 23:22:27,tabSwitched,"{'type': 'tabSwitched', 'fromTab': 837925578, ...",1.0,5,5.4e-05
20,2025-03-10 23:22:27,tabHighlighted,"{'type': 'tabHighlighted', 'windowId': 8379253...",0.0,3,0.0
28,2025-03-10 23:22:57,tabCreated,"{'type': 'tabCreated', 'tabId': 837925613, 'ur...",30.0,1,0.001622


In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense,
    Concatenate, Softmax, Multiply, Lambda
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class SumOverTime(Layer):
    def call(self, inputs):
        # inputs: (batch, seq_len, hidden)
        return K.sum(inputs, axis=1)

def build_attn_lstm_model(seq_len, vocab_size, embed_dim, lstm_units, learning_rate):
    # 1) Inputs
    events_in = Input(shape=(seq_len,), name='events')
    deltas_in = Input(shape=(seq_len, 1), name='deltas')

    # 2) Embedding + concat
    x = Embedding(vocab_size, embed_dim, name='embed')(events_in)
    x = Concatenate(name='concat')([x, deltas_in])

    # 3) LSTM returns full sequence
    x = LSTM(lstm_units, return_sequences=True, name='lstm')(x)

    # 4) Bahdanau-style attention scores
    scores = Dense(1, activation='tanh', name='attn_score')(x)    # (batch, seq_len, 1)
    weights = Softmax(axis=1, name='attn_weights')(scores)        # (batch, seq_len, 1)

    # 5) Context vector: weighted sum of all h_t
    context = Multiply(name='attn_mul')([x, weights])             # (batch, seq_len, lstm_units)

    # — wrap the sum in a Keras layer —
    # context = Lambda(lambda t: K.sum(t, axis=1), name='context_sum')(context)  # (batch, lstm_units)

    # Sum over the time dimension (axis=1) → shape (batch, lstm_units)
    # new
    context = SumOverTime(name='context_sum')(context)

    # 6) Final prediction
    out = Dense(vocab_size, activation='softmax', name='output')(context)

    model = Model([events_in, deltas_in], out)
    model.compile(
        optimizer=Adam(learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model



In [3]:
import tensorflow.keras.backend as K

param_grid = {
    'embed_dim':     [8, 16],
    'lstm_units':    [16, 32],
    'learning_rate': [1e-3, 5e-4],
    'batch_size':    [32, 64],
    'epochs':        [10, 20]
}

best_loss = np.inf
best_params = None

for embed_dim in param_grid['embed_dim']:
    for lstm_units in param_grid['lstm_units']:
        for lr in param_grid['learning_rate']:
            for batch_size in param_grid['batch_size']:
                for epochs in param_grid['epochs']:
                    K.clear_session()
                    model = build_attn_lstm_model(
                        LOOKBACK, num_classes,
                        embed_dim, lstm_units, lr
                    )
                    history = model.fit(
                        [X_events, X_deltas], y,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.2,
                        verbose=0
                    )
                    val_loss = history.history['val_loss'][-1]
                    val_acc  = history.history['val_accuracy'][-1]
                    print(
                        f"embed={embed_dim}, units={lstm_units}, "
                        f"lr={lr}, bs={batch_size}, ep={epochs} "
                        f"→ val_loss={val_loss:.4f}, val_acc={val_acc:.4f}"
                    )
                    if val_loss < best_loss:
                        best_loss = val_loss
                        best_params = (embed_dim, lstm_units, lr, batch_size, epochs)

print("\nBest configuration:", best_params, "with val_loss=", best_loss)

embed=8, units=16, lr=0.001, bs=32, ep=10 → val_loss=1.4513, val_acc=0.3540
embed=8, units=16, lr=0.001, bs=32, ep=20 → val_loss=1.4083, val_acc=0.3975
embed=8, units=16, lr=0.001, bs=64, ep=10 → val_loss=1.5003, val_acc=0.3463
embed=8, units=16, lr=0.001, bs=64, ep=20 → val_loss=1.4449, val_acc=0.3804
embed=8, units=16, lr=0.0005, bs=32, ep=10 → val_loss=1.4997, val_acc=0.3385
embed=8, units=16, lr=0.0005, bs=32, ep=20 → val_loss=1.4483, val_acc=0.3680
embed=8, units=16, lr=0.0005, bs=64, ep=10 → val_loss=1.5328, val_acc=0.3339
embed=8, units=16, lr=0.0005, bs=64, ep=20 → val_loss=1.4834, val_acc=0.3587
embed=8, units=32, lr=0.001, bs=32, ep=10 → val_loss=1.4007, val_acc=0.4410
embed=8, units=32, lr=0.001, bs=32, ep=20 → val_loss=1.3027, val_acc=0.5839
embed=8, units=32, lr=0.001, bs=64, ep=10 → val_loss=1.4921, val_acc=0.3929
embed=8, units=32, lr=0.001, bs=64, ep=20 → val_loss=1.3934, val_acc=0.4037
embed=8, units=32, lr=0.0005, bs=32, ep=10 → val_loss=1.4945, val_acc=0.3509
embed=8

In [4]:
import pickle

# Unpack best hyperparameters
embed_dim, lstm_units, lr, batch_size, epochs = best_params

# Retrain on full dataset
K.clear_session()
best_model = build_attn_lstm_model(
    seq_len=LOOKBACK,
    vocab_size=num_classes,
    embed_dim=embed_dim,
    lstm_units=lstm_units,
    learning_rate=lr
)
best_model.fit(
    [X_events, X_deltas], y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)
# After retraining best_model in the notebook:
best_model.save('attn_lstm_full_model.h5')
print("Saved full Attention-LSTM as HDF5 to attn_lstm_full_model.h5")


# Bundle and pickle
model_bundle = {
    'config':  best_model.get_config(),
    'weights': best_model.get_weights(),
    'classes': le.classes_.tolist()
}

# with open('attn_lstm_best_model.pkl', 'wb') as f:
#     pickle.dump(model_bundle, f)

# print("Saved Attention-Augmented LSTM model to attn_lstm_best_model.pkl")

Epoch 1/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3058 - loss: 2.0929
Epoch 2/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3402 - loss: 1.5200
Epoch 3/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3481 - loss: 1.4949
Epoch 4/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3728 - loss: 1.4188
Epoch 5/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3708 - loss: 1.4003
Epoch 6/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3944 - loss: 1.3810
Epoch 7/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3958 - loss: 1.3440
Epoch 8/20
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4403 - loss: 1.3425
Epoch 9/20
[1m101/101[0m [32m━━━━━━━━



Saved full Attention-LSTM as HDF5 to attn_lstm_full_model.h5


In [5]:
# --- Sample extraction ---
idx = 0

# Raw inputs
sample_events = X_events[idx]           # (LOOKBACK,)
sample_deltas = X_deltas[idx].flatten() # (LOOKBACK,)

# Human-readable past sequence
event_names = le.inverse_transform(sample_events)
print("▶ Past events sequence:")
for i, (e, dt) in enumerate(zip(event_names, sample_deltas)):
    print(f"  t-{LOOKBACK-i:2}: event='{e}', Δt={dt:.3f}")

# Actual next event
actual_event = le.inverse_transform([y[idx]])[0]
print(f"\n▶ Actual next event: {actual_event}")

# # --- T-LSTM Prediction ---
# probs_t = best_model.predict(
#     [sample_events.reshape(1,-1), sample_deltas.reshape(1,LOOKBACK,1)]
# )[0]
# top3_t = probs_t.argsort()[-3:][::-1]
# print("\n▶ T-LSTM top-3 predictions:")
# for i in top3_t:
#     ev, p = le.inverse_transform([i])[0], probs_t[i]
#     print(f"  {ev:<20}  p={p:.3f}")

# --- Attention-LSTM Prediction ---
probs_a = best_model.predict(
    [sample_events.reshape(1,-1), sample_deltas.reshape(1,LOOKBACK,1)]
)[0]
top3_a = probs_a.argsort()[-3:][::-1]
print("\n▶ Attn-LSTM top-3 predictions:")
for i in top3_a:
    ev, p = le.inverse_transform([i])[0], probs_a[i]
    print(f"  {ev:<20}  p={p:.3f}")

# --- Attention Weights ---
from tensorflow.keras.models import Model

attn_extractor = Model(
    inputs=best_model.input,
    outputs=best_model.get_layer('attn_weights').output
)
attn_out = attn_extractor.predict(
    [sample_events.reshape(1,-1), sample_deltas.reshape(1,LOOKBACK,1)]
)[0].reshape(-1)

print("\n▶ Attention weights over the past 15 steps:")
for i, w in enumerate(attn_out):
    print(f"  t-{LOOKBACK-i:2}: w={w:.3f}")

▶ Past events sequence:
  t-15: event='tabSwitched', Δt=0.000
  t-14: event='tabHighlighted', Δt=0.000
  t-13: event='tabSwitched', Δt=0.000
  t-12: event='tabHighlighted', Δt=0.000
  t-11: event='tabCreated', Δt=0.002
  t-10: event='tabSwitched', Δt=0.000
  t- 9: event='windowFocused', Δt=0.000
  t- 8: event='tabHighlighted', Δt=0.000
  t- 7: event='tabUpdated', Δt=0.000
  t- 6: event='tabTitleChanged', Δt=0.000
  t- 5: event='tabSwitched', Δt=0.000
  t- 4: event='tabHighlighted', Δt=0.000
  t- 3: event='tabRemoved', Δt=0.001
  t- 2: event='tabHighlighted', Δt=0.000
  t- 1: event='tabHighlighted', Δt=0.000

▶ Actual next event: tabSwitched
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step

▶ Attn-LSTM top-3 predictions:
  tabSwitched           p=0.713
  tabUpdated            p=0.090
  tabTitleChanged       p=0.060
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step

▶ Attention weights over the past 15 steps:
  t-15: w=0.047
  t-14: w=0.043
  