Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

# Preprocessing pipeline: load the raw event log, parse timestamps, remove
# telemetry-only event types, drop incomplete rows, and order chronologically.

# 1. Load the dataset from CSV (the file has no header row)
df = pd.read_csv(
    "./user_data_qasim_1.csv",  # path to the event log CSV
    header=None,
    on_bad_lines='skip',        # skip any malformed lines
    encoding="cp1252"           # encoding as used in the original file
)

# Assign column names for clarity
df.columns = ["timestamp", "type", "data"]

# 2. Convert the timestamp column to datetime objects
df['timestamp'] = pd.to_datetime(
    df['timestamp'],
    format='%Y-%m-%d %H:%M:%S',  # e.g., "2025-03-10 23:21:05" (one-second resolution)
    errors='coerce'              # unparseable values become NaT
)

# Drop any rows where timestamp conversion failed (NaT values)
df = df.dropna(subset=['timestamp'])

# 3. Filter out event types that are not needed for prediction
excluded_types = ['memoryUsage', 'tabDuration', 'resourceUsage', 'periodicBrowserStats']
df = df[~df['type'].isin(excluded_types)]

# 4. Drop rows with a missing value in ANY column (dropna() defaults to how='any')
df = df.dropna()

# 5. Sort events chronologically. Timestamps only have one-second resolution,
#    so several events often share the same timestamp; a stable sort keeps
#    those ties in their original log order instead of letting the default
#    (non-stable) quicksort shuffle them and corrupt the event sequences.
df = df.sort_values("timestamp", kind="stable")

# (Optional) Quick check on the first few rows after preprocessing
print("Sample events after preprocessing:")
print(df.head(3))

Sample events after preprocessing:
             timestamp            type  \
16 2025-03-10 23:22:26     tabSwitched   
17 2025-03-10 23:22:26  tabHighlighted   
19 2025-03-10 23:22:27     tabSwitched   

                                                 data  
16  {'type': 'tabSwitched', 'fromTab': None, 'toTa...  
17  {'type': 'tabHighlighted', 'windowId': 8379253...  
19  {'type': 'tabSwitched', 'fromTab': 837925578, ...  


Sequence Preparation for LSTM

In [2]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Turn the cleaned event log into supervised (window -> next event) pairs.

# Extract the sequence of event types from the cleaned DataFrame
events = df['type'].values  # numpy array of event type strings in chronological order

# Encode event types as integer class ids
label_encoder = LabelEncoder()
encoded_events = label_encoder.fit_transform(events)
vocab_size = len(label_encoder.classes_)
print(f"Total unique event types (vocabulary size): {vocab_size}")
print(f"Event type classes: {label_encoder.classes_}")

# Define sequence length (window size for input sequence of events)
seq_length = 10  # chosen based on best performing configuration

# Each training example needs seq_length inputs plus one following target,
# so at least seq_length + 1 events are required. Fail early with a clear
# message instead of handing an empty, wrongly-shaped array to Keras later.
n_sequences = len(encoded_events) - seq_length
if n_sequences <= 0:
    raise ValueError(
        f"Need more than {seq_length} events to build training sequences, "
        f"got {len(encoded_events)}"
    )

# Build all sliding windows at once: row i of window_index holds the positions
# i .. i + seq_length - 1, so sequences[i] is the window starting at event i.
window_index = np.arange(n_sequences)[:, None] + np.arange(seq_length)[None, :]
sequences = encoded_events[window_index]

# The target for window i is the event immediately following it
next_events = encoded_events[seq_length:]
print(f"Number of training sequences: {sequences.shape[0]}")

# One-hot encode the target (next event) classes for training
y = to_categorical(next_events, num_classes=vocab_size)



Total unique event types (vocabulary size): 12
Event type classes: ['tabAttached' 'tabCreated' 'tabDetached' 'tabHighlighted' 'tabRemoved'
 'tabSwitched' 'tabTitleChanged' 'tabUpdated' 'userIdleStateChanged'
 'windowCreated' 'windowFocused' 'windowRemoved']
Number of training sequences: 3225


Vanilla LSTM Model Architecture

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Vanilla LSTM next-event classifier: Embedding -> LSTM -> softmax over event types.

# Define hyperparameters for the model
embedding_dim = 8    # dimension of embedding vectors for events (best found)
lstm_units   = 16    # number of LSTM units (best found)
learning_rate = 0.001
batch_size    = 32   # training batch size

# Build the Sequential model
model = Sequential()
# Embedding layer for event sequences. The `input_length` argument is
# deprecated in Keras 3 (it only triggers a warning and is ignored), so the
# input shape is declared via model.build() below instead.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
# LSTM layer to learn sequence patterns
model.add(LSTM(lstm_units))
# Output layer with softmax activation for multi-class next-event prediction
model.add(Dense(vocab_size, activation='softmax'))

# Build explicitly with (batch, seq_length) inputs so that weights exist and
# model.summary() can report output shapes and parameter counts before training.
model.build(input_shape=(None, seq_length))

# Compile the model with categorical crossentropy loss and Adam optimizer
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Summarize the model architecture
model.summary()



Training


In [4]:
# Fit the network on the windowed event sequences and keep the per-epoch
# metrics in `history`.
epochs = 50  # number of epochs to train (best-performing value)

# Training options gathered in one place for readability.
# validation_split=0.2 reserves a fixed 20% of the samples for validation;
# per the Keras `fit` documentation this is the tail of the arrays, taken
# before shuffling, so it is the same slice in every epoch.
fit_options = dict(
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=2,  # one log line per epoch
)
history = model.fit(sequences, y, **fit_options)

Epoch 1/50
81/81 - 1s - 9ms/step - accuracy: 0.3116 - loss: 2.0733 - val_accuracy: 0.3287 - val_loss: 1.6432
Epoch 2/50
81/81 - 0s - 2ms/step - accuracy: 0.3740 - loss: 1.5157 - val_accuracy: 0.4016 - val_loss: 1.4626
Epoch 3/50
81/81 - 0s - 1ms/step - accuracy: 0.5519 - loss: 1.3911 - val_accuracy: 0.5721 - val_loss: 1.3818
Epoch 4/50
81/81 - 0s - 1ms/step - accuracy: 0.5969 - loss: 1.3093 - val_accuracy: 0.5922 - val_loss: 1.3170
Epoch 5/50
81/81 - 0s - 1ms/step - accuracy: 0.6101 - loss: 1.2374 - val_accuracy: 0.5907 - val_loss: 1.2516
Epoch 6/50
81/81 - 0s - 2ms/step - accuracy: 0.6198 - loss: 1.1858 - val_accuracy: 0.6047 - val_loss: 1.2120
Epoch 7/50
81/81 - 0s - 1ms/step - accuracy: 0.6225 - loss: 1.1548 - val_accuracy: 0.6016 - val_loss: 1.1867
Epoch 8/50
81/81 - 0s - 2ms/step - accuracy: 0.6310 - loss: 1.1291 - val_accuracy: 0.6093 - val_loss: 1.1647
Epoch 9/50
81/81 - 0s - 1ms/step - accuracy: 0.6364 - loss: 1.1054 - val_accuracy: 0.6233 - val_loss: 1.1478
Epoch 10/50
81/81 -

In [5]:
# Save the trained model to an H5 file for later use (e.g., in a Flask app).
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
model_path = "saved_vanilla_lstm_new.h5"
model.save(model_path)
print(f"Model saved as {model_path}")



Model saved as saved_vanilla_lstm.h5
