In [1]:
import pandas as pd

# Read CSV
df = pd.read_csv(
    "./user_data_qasim_1.csv",
    header=None,
    on_bad_lines='skip',        # Skip malformed lines if needed
    encoding="cp1252"           # Adjust encoding as necessary
)

# Assign column names
df.columns = ["timestamp", "type", "data"]

# 1) Convert the timestamp column to datetime
#    The format argument below matches "2025-03-10 23:21:05"
df['timestamp'] = pd.to_datetime(df['timestamp'], 
                                 format='%Y-%m-%d %H:%M:%S', 
                                 errors='coerce')

# 2) Drop any rows where timestamp conversion failed (NaT)
df = df.dropna(subset=['timestamp'])

# 3) Filter out unwanted event types
df = df[
    (df['type'] != 'memoryUsage') & 
    (df['type'] != 'tabDuration') &
    (df['type'] != 'resourceUsage') &
    (df['type'] != 'periodicBrowserStats')
]

# 4) Drop any rows entirely filled with NaN (if any remain)
df = df.dropna()

# 5) Sort the DataFrame by timestamp to maintain chronological order
df.sort_values("timestamp", inplace=True)

df.head()


print(f"Total number of records after processing: {len(df)}")
#Print the entire DataFrame
print(df.to_string(index=False))


Total number of records after processing: 3235
          timestamp                 type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


# Extract the sequence of event types (these are our states)
events = df["type"].tolist()
# print(events)
# events = ['tabCreated','tabUpdated','tabSwitched','tabRemoved']
# Encode event types as integers using LabelEncoder
label_encoder = LabelEncoder()
encoded_events = label_encoder.fit_transform(events)
vocab_size = len(label_encoder.classes_)
print("Vocabulary:", label_encoder.classes_)

# --------------------------------------
# Create Training Sequences for the RNN
# --------------------------------------

# Define the sequence length: number of past events to use for predicting the next event
seq_length = 5

# Build sequences and corresponding next events
sequences = []
next_events = []
for i in range(len(encoded_events) - seq_length):
    sequences.append(encoded_events[i:i+seq_length])
    next_events.append(encoded_events[i+seq_length])
sequences = np.array(sequences)
next_events = np.array(next_events)

# One-hot encode the target events
y = to_categorical(next_events, num_classes=vocab_size)

# --------------------------
# Build and Train the RNN
# --------------------------

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=seq_length))
model.add(LSTM(16))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(sequences, y, epochs=50, verbose=2)

# --------------------------
# Temperature-Based Sampling
# --------------------------

def sample_with_temperature(preds, temperature=1.0):
    """
    Samples an index from a probability array after applying temperature scaling.
    
    Args:
        preds (np.array): The softmax output predictions.
        temperature (float): Temperature value to adjust randomness.
        
    Returns:
        int: The sampled index.
    """
    preds = np.asarray(preds).astype('float64')
    # Add a small value to avoid log(0)
    epsilon = 1e-8
    preds = np.log(preds + epsilon) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)

# --------------------------
# Make a Prediction with Temperature Scaling
# --------------------------

# Use the last 'seq_length' events to predict the next event
last_sequence = encoded_events[-seq_length:]
last_sequence = np.array(last_sequence).reshape(1, seq_length)
pred = model.predict(last_sequence)[0]

# Adjust temperature (e.g., 1.5 for more variation; lower for more deterministic predictions)
temperature = 1
predicted_index = sample_with_temperature(pred, temperature)
predicted_event = label_encoder.inverse_transform([predicted_index])
print("Predicted next event with temperature:", predicted_event[0])



Vocabulary: ['tabAttached' 'tabCreated' 'tabDetached' 'tabHighlighted' 'tabRemoved'
 'tabSwitched' 'tabTitleChanged' 'tabUpdated' 'userIdleStateChanged'
 'windowCreated' 'windowFocused' 'windowRemoved']
Epoch 1/50




101/101 - 1s - 7ms/step - accuracy: 0.3266 - loss: 1.9833
Epoch 2/50
101/101 - 0s - 986us/step - accuracy: 0.4084 - loss: 1.5358
Epoch 3/50
101/101 - 0s - 962us/step - accuracy: 0.5291 - loss: 1.4048
Epoch 4/50
101/101 - 0s - 970us/step - accuracy: 0.5755 - loss: 1.2847
Epoch 5/50
101/101 - 0s - 953us/step - accuracy: 0.5892 - loss: 1.2170
Epoch 6/50
101/101 - 0s - 946us/step - accuracy: 0.6015 - loss: 1.1771
Epoch 7/50
101/101 - 0s - 932us/step - accuracy: 0.6130 - loss: 1.1455
Epoch 8/50
101/101 - 0s - 934us/step - accuracy: 0.6257 - loss: 1.1186
Epoch 9/50
101/101 - 0s - 1ms/step - accuracy: 0.6334 - loss: 1.0974
Epoch 10/50
101/101 - 0s - 1ms/step - accuracy: 0.6406 - loss: 1.0851
Epoch 11/50
101/101 - 0s - 933us/step - accuracy: 0.6449 - loss: 1.0703
Epoch 12/50
101/101 - 0s - 926us/step - accuracy: 0.6458 - loss: 1.0635
Epoch 13/50
101/101 - 0s - 908us/step - accuracy: 0.6477 - loss: 1.0546
Epoch 14/50
101/101 - 0s - 911us/step - accuracy: 0.6483 - loss: 1.0454
Epoch 15/50
101/10

In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K

# Extract the sequence of event types (states)
events = df["type"].tolist()

# Encode event types as integers
label_encoder = LabelEncoder()
encoded_events = label_encoder.fit_transform(events)
vocab_size = len(label_encoder.classes_)
print("Vocabulary:", label_encoder.classes_)

# -------------------------------
# Define the hyperparameter grid
# -------------------------------
seq_length_options = [1,2,3,4, 5, 7,10,15,25]      # Different sequence lengths to try
epochs_options = [10,20, 50, 100,500]      # Different numbers of training epochs
temperature_options = [0.01,0.1,0.5,1.0, 1.5,2,5,10]  # Different temperatures for sampling

results = []

# Temperature-based sampling function (unchanged)
def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    epsilon = 1e-8  # Avoid log(0)
    preds = np.log(preds + epsilon) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)

# -------------------------------
# Loop over hyperparameter combinations
# -------------------------------
for seq_length in seq_length_options:
    # Re-create sequences for current sequence length
    sequences = []
    next_events = []
    for i in range(len(encoded_events) - seq_length):
        sequences.append(encoded_events[i:i + seq_length])
        next_events.append(encoded_events[i + seq_length])
    sequences = np.array(sequences)
    next_events = np.array(next_events)
    
    # One-hot encode target events
    y = to_categorical(next_events, num_classes=vocab_size)
    
    for epochs in epochs_options:
        for temperature in temperature_options:
            # Build the model
            model = Sequential()
            model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=seq_length))
            model.add(LSTM(16))
            model.add(Dense(vocab_size, activation='softmax'))
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            
            # Train the model
            history = model.fit(sequences, y, epochs=epochs, verbose=0)
            final_loss = history.history['loss'][-1]
            final_accuracy = history.history['accuracy'][-1]
            
            # Predict the next event using the last seq_length events
            last_sequence = encoded_events[-seq_length:]
            last_sequence = np.array(last_sequence).reshape(1, seq_length)
            pred = model.predict(last_sequence)[0]
            predicted_index = sample_with_temperature(pred, temperature)
            predicted_event = label_encoder.inverse_transform([predicted_index])[0]
            
            # Record the results for this configuration
            result = {
                "seq_length": seq_length,
                "epochs": epochs,
                "temperature": temperature,
                "loss": final_loss,
                "accuracy": final_accuracy,
                "predicted_event": predicted_event
            }
            results.append(result)
            print(f"Seq_len: {seq_length}, Epochs: {epochs}, Temperature: {temperature} => Loss: {final_loss:.4f}, Accuracy: {final_accuracy:.4f}, Predicted: {predicted_event}")
            
            # Clear the Keras session to free up memory for the next run
            K.clear_session()

# Optionally, determine the best configuration based on lowest loss
best_config = min(results, key=lambda x: x["loss"])
print("\nBest configuration (lowest loss):")
print(best_config)


Vocabulary: ['tabAttached' 'tabCreated' 'tabDetached' 'tabHighlighted' 'tabRemoved'
 'tabSwitched' 'tabTitleChanged' 'tabUpdated' 'userIdleStateChanged'
 'windowCreated' 'windowFocused' 'windowRemoved']




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Seq_len: 1, Epochs: 10, Temperature: 0.01 => Loss: 1.1207, Accuracy: 0.6571, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Seq_len: 1, Epochs: 10, Temperature: 0.1 => Loss: 1.1224, Accuracy: 0.6404, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Seq_len: 1, Epochs: 10, Temperature: 0.5 => Loss: 1.1212, Accuracy: 0.6571, Predicted: userIdleStateChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Seq_len: 1, Epochs: 10, Temperature: 1.0 => Loss: 1.1361, Accuracy: 0.6404, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 10, Temperature: 1.5 => Loss: 1.1205, Accuracy: 0.6571, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Seq_len: 1, Epochs: 10, Temperature: 2 => Loss: 1.1193, Accuracy: 0.6568, Predicted: tabRemoved




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Seq_len: 1, Epochs: 10, Temperature: 5 => Loss: 1.1307, Accuracy: 0.6404, Predicted: userIdleStateChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 10, Temperature: 10 => Loss: 1.1286, Accuracy: 0.6574, Predicted: tabCreated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 1, Epochs: 20, Temperature: 0.01 => Loss: 1.1025, Accuracy: 0.6571, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Seq_len: 1, Epochs: 20, Temperature: 0.1 => Loss: 1.1020, Accuracy: 0.6571, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Seq_len: 1, Epochs: 20, Temperature: 0.5 => Loss: 1.1035, Accuracy: 0.6571, Predicted: tabUpdated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Seq_len: 1, Epochs: 20, Temperature: 1.0 => Loss: 1.1017, Accuracy: 0.6571, Predicted: windowFocused




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Seq_len: 1, Epochs: 20, Temperature: 1.5 => Loss: 1.1038, Accuracy: 0.6571, Predicted: userIdleStateChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Seq_len: 1, Epochs: 20, Temperature: 2 => Loss: 1.1030, Accuracy: 0.6571, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 20, Temperature: 5 => Loss: 1.1022, Accuracy: 0.6571, Predicted: tabTitleChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 20, Temperature: 10 => Loss: 1.1151, Accuracy: 0.6571, Predicted: tabUpdated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Seq_len: 1, Epochs: 50, Temperature: 0.01 => Loss: 1.0958, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Seq_len: 1, Epochs: 50, Temperature: 0.1 => Loss: 1.0957, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Seq_len: 1, Epochs: 50, Temperature: 0.5 => Loss: 1.0949, Accuracy: 0.6571, Predicted: tabRemoved




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Seq_len: 1, Epochs: 50, Temperature: 1.0 => Loss: 1.0946, Accuracy: 0.6574, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Seq_len: 1, Epochs: 50, Temperature: 1.5 => Loss: 1.0966, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Seq_len: 1, Epochs: 50, Temperature: 2 => Loss: 1.0959, Accuracy: 0.6574, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 1, Epochs: 50, Temperature: 5 => Loss: 1.0951, Accuracy: 0.6574, Predicted: windowFocused




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Seq_len: 1, Epochs: 50, Temperature: 10 => Loss: 1.0953, Accuracy: 0.6571, Predicted: windowCreated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 1, Epochs: 100, Temperature: 0.01 => Loss: 1.0920, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Seq_len: 1, Epochs: 100, Temperature: 0.1 => Loss: 1.0924, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Seq_len: 1, Epochs: 100, Temperature: 0.5 => Loss: 1.0919, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 1, Epochs: 100, Temperature: 1.0 => Loss: 1.0931, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
Seq_len: 1, Epochs: 100, Temperature: 1.5 => Loss: 1.0926, Accuracy: 0.6574, Predicted: tabRemoved




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Seq_len: 1, Epochs: 100, Temperature: 2 => Loss: 1.0920, Accuracy: 0.6574, Predicted: tabTitleChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 100, Temperature: 5 => Loss: 1.0931, Accuracy: 0.6574, Predicted: tabUpdated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Seq_len: 1, Epochs: 100, Temperature: 10 => Loss: 1.0927, Accuracy: 0.6574, Predicted: tabTitleChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 500, Temperature: 0.01 => Loss: 1.0875, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 1, Epochs: 500, Temperature: 0.1 => Loss: 1.0877, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 1, Epochs: 500, Temperature: 0.5 => Loss: 1.0874, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 1, Epochs: 500, Temperature: 1.0 => Loss: 1.0876, Accuracy: 0.6568, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Seq_len: 1, Epochs: 500, Temperature: 1.5 => Loss: 1.0875, Accuracy: 0.6571, Predicted: userIdleStateChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Seq_len: 1, Epochs: 500, Temperature: 2 => Loss: 1.0874, Accuracy: 0.6571, Predicted: windowFocused




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Seq_len: 1, Epochs: 500, Temperature: 5 => Loss: 1.0872, Accuracy: 0.6574, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Seq_len: 1, Epochs: 500, Temperature: 10 => Loss: 1.0880, Accuracy: 0.6574, Predicted: tabRemoved




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 2, Epochs: 10, Temperature: 0.01 => Loss: 1.1333, Accuracy: 0.6384, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Seq_len: 2, Epochs: 10, Temperature: 0.1 => Loss: 1.1203, Accuracy: 0.6465, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
Seq_len: 2, Epochs: 10, Temperature: 0.5 => Loss: 1.1195, Accuracy: 0.6499, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Seq_len: 2, Epochs: 10, Temperature: 1.0 => Loss: 1.1272, Accuracy: 0.6458, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Seq_len: 2, Epochs: 10, Temperature: 1.5 => Loss: 1.1290, Accuracy: 0.6477, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Seq_len: 2, Epochs: 10, Temperature: 2 => Loss: 1.1212, Accuracy: 0.6418, Predicted: windowFocused




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
Seq_len: 2, Epochs: 10, Temperature: 5 => Loss: 1.1414, Accuracy: 0.6146, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Seq_len: 2, Epochs: 10, Temperature: 10 => Loss: 1.1356, Accuracy: 0.6372, Predicted: windowFocused




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Seq_len: 2, Epochs: 20, Temperature: 0.01 => Loss: 1.0876, Accuracy: 0.6533, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 2, Epochs: 20, Temperature: 0.1 => Loss: 1.0844, Accuracy: 0.6548, Predicted: tabHighlighted




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Seq_len: 2, Epochs: 20, Temperature: 0.5 => Loss: 1.0890, Accuracy: 0.6560, Predicted: tabSwitched




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Seq_len: 2, Epochs: 20, Temperature: 1.0 => Loss: 1.0946, Accuracy: 0.6483, Predicted: tabCreated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Seq_len: 2, Epochs: 20, Temperature: 1.5 => Loss: 1.0839, Accuracy: 0.6570, Predicted: tabUpdated




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Seq_len: 2, Epochs: 20, Temperature: 2 => Loss: 1.0864, Accuracy: 0.6557, Predicted: windowFocused




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Seq_len: 2, Epochs: 20, Temperature: 5 => Loss: 1.0849, Accuracy: 0.6567, Predicted: userIdleStateChanged




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Seq_len: 2, Epochs: 20, Temperature: 10 => Loss: 1.0851, Accuracy: 0.6560, Predicted: tabRemoved




KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Input, Reshape, Concatenate
)
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras import backend as K


# Extract the event sequence
events = df["type"].tolist()

# Encode event types as integers
label_encoder = LabelEncoder()
encoded_events = label_encoder.fit_transform(events)
vocab_size = len(label_encoder.classes_)

print("Vocabulary (Event Types):", label_encoder.classes_)

# Compute time deltas (in seconds) between consecutive events
# Fill the first delta with 0 or a small value
df['time_delta'] = df["timestamp"].diff().dt.total_seconds().fillna(0)

# scale the time deltas if they are large or have huge variance
scaler = MinMaxScaler()  # or StandardScaler()
scaled_time_deltas = scaler.fit_transform(df[['time_delta']])  # shape: (N, 1)

time_deltas = scaled_time_deltas.flatten()  # shape: (N,)


seq_length_options = [1, 5, 7, 10, 15]
epochs_options = [10, 20, 50]
temperature_options = [0.5, 1.0, 1.5]

results = []

# Temperature-based sampling function
def sample_with_temperature(preds, temperature=1.0):
    """
    Adjusts a probability distribution 'preds' by 'temperature' and
    samples an index from the resulting distribution.
    
    Lower temperature -> more deterministic.
    Higher temperature -> more random.
    """
    preds = np.asarray(preds).astype('float64')
    epsilon = 1e-8  # to avoid log(0)
    preds = np.log(preds + epsilon) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)


for seq_length in seq_length_options:
    

    sequences_events = []
    sequences_times = []
    next_events = []
    
    # We will create sequences of length `seq_length` for both events and time deltas,
    # then the next event (single label) is what we want to predict.
    for i in range(len(encoded_events) - seq_length):
        # event window
        event_seq = encoded_events[i : i + seq_length]
        # time delta window
        time_seq = time_deltas[i : i + seq_length]
        # the next event
        next_e = encoded_events[i + seq_length]
        
        sequences_events.append(event_seq)
        sequences_times.append(time_seq)
        next_events.append(next_e)
    
    sequences_events = np.array(sequences_events)
    sequences_times = np.array(sequences_times)
    next_events = np.array(next_events)
    
    # One-hot encode the target events
    y = to_categorical(next_events, num_classes=vocab_size)
    
    for epochs in epochs_options:
        for temperature in temperature_options:

            # Input for event sequences
            event_input = Input(shape=(seq_length,), name='event_input')
            
            # Embed the events
            x = Embedding(input_dim=vocab_size,
                          output_dim=8,  # embedding size for events
                          input_length=seq_length)(event_input)
            
            # Input for time deltas
            time_input = Input(shape=(seq_length,), name='time_input')
            
            # Reshape time deltas to match [batch_size, seq_length, 1]
            t = Reshape((seq_length, 1))(time_input)
            
            # Concatenate event embedding + time delta
            # The shape is now [batch_size, seq_length, 8 + 1]
            merged = Concatenate(axis=-1)([x, t])
            
            # LSTM layer
            lstm_out = LSTM(16)(merged)
            
            # Final output layer: next event probabilities
            output = Dense(vocab_size, activation='softmax')(lstm_out)
            
            # Build and compile the model
            model = Model(inputs=[event_input, time_input], outputs=output)
            model.compile(loss='categorical_crossentropy', 
                          optimizer='adam', 
                          metrics=['accuracy'])
            

            history = model.fit(
                [sequences_events, sequences_times],  # two inputs
                y,
                epochs=epochs,
                verbose=0
            )
            
            # Get final training loss & accuracy
            final_loss = history.history['loss'][-1]
            final_accuracy = history.history['accuracy'][-1]
            

            # Use the last seq_length events & time deltas as context
            last_sequence_events = encoded_events[-seq_length:]
            last_sequence_times = time_deltas[-seq_length:]
            
            # Reshape to batch of 1
            last_sequence_events = np.array(last_sequence_events).reshape(1, seq_length)
            last_sequence_times = np.array(last_sequence_times).reshape(1, seq_length)
            
            pred_probs = model.predict([last_sequence_events, last_sequence_times])[0]
            predicted_index = sample_with_temperature(pred_probs, temperature)
            predicted_event = label_encoder.inverse_transform([predicted_index])[0]

            result = {
                "seq_length": seq_length,
                "epochs": epochs,
                "temperature": temperature,
                "loss": final_loss,
                "accuracy": final_accuracy,
                "predicted_event": predicted_event
            }
            results.append(result)
            print(f"Seq_len: {seq_length}, Epochs: {epochs}, Temp: {temperature} => "
                  f"Loss: {final_loss:.4f}, Acc: {final_accuracy:.4f}, Predicted: {predicted_event}")
            
            # Clear session to free memory before next run
            K.clear_session()


best_config = min(results, key=lambda x: x["loss"])
print("\nBest configuration (lowest loss):")
print(best_config)


Vocabulary (Event Types): ['tabAttached' 'tabCreated' 'tabDetached' 'tabHighlighted' 'tabRemoved'
 'tabSwitched' 'tabTitleChanged' 'tabUpdated' 'userIdleStateChanged'
 'windowCreated' 'windowFocused' 'windowRemoved']
Seq_len: 1, Epochs: 10, Temp: 0.5 => Loss: 1.1353, Acc: 0.6571, Predicted: tabSwitched
Seq_len: 1, Epochs: 10, Temp: 1.0 => Loss: 1.1303, Acc: 0.6503, Predicted: tabTitleChanged
Seq_len: 1, Epochs: 10, Temp: 1.5 => Loss: 1.1330, Acc: 0.6463, Predicted: tabHighlighted
Seq_len: 1, Epochs: 20, Temp: 0.5 => Loss: 1.1043, Acc: 0.6571, Predicted: tabSwitched
Seq_len: 1, Epochs: 20, Temp: 1.0 => Loss: 1.1018, Acc: 0.6571, Predicted: tabHighlighted
Seq_len: 1, Epochs: 20, Temp: 1.5 => Loss: 1.1022, Acc: 0.6571, Predicted: tabUpdated
Seq_len: 1, Epochs: 50, Temp: 0.5 => Loss: 1.0956, Acc: 0.6571, Predicted: tabSwitched
Seq_len: 1, Epochs: 50, Temp: 1.0 => Loss: 1.0953, Acc: 0.6574, Predicted: tabSwitched
Seq_len: 1, Epochs: 50, Temp: 1.5 => Loss: 1.0953, Acc: 0.6574, Predicted: win

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Input, Reshape, Concatenate
)
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras import backend as K

# -----------------------------------------------------------------------------
# DATA PREPARATION
# -----------------------------------------------------------------------------
# Extract the event sequence from the 'type' column
events = df["type"].tolist()

# Encode event types as integers
label_encoder = LabelEncoder()
encoded_events = label_encoder.fit_transform(events)
vocab_size = len(label_encoder.classes_)
print("Vocabulary (Event Types):", label_encoder.classes_)

# Compute time deltas (in seconds) between consecutive events.
# The first entry, for which no previous event exists, is set to 0.
df['time_delta'] = df["timestamp"].diff().dt.total_seconds().fillna(0)

# Scale the time deltas using MinMaxScaler
scaler = MinMaxScaler()
scaled_time_deltas = scaler.fit_transform(df[['time_delta']])
time_deltas = scaled_time_deltas.flatten()

# -----------------------------------------------------------------------------
# SET HYPERPARAMETERS
# -----------------------------------------------------------------------------
seq_length_options = [1, 5, 7, 10, 15]
epochs_options = [10, 20, 50]
temperature_options = [0.5, 1.0, 1.5]

results = []

# -----------------------------------------------------------------------------
# TEMPERATURE-BASED SAMPLING FUNCTION
# -----------------------------------------------------------------------------
def sample_with_temperature(preds, temperature=1.0):
    """
    Adjusts a probability distribution 'preds' by 'temperature' and
    samples an index from the resulting distribution.
    
    Lower temperature -> more deterministic.
    Higher temperature -> more random.
    """
    preds = np.asarray(preds).astype('float64')
    epsilon = 1e-8  # to avoid log(0)
    preds = np.log(preds + epsilon) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)

# -----------------------------------------------------------------------------
# MODEL TRAINING & VALIDATION WITH 80:20 SPLIT
# -----------------------------------------------------------------------------
for seq_length in seq_length_options:
    
    sequences_events = []
    sequences_times = []
    next_events = []
    
    # Create sequences of length `seq_length` from both events and time deltas.
    # The target for each sequence is the event that comes immediately after.
    for i in range(len(encoded_events) - seq_length):
        event_seq = encoded_events[i : i + seq_length]
        time_seq = time_deltas[i : i + seq_length]
        next_e = encoded_events[i + seq_length]
        
        sequences_events.append(event_seq)
        sequences_times.append(time_seq)
        next_events.append(next_e)
    
    sequences_events = np.array(sequences_events)
    sequences_times = np.array(sequences_times)
    next_events = np.array(next_events)
    
    # One-hot encode the target events
    y = to_categorical(next_events, num_classes=vocab_size)
    
    for epochs in epochs_options:
        for temperature in temperature_options:

            # -----------------------------------------------------------------
            # Build the Model
            # -----------------------------------------------------------------
            # Event input and embedding
            event_input = Input(shape=(seq_length,), name='event_input')
            x = Embedding(input_dim=vocab_size,
                          output_dim=8,  # embedding size for events
                          input_length=seq_length)(event_input)
            
            # Time delta input and reshaping
            time_input = Input(shape=(seq_length,), name='time_input')
            t = Reshape((seq_length, 1))(time_input)
            
            # Merge the event embedding and time delta
            merged = Concatenate(axis=-1)([x, t])
            
            # Process the merged input with an LSTM layer
            lstm_out = LSTM(16)(merged)
            
            # Output layer predicting probability distribution over events
            output = Dense(vocab_size, activation='softmax')(lstm_out)
            
            model = Model(inputs=[event_input, time_input], outputs=output)
            model.compile(loss='categorical_crossentropy', 
                          optimizer='adam', 
                          metrics=['accuracy'])
            
            # -----------------------------------------------------------------
            # Training the Model with an 80:20 split (validation_split)
            # -----------------------------------------------------------------
            # The validation_split parameter reserves 20% of the data as a hold-out set.
            history = model.fit(
                [sequences_events, sequences_times],
                y,
                epochs=epochs,
                verbose=0,
                validation_split=0.2  # 20% for validation
            )
            
            # Evaluate performance on the validation set (80% training / 20% validation)
            final_val_loss = history.history['val_loss'][-1]
            final_val_accuracy = history.history['val_accuracy'][-1]
            
            # -----------------------------------------------------------------
            # Inference: Predict the next event using the latest seq_length events
            # -----------------------------------------------------------------
            last_sequence_events = encoded_events[-seq_length:]
            last_sequence_times = time_deltas[-seq_length:]
            
            # Reshape to a batch of 1 for prediction
            last_sequence_events = np.array(last_sequence_events).reshape(1, seq_length)
            last_sequence_times = np.array(last_sequence_times).reshape(1, seq_length)
            
            pred_probs = model.predict([last_sequence_events, last_sequence_times])[0]
            predicted_index = sample_with_temperature(pred_probs, temperature)
            predicted_event = label_encoder.inverse_transform([predicted_index])[0]
            
            # Record the results
            result = {
                "seq_length": seq_length,
                "epochs": epochs,
                "temperature": temperature,
                "validation_loss": final_val_loss,
                "validation_accuracy": final_val_accuracy,
                "predicted_event": predicted_event
            }
            results.append(result)
            print(f"Seq_len: {seq_length}, Epochs: {epochs}, Temp: {temperature} => "
                  f"Val Loss: {final_val_loss:.4f}, Val Acc: {final_val_accuracy:.4f}, "
                  f"Predicted: {predicted_event}")
            
            # Clear session to free memory before next run
            K.clear_session()

# Display the best configuration based on the lowest validation loss
best_config = min(results, key=lambda x: x["validation_loss"])
print("\nBest configuration (lowest validation loss):")
print(best_config)

Vocabulary (Event Types): ['tabAttached' 'tabCreated' 'tabDetached' 'tabHighlighted' 'tabRemoved'
 'tabSwitched' 'tabTitleChanged' 'tabUpdated' 'userIdleStateChanged'
 'windowCreated' 'windowFocused' 'windowRemoved']
Seq_len: 1, Epochs: 10, Temp: 0.5 => Val Loss: 1.1709, Val Acc: 0.6445, Predicted: tabUpdated
Seq_len: 1, Epochs: 10, Temp: 1.0 => Val Loss: 1.1599, Val Acc: 0.6461, Predicted: tabAttached
Seq_len: 1, Epochs: 10, Temp: 1.5 => Val Loss: 1.1786, Val Acc: 0.6445, Predicted: tabSwitched
Seq_len: 1, Epochs: 20, Temp: 0.5 => Val Loss: 1.1674, Val Acc: 0.6522, Predicted: tabSwitched
Seq_len: 1, Epochs: 20, Temp: 1.0 => Val Loss: 1.1379, Val Acc: 0.6522, Predicted: tabTitleChanged
Seq_len: 1, Epochs: 20, Temp: 1.5 => Val Loss: 1.1454, Val Acc: 0.6522, Predicted: tabSwitched
Seq_len: 1, Epochs: 50, Temp: 0.5 => Val Loss: 1.1381, Val Acc: 0.6538, Predicted: windowFocused
Seq_len: 1, Epochs: 50, Temp: 1.0 => Val Loss: 1.1332, Val Acc: 0.6538, Predicted: tabUpdated
Seq_len: 1, Epochs:

In [None]:
import numpy as np
import pandas as pd
import pickle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Reshape, Concatenate
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras import backend as K

# -------------------------------------------------------------------------
# 1) DATA PREPARATION
# -------------------------------------------------------------------------
# assume `df` is your DataFrame with columns ["type","timestamp",...]
events = df["type"].tolist()
label_encoder = LabelEncoder()
encoded_events = label_encoder.fit_transform(events)
vocab_size = len(label_encoder.classes_)
print("Vocabulary:", label_encoder.classes_)

# compute & scale time deltas
df['time_delta'] = df["timestamp"].diff().dt.total_seconds().fillna(0)
scaler = MinMaxScaler()
time_deltas = scaler.fit_transform(df[['time_delta']]).flatten()

# save encoder & scaler for later
with open('label_encoder.pkl','wb') as f:
    pickle.dump(label_encoder, f)
with open('scaler.pkl','wb') as f:
    pickle.dump(scaler, f)


# -------------------------------------------------------------------------
# 2) HYPERPARAM GRID
# -------------------------------------------------------------------------
seq_length_options   = [1, 5, 7]
epochs_options       = [10, 20, 50]
temperature_options  = [0.5, 1.0, 1.5]

# to track best model
best_val_loss = np.Inf
best_config   = {}
# -------------------------------------------------------------------------
# sampling helper
# -------------------------------------------------------------------------
def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    epsilon = 1e-8
    preds = np.log(preds + epsilon) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)


# -------------------------------------------------------------------------
# 3) GRID SEARCH + SAVE BEST
# -------------------------------------------------------------------------
for seq_length in seq_length_options:
    # build your sequences once per seq_length
    seq_events, seq_times, next_events = [], [], []
    for i in range(len(encoded_events) - seq_length):
        seq_events.append(encoded_events[i:i+seq_length])
        seq_times.append(time_deltas[i:i+seq_length])
        next_events.append(encoded_events[i+seq_length])

    X_events = np.array(seq_events)
    X_times  = np.array(seq_times)
    y        = to_categorical(next_events, num_classes=vocab_size)

    for epochs in epochs_options:
        for temp in temperature_options:

            # build model
            ev_in = Input(shape=(seq_length,), name='event_input')
            x     = Embedding(vocab_size, 8, input_length=seq_length)(ev_in)
            t_in  = Input(shape=(seq_length,), name='time_input')
            t     = Reshape((seq_length,1))(t_in)
            merged= Concatenate(axis=-1)([x, t])
            lstm  = LSTM(16)(merged)
            out   = Dense(vocab_size, activation='softmax')(lstm)
            model = Model([ev_in, t_in], out)
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

            # train
            history = model.fit(
                [X_events, X_times], y,
                epochs=epochs,
                verbose=0,
                validation_split=0.2
            )
            val_loss = history.history['val_loss'][-1]
            val_acc  = history.history['val_accuracy'][-1]
            print(f"[{seq_length=}, {epochs=}, temp={temp}] → val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

            # if best so far, save
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_config = {
                    'seq_length': seq_length,
                    'epochs': epochs,
                    'temperature': temp,
                    'val_loss': val_loss,
                    'val_acc': val_acc
                }
                model.save('best_model.h5')
                print("  ✨ New best model saved:", best_config)

            K.clear_session()

print("\n=== BEST OVERALL CONFIG ===")
print(best_config)
print("Best model stored as: best_model.h5")

# ──────────────────────────────────────
# Now pickle best_config for server.py
# ──────────────────────────────────────
import pickle

with open('best_config.pkl', 'wb') as f:
    pickle.dump(best_config, f)

print("Also saved best_config to best_config.pkl")


Vocabulary: ['tabAttached' 'tabCreated' 'tabDetached' 'tabHighlighted' 'tabRemoved'
 'tabSwitched' 'tabTitleChanged' 'tabUpdated' 'userIdleStateChanged'
 'windowCreated' 'windowFocused' 'windowRemoved']
[seq_length=1, epochs=10, temp=0.5] → val_loss=1.1684, val_acc=0.6275
  ✨ New best model saved: {'seq_length': 1, 'epochs': 10, 'temperature': 0.5, 'val_loss': 1.1684389114379883, 'val_acc': 0.6275116205215454}


  saving_api.save_model(


[seq_length=1, epochs=10, temp=1.0] → val_loss=1.1688, val_acc=0.6461
[seq_length=1, epochs=10, temp=1.5] → val_loss=1.1746, val_acc=0.6306
[seq_length=1, epochs=20, temp=0.5] → val_loss=1.1450, val_acc=0.6522
  ✨ New best model saved: {'seq_length': 1, 'epochs': 20, 'temperature': 0.5, 'val_loss': 1.1449682712554932, 'val_acc': 0.6522411108016968}
[seq_length=1, epochs=20, temp=1.0] → val_loss=1.1388, val_acc=0.6538
  ✨ New best model saved: {'seq_length': 1, 'epochs': 20, 'temperature': 1.0, 'val_loss': 1.1387676000595093, 'val_acc': 0.6537867188453674}
[seq_length=1, epochs=20, temp=1.5] → val_loss=1.1548, val_acc=0.6522
[seq_length=1, epochs=50, temp=0.5] → val_loss=1.1421, val_acc=0.6522
[seq_length=1, epochs=50, temp=1.0] → val_loss=1.1363, val_acc=0.6538
  ✨ New best model saved: {'seq_length': 1, 'epochs': 50, 'temperature': 1.0, 'val_loss': 1.1362656354904175, 'val_acc': 0.6537867188453674}
[seq_length=1, epochs=50, temp=1.5] → val_loss=1.1366, val_acc=0.6538
[seq_length=5, ep