In [2]:
import os
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler

def load_data(directory):
    """Collect every ``activity_log.json`` found two folder levels below ``directory``.

    The function walks ``<directory>/<user_folder>/<date_folder>/activity_log.json``.
    A log is kept only when its top-level JSON value is a list; each kept log
    is treated as one session.

    Args:
        directory: Root folder that contains one sub-folder per user.

    Returns:
        list: The parsed JSON list of every accepted log, ordered by user
        folder name and then by date folder name.

    Raises:
        OSError: If ``directory`` (or a user folder) cannot be listed.
        json.JSONDecodeError: If a log file does not contain valid JSON.
    """
    session_data = []
    # os.listdir yields entries in arbitrary, filesystem-dependent order. Sorting
    # keeps the session order stable, so a seeded split downstream is reproducible.
    for user_folder in sorted(os.listdir(directory)):
        user_path = os.path.join(directory, user_folder)
        if not os.path.isdir(user_path):
            continue  # stray files next to the user folders are ignored
        for date_folder in sorted(os.listdir(user_path)):
            file_path = os.path.join(user_path, date_folder, 'activity_log.json')
            if not os.path.isfile(file_path):
                continue
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data = json.load(file)
            if isinstance(file_data, list):
                session_data.append(file_data)  # Each file is one session
    return session_data

def preprocess_data(session_data, threshold=5):
    """Cut every session into labelled windows of events.

    Events are collected until an event whose type contains ``'focus_level'``
    is met. That event's ``data['level']`` is compared with ``threshold``
    (label 1 when strictly greater, otherwise 0) and the label is attached to
    the events collected since the previous focus-level event. Windows with
    no events are not emitted, and events that follow the last focus-level
    event of a session are dropped.

    Args:
        session_data: Iterable of sessions, each an iterable of event dicts.
        threshold: Focus level above which a window is labelled 1.

    Returns:
        tuple: ``(all_features, all_labels)``. Each entry of ``all_features``
        is a list of ``[timestamp, event_type, position, button, time_delta]``
        rows; ``all_labels`` holds the matching 0/1 labels.
    """
    all_features, all_labels = [], []

    for session in session_data:
        window = []
        for event in session:
            kind = event['type']

            if 'focus_level' in kind:
                # A focus reading closes the current window and labels it.
                verdict = 1 if event['data']['level'] > threshold else 0
                if window:
                    all_features.append(window)
                    all_labels.append(verdict)
                window = []
                continue

            if kind == 'active_window':
                # Window-switch events contribute no feature row.
                continue

            payload = event['data']
            # A top-level time_delta wins; otherwise fall back to the payload, then 0.
            time_delta = event.get('time_delta', payload.get('time_delta', 0))

            if kind == 'gaze_data':
                position = payload.get('adjusted_gaze_start_position', [0, 0])
            elif kind == 'mouse_movement':
                origin = payload.get('start_position', [0, 0])
                target = payload.get('end_position', [0, 0])
                # Midpoint between where the movement started and ended.
                position = [(a + b) / 2 for a, b in zip(origin, target)]
            elif kind == 'mouse_click':
                position = payload.get('position', [0, 0])
            elif kind == 'keyboard_session':
                began_raw = payload.get('start_time', event['timestamp'])
                ended_raw = payload.get('end_time', event['timestamp'])
                elapsed = np.datetime64(ended_raw) - np.datetime64(began_raw)
                # The typing duration in milliseconds takes the first position slot.
                position = [elapsed.astype('timedelta64[ms]').astype(int), 0]
            else:
                position = [0, 0]

            pressed = payload.get('button', 'None')
            window.append([event['timestamp'], kind, position, pressed, time_delta])

    return all_features, all_labels

def encode_features(features):
    """Turn the raw feature rows of every window into numeric matrices.

    Each row is read positionally: index 1 is the event type, index 2 the
    position, index 3 the button and index 4 the time delta. Event type and
    button are one-hot encoded together, time deltas are standardised, and
    positions are passed through unchanged.

    Both preprocessors are fitted on all rows of all windows passed in.

    Args:
        features: Non-empty list of windows, each a list of feature rows as
            described above.

    Returns:
        tuple: ``(all_sessions, encoder, scaler)`` where ``all_sessions`` is a
        list of 2-D arrays (one per window, columns ordered position,
        one-hot categories, scaled time delta), ``encoder`` is the fitted
        ``OneHotEncoder`` and ``scaler`` the fitted ``StandardScaler``.

    Raises:
        ValueError: If ``features`` is empty, because nothing can be fitted.
    """
    if len(features) == 0:
        raise ValueError("encode_features needs at least one session to fit the encoder and scaler.")

    category_rows = [[feat[1], feat[3]] for session in features for feat in session]
    time_delta_column = np.array(
        [feat[4] for session in features for feat in session]
    ).reshape(-1, 1)

    # The fitted encoder is returned for reuse on new data. 'ignore' makes an
    # unseen event type or button encode as all zeros instead of raising; the
    # encoding of the categories seen here is unaffected.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(category_rows)

    scaler = StandardScaler()
    scaler.fit(time_delta_column)

    all_sessions = []
    for session in features:
        categorical_encoded = encoder.transform(
            np.array([[feat[1], feat[3]] for feat in session])
        ).toarray()
        position_data = np.array([feat[2] for feat in session])
        # Scaling is elementwise, so transforming one window at a time gives the
        # same values as slicing a globally transformed array.
        time_deltas = scaler.transform(
            np.array([feat[4] for feat in session]).reshape(-1, 1)
        )
        all_sessions.append(np.hstack((position_data, categorical_encoded, time_deltas)))

    return all_sessions, encoder, scaler

def create_sequences(features, labels, sequence_length=100):
    """Bring every encoded window to a common length and pair it with its label.

    Windows are passed to ``pad_sequences`` with ``padding='post'`` and
    ``dtype='float32'``. Handling of windows longer than ``sequence_length``
    follows ``pad_sequences``' default truncation mode, which is not set here.
    The shapes of both resulting arrays are printed.

    Args:
        features: List of per-window 2-D arrays.
        labels: One label per window; not padded.
        sequence_length: Number of time steps each window is brought to.

    Returns:
        tuple: ``(padded_features, padded_labels)`` as numpy arrays.
    """
    feature_tensor = pad_sequences(
        features,
        maxlen=sequence_length,
        padding='post',
        dtype='float32',
    )
    # One label per window, so labels only need converting, not padding.
    label_array = np.array(labels)
    print(feature_tensor.shape, label_array.shape)
    return feature_tensor, label_array

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
import numpy as np

def build_and_train_model(X_train, y_train, X_test, y_test):
    """Build a stacked bidirectional-LSTM binary classifier and fit it.

    The network is three bidirectional LSTM layers (128, 128 and 84 units)
    with dropout of 0.2 after each and batch normalisation after the first,
    followed by dense layers of 128, 64 and 32 ReLU units and a single
    sigmoid output. It is compiled with binary cross-entropy and Adam, then
    trained for 50 epochs with batch size 64.

    Args:
        X_train: Training sequences, indexed as (samples, axis 1, axis 2); the
            last two axes define the LSTM input shape.
        y_train: Training labels.
        X_test: Sequences passed to ``fit`` as validation data.
        y_test: Labels passed to ``fit`` as validation data.

    Returns:
        The fitted model, or ``None`` when ``X_train`` contains no elements.
    """
    # Accept plain lists as well as arrays.
    X_train, y_train = np.array(X_train), np.array(y_train)
    X_test, y_test = np.array(X_test), np.array(y_test)

    if not X_train.size > 0:
        print("No valid sequences to train on.")
        return None

    print(X_train.shape)
    model = Sequential()

    layer_stack = (
        # Recurrent trunk: bidirectional LSTMs with dropout in between.
        Bidirectional(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2]))),
        BatchNormalization(),
        Dropout(0.2),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(84)),
        Dropout(0.2),
        # Dense head narrowing down to one sigmoid unit.
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    )
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # The test split is what gets reported as validation metrics during training.
    model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test))
    return model

import pickle
def save_model_and_preprocessors(model, encoder, scaler, model_path, encoder_path, scaler_path):
    """Write the model and both preprocessors to disk.

    The model is stored through its own ``save`` method; the encoder and the
    scaler are pickled to their respective paths, in that order.

    Args:
        model: Object exposing ``save(path)``.
        encoder: Fitted encoder to pickle.
        scaler: Fitted scaler to pickle.
        model_path: Destination passed to ``model.save``.
        encoder_path: Destination file for the pickled encoder.
        scaler_path: Destination file for the pickled scaler.
    """
    model.save(model_path)

    # Both preprocessors are persisted the same way, so handle them in one loop.
    for destination, preprocessor in ((encoder_path, encoder), (scaler_path, scaler)):
        with open(destination, 'wb') as handle:
            pickle.dump(preprocessor, handle)

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
import os

def main(focus_level_directory='../../focus_level/',
         synthetic_data_directory='synthetic_data/',
         version="_v2"):
    """Run the whole pipeline: load, preprocess, encode, split, train and save.

    Sessions from both directories are concatenated, cut into labelled
    windows, encoded and padded. The data is split 80/20 with a fixed seed,
    a model is trained on the larger part, and the model plus both
    preprocessors are written to ``model{version}.h5``,
    ``encoder{version}.pkl`` and ``scaler{version}.pkl`` in the working
    directory.

    NOTE(review): the encoder and scaler are fitted on all windows before the
    split, and the held-out split is also used as validation data during
    training, so the reported validation metrics are not from unseen data.

    Args:
        focus_level_directory: Root folder of the recorded sessions.
        synthetic_data_directory: Root folder of the synthetic sessions.
        version: Suffix inserted into the three output file names.

    Returns:
        tuple: ``(model, encoder, scaler)``. ``model`` is ``None`` when
        training was skipped; in that case nothing is saved.
    """
    # Load data from both directories
    session_data_focus = load_data(focus_level_directory)
    session_data_synthetic = load_data(synthetic_data_directory)
    print("Loaded", len(session_data_focus), "focus level sessions and", len(session_data_synthetic), "synthetic sessions.")

    # Combine the data into one list
    session_data = session_data_focus + session_data_synthetic

    # Preprocess the combined data
    processed_data, labels = preprocess_data(session_data)
    encoded_features, encoder, scaler = encode_features(processed_data)
    X, y = create_sequences(encoded_features, labels)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build and train a new model with the training set
    model = build_and_train_model(X_train, y_train, X_test, y_test)

    # build_and_train_model returns None when it has nothing to train on;
    # calling save on that would raise AttributeError, so skip persisting.
    if model is None:
        print("Training produced no model; nothing was saved.")
        return model, encoder, scaler

    # Save model and preprocessors
    model_path = f'model{version}.h5'
    encoder_path = f'encoder{version}.pkl'
    scaler_path = f'scaler{version}.pkl'
    save_model_and_preprocessors(model, encoder, scaler, model_path, encoder_path, scaler_path)

    return model, encoder, scaler

In [3]:
if __name__ == '__main__':
    # main() returns a (model, encoder, scaler) triple. Unpack it so `model`
    # names the trained network instead of the whole tuple.
    model, encoder, scaler = main()


Loaded 10 focus level sessions and 2000 synthetic sessions.
(2029, 100, 15) (2029,)
(1623, 100, 15)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
