In [118]:
import os
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def load_data(directory):
    """Load every ``events_data.json`` found under ``directory/<user>/<date>/``.

    Users and dates are visited in sorted name order so the returned sessions
    have a reproducible order across machines and file systems
    (``os.listdir`` alone makes no ordering guarantee).

    Args:
        directory: Root folder containing one sub-folder per user, each of
            which contains one sub-folder per recording date.

    Returns:
        A list with one entry per loaded file; each entry is the JSON list
        stored in that file. Files whose top-level JSON value is not a list
        are ignored, as are date folders without an ``events_data.json``.
    """
    activity_file = 'events_data.json'
    session_data = []
    for user_folder in sorted(os.listdir(directory)):
        user_path = os.path.join(directory, user_folder)
        # Stray files next to the user folders are not users; skip them quietly.
        if not os.path.isdir(user_path):
            continue
        print("Loading data for user", user_folder)
        for date_folder in sorted(os.listdir(user_path)):
            print("Loading data for date", date_folder)
            date_path = os.path.join(user_path, date_folder)
            print("Loading data from", date_path)
            file_path = os.path.join(date_path, activity_file)
            if not os.path.isfile(file_path):
                continue
            # JSON text is UTF-8; do not depend on the platform default codec.
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data = json.load(file)
            if isinstance(file_data, list):
                session_data.append(file_data)  # Each file is one session
    return session_data

def _event_position(kind, payload):
    """Return the 2-D position recorded for an event of type ``kind``.

    Gaze and click events use their stored position, mouse movements use the
    midpoint of start and end, and every other event type maps to ``[0, 0]``.
    """
    if kind == 'gaze_data':
        return payload.get('adjusted_gaze_start_position', [0, 0])
    if kind == 'mouse_click':
        return payload.get('position', [0, 0])
    if kind == 'mouse_movement':
        begin = payload.get('start_position', [0, 0])
        finish = payload.get('end_position', [0, 0])
        return [(b + f) / 2 for b, f in zip(begin, finish)]  # midpoint
    return [0, 0]


def preprocess_data(session_data, threshold=5):
    """Cut each session into labelled segments at its focus-level events.

    Events are accumulated until an event whose type contains ``'focus_level'``
    is met. The accumulated events then become one sample, labelled ``1`` when
    the reported level is strictly greater than ``threshold`` and ``0``
    otherwise. A focus-level event with nothing accumulated before it yields
    no sample, and events after the last focus-level event of a session are
    discarded.

    Args:
        session_data: List of sessions, each a list of event dicts with
            ``'type'``, ``'data'`` and (for non-focus events) ``'timestamp'``.
        threshold: Focus level above which a segment is labelled ``1``.

    Returns:
        ``(all_features, all_labels)``: a list of segments, each a list of
        ``[timestamp, event_type, position, button]`` records, and the
        parallel list of 0/1 labels.
    """
    segments, targets = [], []
    for session in session_data:
        print("Processing session with", len(session), "events")
        pending = []
        for event in session:
            kind = event['type']

            if 'focus_level' not in kind:
                # Ordinary event: turn it into a feature record and keep going.
                payload = event['data']
                position = _event_position(kind, payload)
                button = payload.get('button', 'None')
                pending.append([event['timestamp'], kind, position, button])
                continue

            # Focus-level event: closes the segment collected so far.
            level = event['data']['level']
            if level > threshold:
                target = 1
            else:
                target = 0
            if pending:
                print("Adding session with", len(pending), "events")
                segments.append(pending)
                targets.append(target)
            # Start collecting a new segment after this event.
            pending = []
    return segments, targets

def encode_features(features, encoder=None):
    """Turn each session's event records into a numeric 2-D array.

    Every event record is read as ``[timestamp, event_type, position, button]``.
    ``event_type`` and ``button`` are one-hot encoded and appended to the
    position columns; the timestamp at index 0 is not used.

    The width of the one-hot part depends on which categories the encoder was
    fitted on. Fitting a new encoder per call therefore produces arrays of
    different widths for different data sets, which a trained model cannot
    consume. Pass the same ``encoder`` object for training and prediction to
    keep the width stable.

    Args:
        features: List of sessions; each session is a list of event records.
        encoder: Optional ``OneHotEncoder``. If it is not fitted yet it is
            fitted in place on ``features``; if it is already fitted it is
            reused unchanged (construct it with ``handle_unknown='ignore'``
            if unseen categories may occur). When omitted, a fresh encoder is
            fitted on ``features``.

    Returns:
        A list with one array per session: position columns followed by the
        one-hot columns. Returns ``[]`` when ``features`` is empty.
    """
    # Nothing to encode: return early instead of failing inside ``fit``.
    if len(features) == 0:
        return []

    def categorical_rows(session):
        # (event_type, button) pairs, the only categorical fields of a record.
        return [[feat[1], feat[3]] for feat in session]

    if encoder is None:
        encoder = OneHotEncoder()
    # scikit-learn estimators expose ``categories_`` only after fitting.
    if not hasattr(encoder, 'categories_'):
        all_categories = []
        for session in features:
            all_categories.extend(categorical_rows(session))
        encoder.fit(all_categories)  # Fit once on every category seen

    all_sessions = []
    for session in features:
        categorical_features = np.array(categorical_rows(session))
        categorical_encoded = encoder.transform(categorical_features)
        # A caller-supplied encoder may already return a dense array.
        if hasattr(categorical_encoded, 'toarray'):
            categorical_encoded = categorical_encoded.toarray()
        position_data = np.array([feat[2] for feat in session])
        encoded_session = np.hstack((position_data, categorical_encoded))
        all_sessions.append(encoded_session)

    return all_sessions


def create_sequences(features, labels, sequence_length=100):
    """Pad or truncate every session to ``sequence_length`` timesteps.

    Args:
        features: List of per-session 2-D arrays (timesteps x feature columns).
        labels: One label per session.
        sequence_length: Number of timesteps every returned sequence has.

    Returns:
        ``(padded_features, padded_labels)``: a float32 array of the padded
        sessions and the labels as a NumPy array. Labels are not padded
        because there is exactly one per sequence.
    """
    feature_tensor = pad_sequences(
        features,
        maxlen=sequence_length,
        dtype='float32',
        padding='post',      # short sessions get zeros appended
        truncating='pre',    # library default: long sessions keep their tail
    )
    label_array = np.array(labels)
    print(feature_tensor.shape, label_array.shape)
    return feature_tensor, label_array

def build_and_train_model(sequences, labels):
    """Build a two-layer LSTM binary classifier and fit it on ``sequences``.

    Args:
        sequences: Array-like indexed as (sample, timestep, feature); the
            last two dimensions define the network's input shape.
        labels: One binary label per sample.

    Returns:
        The fitted Keras model, or ``None`` (after printing a notice) when
        ``sequences`` contains no elements.
    """
    batch = np.array(sequences)

    # Bail out early when there is nothing to train on.
    if not batch.size > 0:
        print("No valid sequences to train on.")
        return None

    print(batch.shape)
    # The input shape must match the per-sample (timesteps, features) layout.
    timesteps, n_features = batch.shape[1], batch.shape[2]
    network = Sequential([
        LSTM(50, return_sequences=True, input_shape=(timesteps, n_features)),
        LSTM(50),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    targets = np.array(labels)
    network.fit(batch, targets, epochs=20, batch_size=32)
    return network

def main(directory='../../focus_level/'):
    """Run the full pipeline: load, preprocess, encode, pad and train.

    Args:
        directory: Root data folder handed to ``load_data``. Defaults to the
            location previously hard-coded here, so ``main()`` behaves as
            before.

    Returns:
        Whatever ``build_and_train_model`` returns: the trained model, or
        ``None`` when there was nothing to train on.
    """
    session_data = load_data(directory)
    processed_data, labels = preprocess_data(session_data)
    encoded_features = encode_features(processed_data)
    # len() here counts encoded sessions, not feature columns.
    print("Number of encoded features:", len(encoded_features))
    X, y = create_sequences(encoded_features, labels)
    print(y)

    model = build_and_train_model(X, y)
    return model

In [101]:
# Train the model when this cell runs. `model` is bound at notebook scope
# because the later prediction cell passes it to predict_focus().
if __name__ == '__main__':
    model = main()


Adding session with 1496 events
Adding session with 5846 events
Adding session with 4256 events
Adding session with 217 events
Adding session with 6886 events
Adding session with 2813 events
Adding session with 901 events
Adding session with 4936 events
Adding session with 2436 events
Adding session with 8169 events
Adding session with 8224 events
Adding session with 11880 events
Adding session with 9712 events
Number of encoded features: 13
(13, 100, 13) (13,)
[0 1 0 0 0 0 0 1 0 1 1 1 1]
(13, 100, 13)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [116]:
import os
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_focus(model, sequences):
    """Run ``model.predict`` on ``sequences`` and return its result unchanged."""
    predictions = model.predict(sequences)
    return predictions

In [120]:
# Re-run load -> preprocess -> encode -> pad, then score the result with the
# `model` bound by the training cell (that cell must have been run first).
# NOTE(review): encode_features() fits a new OneHotEncoder on the data it is
# given, so the number of feature columns depends on the categories present.
# The recorded outputs show width 13 at training time but width 9 here;
# confirm the model accepts this input, or reuse the training-time encoder.
# NOTE(review): this is the same directory main() trains on, so these
# predictions are presumably not a held-out evaluation; confirm intent.
directory = '../../focus_level/'
data = load_data(directory)
print("Number of sessions:", len(data))
processed_features, processed_labels = preprocess_data(data)
encoded_features = encode_features(processed_features)
X, y = create_sequences(encoded_features, processed_labels)
print(X.shape, y.shape)
predictions = predict_focus(model, X)

Loading data for user test
Loading data for date test
Loading data from ../../focus_level/test\test
Loading data for user wgoud
Loading data for date 2024-04-12
Loading data from ../../focus_level/wgoud\2024-04-12
Loading data for date 2024-04-14
Loading data from ../../focus_level/wgoud\2024-04-14
Loading data for date 2024-04-16
Loading data from ../../focus_level/wgoud\2024-04-16
Loading data for date 2024-04-21
Loading data from ../../focus_level/wgoud\2024-04-21
Loading data for date 2024-04-23
Loading data from ../../focus_level/wgoud\2024-04-23
Loading data for date 2024-04-25
Loading data from ../../focus_level/wgoud\2024-04-25
Loading data for date 2024-04-29
Loading data from ../../focus_level/wgoud\2024-04-29
Number of sessions: 1
Processing session with 1254 events
Adding session with 422 events
Adding session with 581 events
Adding session with 248 events
(3, 100, 9) (3,)
(3, 100, 9) (3,)
