In [16]:
import os
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler

def load_data(directory):
    """Collect one session per ``activity_log.json`` found under *directory*.

    The layout walked is ``<directory>/<user>/<date>/activity_log.json``.
    A file contributes a session only when its top-level JSON value is a
    list; any other JSON value is skipped silently.

    Args:
        directory: Root folder whose sub-folders are scanned as users.

    Returns:
        A list of sessions, each being the list loaded from one file, ordered
        by user folder name and then by date folder name.
    """
    activity_file = 'activity_log.json'
    session_data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # session order (and therefore the training data order) reproducible.
    for user_folder in sorted(os.listdir(directory)):
        print("Loading data for user", user_folder)
        user_path = os.path.join(directory, user_folder)
        if not os.path.isdir(user_path):
            continue
        for date_folder in sorted(os.listdir(user_path)):
            print("Loading data for date", date_folder)
            date_path = os.path.join(user_path, date_folder)
            print("Loading data from", date_path)
            file_path = os.path.join(date_path, activity_file)
            if not os.path.isfile(file_path):
                continue
            # Decode explicitly as UTF-8 instead of relying on the platform
            # default (e.g. cp1252 on Windows), which can mis-decode JSON.
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data = json.load(file)
            if isinstance(file_data, list):
                session_data.append(file_data)  # Each file is one session
    return session_data

def preprocess_data(session_data, threshold=5):
    """Cut raw sessions into labelled segments of events.

    Every event whose ``type`` contains ``'focus_level'`` closes the segment
    of events gathered since the previous such event (or since the start of
    the session). The segment is labelled 1 when that event's
    ``data['level']`` is strictly greater than *threshold*, otherwise 0.
    Empty segments are dropped. Events that follow the last focus-level event
    of a session are never emitted, because no label follows them.

    Args:
        session_data: Iterable of sessions, each an iterable of event dicts
            with ``type``, ``data`` and (for non-focus events) ``timestamp``.
        threshold: Focus level above which a segment is labelled 1.

    Returns:
        ``(all_features, all_labels)``: parallel lists. Each features entry is
        a list of ``[timestamp, type, position, button, time_delta]`` rows.
    """
    segments = []
    segment_labels = []

    for session in session_data:
        print("Processing session with", len(session), "events")
        pending = []

        for event in session:
            kind = event['type']

            # NOTE(review): substring match, so any type *containing*
            # 'focus_level' acts as a label event — confirm this is intended.
            if 'focus_level' in kind:
                segment_label = 1 if event['data']['level'] > threshold else 0
                if pending:
                    print("Adding session with", len(pending), "events")
                    segments.append(pending)
                    segment_labels.append(segment_label)
                # Start a fresh segment after every focus-level event.
                pending = []
                continue

            payload = event['data']
            delta = payload.get('time_delta', 0)

            if kind == 'gaze_data':
                position = payload.get('adjusted_gaze_start_position', [0, 0])
            elif kind == 'mouse_movement':
                start = payload.get('start_position', [0, 0])
                end = payload.get('end_position', [0, 0])
                # Midpoint of the movement, coordinate by coordinate.
                position = [(a + b) / 2 for a, b in zip(start, end)]
            elif kind == 'mouse_click':
                position = payload.get('position', [0, 0])
            elif kind == 'keyboard_session':
                # NOTE(review): key is spelled 'key_stokes' — verify against
                # the logger's schema; a missing key falls back to [0, 0].
                position = payload.get('key_stokes', [0, 0])
            else:
                position = [0, 0]

            pressed = payload.get('button', 'None')
            pending.append([event['timestamp'], kind, position, pressed, delta])

    return segments, segment_labels



def encode_features(features):
    """One-hot encode and scale the per-event rows of every session.

    The encoder is fitted on the ``(event type, button)`` pairs of all
    sessions and the scaler on all time deltas, so every session is encoded
    against the same vocabulary and statistics. Each encoded row is laid out
    as ``[position..., one-hot(type, button)..., scaled time_delta]``.

    Args:
        features: List of sessions; each session is a list of
            ``[timestamp, type, position, button, time_delta]`` rows.
            Assumes every ``position`` is a numeric list of the same length
            (e.g. ``[x, y]``) — TODO confirm for ``keyboard_session`` events.

    Returns:
        ``(all_sessions, encoder, scaler)``: the list of 2-D arrays (one per
        session) plus the fitted ``OneHotEncoder`` and ``StandardScaler``.

    Raises:
        ValueError: If *features* is empty, since there is nothing to fit on.
    """
    if not features:
        raise ValueError("encode_features() needs at least one session to fit the encoder and scaler")

    all_categories = [[feat[1], feat[3]] for session in features for feat in session]
    all_time_deltas = [feat[4] for session in features for feat in session]

    # handle_unknown='ignore' encodes an unseen event type/button as all
    # zeros instead of raising. Training output is unchanged, but the pickled
    # encoder no longer crashes at inference on a category it never saw.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(all_categories)  # Fit encoder to all categories once

    scaler = StandardScaler()
    scaler.fit(np.array(all_time_deltas).reshape(-1, 1))

    all_sessions = []
    for session in features:
        categorical_features = np.array([[feat[1], feat[3]] for feat in session])
        categorical_encoded = encoder.transform(categorical_features).toarray()
        position_data = np.array([feat[2] for feat in session])
        # Scaling is element-wise, so transforming each session separately
        # gives the same values as scaling everything at once and slicing.
        time_deltas = scaler.transform(
            np.array([feat[4] for feat in session]).reshape(-1, 1)
        )
        all_sessions.append(np.hstack((position_data, categorical_encoded, time_deltas)))

    return all_sessions, encoder, scaler



def create_sequences(features, labels, sequence_length=100):
    """Bring every encoded session to exactly *sequence_length* time steps.

    Shorter sessions are zero-padded at the end (``padding='post'``). Longer
    sessions are truncated by ``pad_sequences`` with its default truncation
    mode (presumably ``'pre'``, i.e. the last steps are kept — confirm
    against the Keras documentation).

    Args:
        features: List of 2-D per-session arrays.
        labels: One label per session; labels are not padded.
        sequence_length: Number of time steps in each output sequence.

    Returns:
        ``(padded_features, padded_labels)`` as NumPy arrays.
    """
    feature_tensor = pad_sequences(
        features,
        maxlen=sequence_length,
        dtype='float32',
        padding='post',
    )
    label_array = np.array(labels)
    print(feature_tensor.shape, label_array.shape)
    return feature_tensor, label_array

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
import numpy as np

def build_and_train_model(sequences, labels):
    """Build and fit a stacked bidirectional-LSTM binary classifier.

    Args:
        sequences: Array-like indexed as ``(samples, time steps, features)``;
            axes 1 and 2 define the model's input shape.
        labels: One binary label per sample.

    Returns:
        The fitted Keras model, or ``None`` when *sequences* is empty.
    """
    # Convert list to numpy array if not already
    sequences = np.array(sequences)

    if sequences.size == 0:
        print("No valid sequences to train on.")
        return None

    # Imported locally because this file does not import Input at module level.
    from tensorflow.keras import Input

    print(sequences.shape)
    labels = np.array(labels)

    model = Sequential()

    # Declare the input shape with an explicit Input layer. The original put
    # `input_shape` on the LSTM nested inside Bidirectional, where the
    # Sequential model does not pick it up, leaving the model unbuilt until
    # the first call to fit().
    model.add(Input(shape=(sequences.shape[1], sequences.shape[2])))

    # Recurrent stack: three bidirectional LSTMs; only the last one collapses
    # the time axis (return_sequences left at its default).
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.2))

    # Dense head ending in a single sigmoid unit for the binary label.
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(sequences, labels, epochs=50, batch_size=64)
    return model

import pickle
def save_model_and_preprocessors(model, encoder, scaler, model_path, encoder_path, scaler_path):
    """Persist the model and both fitted preprocessors to disk.

    The model is written through its own ``save`` method; the encoder and the
    scaler are pickled, in that order, to their respective paths.
    """
    model.save(model_path)
    for target_path, preprocessor in ((encoder_path, encoder), (scaler_path, scaler)):
        with open(target_path, 'wb') as handle:
            pickle.dump(preprocessor, handle)

def main(directory='../../focus_level/', model_path='model.h5',
         encoder_path='encoder.pkl', scaler_path='scaler.pkl'):
    """Run the training pipeline: load, preprocess, encode, train, save.

    Args:
        directory: Root folder handed to ``load_data``.
        model_path: Destination for the trained Keras model.
        encoder_path: Destination for the pickled encoder.
        scaler_path: Destination for the pickled scaler.

    Returns:
        ``(model, encoder, scaler)``. When no labelled segment is found the
        result is ``(None, None, None)``; when training yields no model,
        ``model`` is ``None`` and nothing is written to disk.
    """
    session_data = load_data(directory)
    processed_data, labels = preprocess_data(session_data)

    # Without labelled segments there is nothing to fit the encoder or scaler
    # on; stop here with a clear message rather than failing inside
    # scikit-learn.
    if not processed_data:
        print("No labelled sessions found; nothing to train.")
        return None, None, None

    encoded_features, encoder, scaler = encode_features(processed_data)
    X, y = create_sequences(encoded_features, labels)

    model = build_and_train_model(X, y)

    # build_and_train_model() returns None when it has no data to train on;
    # calling save on None would raise AttributeError.
    if model is not None:
        save_model_and_preprocessors(model, encoder, scaler, model_path, encoder_path, scaler_path)

    return model, encoder, scaler

In [17]:
import pickle
def save_model_and_preprocessors(model, encoder, scaler, model_path, encoder_path, scaler_path):
    """Write the model via ``model.save`` and pickle the encoder and scaler."""

    def _pickle_to(path, obj):
        # Binary mode is required for pickle output.
        with open(path, 'wb') as sink:
            pickle.dump(obj, sink)

    model.save(model_path)
    _pickle_to(encoder_path, encoder)
    _pickle_to(scaler_path, scaler)

In [18]:
if __name__ == '__main__':
    # main() returns a (model, encoder, scaler) tuple; unpack it so `model`
    # is the trained model itself rather than the whole tuple.
    model, encoder, scaler = main()


Loading data for user User
Loading data for date 2024-05-11
Loading data from ../../focus_level/User\2024-05-11
Loading data for user wgoud
Loading data for date 2024-04-12
Loading data from ../../focus_level/wgoud\2024-04-12
Loading data for date 2024-04-14
Loading data from ../../focus_level/wgoud\2024-04-14
Loading data for date 2024-04-16
Loading data from ../../focus_level/wgoud\2024-04-16
Loading data for date 2024-04-21
Loading data from ../../focus_level/wgoud\2024-04-21
Loading data for date 2024-04-23
Loading data from ../../focus_level/wgoud\2024-04-23
Loading data for date 2024-04-25
Loading data from ../../focus_level/wgoud\2024-04-25
Loading data for date 2024-04-29
Loading data from ../../focus_level/wgoud\2024-04-29
Loading data for date 2024-04-30
Loading data from ../../focus_level/wgoud\2024-04-30
Processing session with 30027 events
Adding session with 11603 events
Adding session with 6213 events
Adding session with 652 events
Adding session with 6940 events
Adding 

In [21]:
import os
import json
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Model and Preprocessors
def load_model_and_preprocessors(model_path, encoder_path, scaler_path):
    """Restore the trained model together with its two pickled preprocessors.

    Args:
        model_path: File passed to ``load_model``.
        encoder_path: Pickle file holding the encoder.
        scaler_path: Pickle file holding the scaler.

    Returns:
        ``(model, encoder, scaler)``.
    """

    def _unpickle(path):
        # NOTE: unpickling executes code embedded in the file; only load
        # artifacts from a trusted source.
        with open(path, 'rb') as source:
            return pickle.load(source)

    model = load_model(model_path)
    encoder = _unpickle(encoder_path)
    scaler = _unpickle(scaler_path)
    return model, encoder, scaler

# Process a single JSON file
def process_json_file(file_path, encoder, scaler, threshold=5, sequence_length=100):
    """Turn one JSON event log into a single padded model input.

    All events except those whose type is exactly ``'focus_level'`` are
    encoded as ``[position..., one-hot(type, button)..., scaled time_delta]``
    rows and then padded/truncated to *sequence_length* steps.

    NOTE(review): ``preprocess_data`` in this file detects label events with
    a substring test (``'focus_level' in type``) whereas this function uses
    equality — confirm that no event type makes the two disagree.

    Args:
        file_path: Path to a JSON file containing a list of event dicts.
        encoder: Fitted encoder exposing ``transform`` for (type, button) pairs.
        scaler: Fitted scaler exposing ``transform`` for a time-delta column.
        threshold: Unused here; kept so existing callers keep working.
        sequence_length: Number of time steps fed to the model. It must match
            the length used when the model was trained (default 100).

    Returns:
        An array with one padded sequence, or ``None`` when the file holds no
        usable events.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        session_data = json.load(file)

    features = []
    for event in session_data:
        event_type = event['type']
        if event_type == 'focus_level':
            continue

        data = event['data']
        time_delta = data.get('time_delta', 0)
        if event_type == 'gaze_data':
            position = data.get('adjusted_gaze_start_position', [0, 0])
        elif event_type == 'mouse_movement':
            start_position = data.get('start_position', [0, 0])
            end_position = data.get('end_position', [0, 0])
            # Midpoint of the movement, coordinate by coordinate.
            position = [(s + e) / 2 for s, e in zip(start_position, end_position)]
        elif event_type == 'mouse_click':
            position = data.get('position', [0, 0])
        elif event_type == 'keyboard_session':
            position = data.get('key_stokes', [0, 0])
        else:
            position = [0, 0]

        button = data.get('button', 'None')
        features.append([event['timestamp'], event_type, position, button, time_delta])

    if not features:
        return None

    # Encode and scale features
    categorical_features = np.array([[feat[1], feat[3]] for feat in features])
    position_data = np.array([feat[2] for feat in features])
    time_deltas = np.array([feat[4] for feat in features]).reshape(-1, 1)
    # transform() already returns an (n, 1) column, ready for hstack.
    time_deltas_normalized = scaler.transform(time_deltas)

    categorical_encoded = encoder.transform(categorical_features).toarray()
    encoded_session = np.hstack((position_data, categorical_encoded, time_deltas_normalized))

    # Padding sequences
    return pad_sequences([encoded_session], maxlen=sequence_length, padding='post', dtype='float32')

# Predict function
def predict_from_json(file_path, model, encoder, scaler):
    """Score one JSON event log with the trained model.

    Returns:
        The output of ``model.predict`` for the processed file, or the string
        ``"No valid data to predict."`` when the file yields no features.
    """
    model_input = process_json_file(file_path, encoder, scaler)
    if model_input is None:
        return "No valid data to predict."
    return model.predict(model_input)

# Main entry to test the model with a specified JSON file
if __name__ == '__main__':
    # Event log to score, and the artifact files to restore the model from.
    json_file_path = 'events_data.json'
    model_path, encoder_path, scaler_path = 'model.h5', 'encoder.pkl', 'scaler.pkl'

    model, encoder, scaler = load_model_and_preprocessors(
        model_path, encoder_path, scaler_path
    )
    prediction = predict_from_json(json_file_path, model, encoder, scaler)
    print("Prediction:", prediction)

Prediction: [[0.9831607]]
