In [71]:
import os
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def build_and_train_model(sequences, labels, epochs=20, batch_size=32):
    """Build a two-layer LSTM binary classifier and fit it on ``sequences``.

    NOTE(review): this cell defines ``build_and_train_model`` twice. The
    definition that appears later in the cell replaces this one when the cell
    runs, so one of the two copies should be deleted.

    Args:
        sequences: Array-like convertible to a 3-D array laid out as
            (n_samples, timesteps, n_features).
        labels: Array-like with one entry per sample. The network ends in a
            single sigmoid unit trained with binary cross-entropy, so the
            labels are presumably 0/1 -- TODO confirm against the caller.
        epochs: Number of training epochs (defaults to the previous
            hard-coded value, 20).
        batch_size: Mini-batch size (defaults to the previous hard-coded
            value, 32).

    Returns:
        The fitted Keras ``Sequential`` model, or ``None`` when ``sequences``
        contains no elements.

    Raises:
        ValueError: If ``sequences`` is non-empty but not 3-D, or if the
            number of labels differs from the number of sequences.
    """
    sequences = np.asarray(sequences)

    # Nothing to train on: keep the original "report and return None" contract.
    if sequences.size == 0:
        print("No valid sequences to train on.")
        return None

    # Validate before building the model so a bad shape produces a clear
    # message instead of an IndexError on sequences.shape[2].
    if sequences.ndim != 3:
        raise ValueError(
            "sequences must be 3-D (samples, timesteps, features); "
            "got shape {}".format(sequences.shape)
        )

    labels = np.asarray(labels)
    if labels.shape[:1] != sequences.shape[:1]:
        raise ValueError(
            "labels and sequences must contain the same number of samples; "
            "got {} label(s) for {} sequence(s)".format(
                labels.shape[0] if labels.ndim else 1, sequences.shape[0]
            )
        )

    _, timesteps, n_features = sequences.shape

    model = Sequential()
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(50, return_sequences=True, input_shape=(timesteps, n_features)))
    model.add(LSTM(50))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(sequences, labels, epochs=epochs, batch_size=batch_size)
    return model


def load_data(directory):
    """Load every session log found under ``directory``.

    The expected layout is ``<directory>/<user>/<date>/activity_log.json``.
    Each JSON file whose top-level value is a list is treated as one session;
    files with any other top-level type are skipped.

    Directory listings are sorted so the order of the returned sessions is
    reproducible (``os.listdir`` makes no ordering guarantee).

    Args:
        directory: Root folder containing one sub-folder per user.

    Returns:
        A list of sessions, each being the list decoded from one
        ``activity_log.json`` file.

    Raises:
        OSError: If ``directory`` (or a user folder) cannot be listed.
        json.JSONDecodeError: If an activity log is not valid JSON.
    """
    activity_file = 'activity_log.json'
    session_data = []

    for user_folder in sorted(os.listdir(directory)):
        user_path = os.path.join(directory, user_folder)
        if not os.path.isdir(user_path):
            continue  # stray files next to the user folders

        for date_folder in sorted(os.listdir(user_path)):
            file_path = os.path.join(user_path, date_folder, activity_file)
            if not os.path.isfile(file_path):
                continue  # date folder without a log (or not a folder at all)

            # JSON text is UTF-8 by specification; do not depend on the
            # platform's locale encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data = json.load(file)

            if isinstance(file_data, list):
                session_data.append(file_data)  # Each file is one session

    return session_data

def preprocess_data(session_data):
    """Split each session into per-event feature rows and a single label.

    An event whose ``'data'`` mapping contains ``'level'`` supplies the
    session label (if several do, the last one wins). Every other event
    becomes a row ``[timestamp, type, position, button]``, where ``position``
    falls back to ``[0, 0]`` and ``button`` to the string ``'None'``.

    Sessions that end up with no feature rows or no label are dropped.

    Args:
        session_data: Iterable of sessions; each session is an iterable of
            event dicts with ``'timestamp'``, ``'type'`` and ``'data'`` keys.

    Returns:
        ``(all_features, all_labels)`` -- two parallel lists with one entry
        per retained session.
    """
    all_features, all_labels = [], []

    for session in session_data:
        session_label = None
        rows = []

        for event in session:
            payload = event['data']

            # Label-carrying events contribute no feature row.
            if 'level' in payload:
                session_label = payload['level']
                continue

            rows.append([
                event['timestamp'],
                event['type'],
                payload.get('position', [0, 0]),
                payload.get('button', 'None'),
            ])

        # Keep only sessions that have both features and a label.
        if not rows or session_label is None:
            continue

        all_features.append(rows)
        all_labels.append(session_label)

    return all_features, all_labels

def encode_features(features):
    """One-hot encode the categorical columns of every session consistently.

    Each feature row is ``[timestamp, type, position, button]``. ``type`` and
    ``button`` are one-hot encoded and placed after the ``position`` values.

    A single encoder is fitted on the events of *all* sessions. Fitting a new
    encoder per session (the previous behaviour) gives each session its own
    column count and column meaning, so sessions cannot be stacked into one
    tensor -- or, worse, line up by accident with different meanings.

    NOTE(review): the timestamp (index 0 of each row) is not part of the
    encoded output -- confirm that dropping it is intentional.

    Args:
        features: Iterable of sessions, each an iterable of feature rows.

    Returns:
        A list with one 2-D array per session, shaped
        (n_events, position_width + n_one_hot_columns); the width is the same
        for every session. An empty input yields an empty list.
    """
    sessions = [list(session) for session in features]
    if not sessions:
        return []

    # Stack the categorical columns of every event so the encoder sees the
    # full vocabulary exactly once.
    categorical_features = np.array(
        [[feat[1], feat[3]] for session in sessions for feat in session]
    )
    categorical_encoded = OneHotEncoder().fit_transform(categorical_features).toarray()

    # Cut the stacked encoding back into per-session chunks.
    session_lengths = [len(session) for session in sessions]
    boundaries = np.cumsum(session_lengths)[:-1]
    encoded_by_session = np.split(categorical_encoded, boundaries)

    all_sessions = []
    for session, session_encoded in zip(sessions, encoded_by_session):
        position_data = np.array([feat[2] for feat in session])
        all_sessions.append(np.hstack((position_data, session_encoded)))

    return all_sessions

def create_sequences(features, labels, sequence_length=100):
    """Bring every session to ``sequence_length`` timesteps and array-ify labels.

    Sessions are passed to ``pad_sequences`` with ``padding='post'`` (padding
    is appended at the end) and ``dtype='float32'``. Labels are left unpadded
    because there is exactly one per session.

    Args:
        features: Sequence of per-session feature arrays.
        labels: One label per session.
        sequence_length: Target number of timesteps (``maxlen``).

    Returns:
        ``(padded_features, label_array)``.
    """
    return (
        pad_sequences(features, maxlen=sequence_length, padding='post', dtype='float32'),
        np.array(labels),
    )

def build_and_train_model(sequences, labels, epochs=20, batch_size=32):
    """Build a two-layer LSTM binary classifier and fit it on ``sequences``.

    NOTE(review): ``build_and_train_model`` is also defined earlier in this
    cell. This later definition is the one in effect after the cell runs; the
    earlier copy is redundant and should be deleted.

    Args:
        sequences: Array-like convertible to a 3-D array laid out as
            (n_samples, timesteps, n_features).
        labels: Array-like with one entry per sample. The network ends in a
            single sigmoid unit trained with binary cross-entropy, so the
            labels are presumably 0/1 -- TODO confirm against the caller.
        epochs: Number of training epochs (defaults to the previous
            hard-coded value, 20).
        batch_size: Mini-batch size (defaults to the previous hard-coded
            value, 32).

    Returns:
        The fitted Keras ``Sequential`` model, or ``None`` when ``sequences``
        contains no elements.

    Raises:
        ValueError: If ``sequences`` is non-empty but not 3-D, or if the
            number of labels differs from the number of sequences.
    """
    sequences = np.asarray(sequences)

    # Nothing to train on: keep the original "report and return None" contract.
    if sequences.size == 0:
        print("No valid sequences to train on.")
        return None

    # Validate before building the model so a bad shape produces a clear
    # message instead of an IndexError on sequences.shape[2].
    if sequences.ndim != 3:
        raise ValueError(
            "sequences must be 3-D (samples, timesteps, features); "
            "got shape {}".format(sequences.shape)
        )

    labels = np.asarray(labels)
    if labels.shape[:1] != sequences.shape[:1]:
        raise ValueError(
            "labels and sequences must contain the same number of samples; "
            "got {} label(s) for {} sequence(s)".format(
                labels.shape[0] if labels.ndim else 1, sequences.shape[0]
            )
        )

    _, timesteps, n_features = sequences.shape

    model = Sequential()
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(50, return_sequences=True, input_shape=(timesteps, n_features)))
    model.add(LSTM(50))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(sequences, labels, epochs=epochs, batch_size=batch_size)
    return model

def main(directory='../../focus_level/'):
    """Run the whole pipeline: load, preprocess, encode, pad, train.

    Args:
        directory: Root folder holding the per-user session logs. Defaults to
            the location that was previously hard-coded.

    Returns:
        Whatever ``build_and_train_model`` returns (the trained model, or
        ``None`` when there was nothing to train on), so the result of the
        training run is not thrown away.
    """
    session_data = load_data(directory)
    print("Number of sessions:", len(session_data))

    processed_data, labels = preprocess_data(session_data)
    encoded_features = encode_features(processed_data)
    # The count printed here is the number of encoded sessions.
    print("Number of encoded features:", len(encoded_features))

    X, y = create_sequences(encoded_features, labels)
    return build_and_train_model(X, y)



In [72]:
# Entry point: run the pipeline only when this code executes as the
# top-level module (not when it is imported from elsewhere).
if __name__ == '__main__':
    main()


Number of sessions: 1
Number of encoded features: 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
