In [9]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense

In [17]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense

def fetchData(directory='../../focus_level/'):
    """Load and concatenate every ``.json`` file found under *directory*.

    The tree is walked recursively. Directories and file names are visited in
    sorted order so the concatenated result is the same on every machine;
    this matters because split_data_into_sequences consumes the items in
    order, and an unfinished run of events at the end of one file continues
    into the next file.

    Args:
        directory: root folder to search.

    Returns:
        A single flat list holding the items of every file, in traversal
        order. Each file's top-level JSON value is passed to ``list.extend``,
        so it is expected to be an array. A missing or empty directory
        yields an empty list.
    """
    data = []
    for root, dirs, files in os.walk(directory):
        # Sorting ``dirs`` in place controls the order os.walk descends in.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                # JSON is UTF-8 by specification; do not rely on the
                # platform's default encoding.
                with open(file_path, 'r', encoding='utf-8') as f:
                    data.extend(json.load(f))
    return data

def split_data_into_sequences(data):
    """Cut a flat event stream into labelled runs.

    Every item whose ``type`` is ``'focus_level'`` acts as a terminator: the
    events collected since the previous terminator form one sequence, and the
    terminator's ``data['level']`` becomes that sequence's label.

    Args:
        data: iterable of event dicts, each with a ``type`` key; terminator
            items additionally carry ``data['level']``.

    Returns:
        ``(sequences, labels)`` as two lists of equal length. A terminator
        with no preceding events produces an empty sequence. Events that
        follow the last terminator are not returned, since they have no label.
    """
    sequences, labels = [], []
    pending = []

    for event in data:
        if event['type'] != 'focus_level':
            pending.append(event)
            continue

        # Terminator reached: emit the run gathered so far and start afresh.
        level = event['data']['level']
        labels.append(level)
        sequences.append(pending)
        pending = []

    return sequences, labels

def prepare_features(sequences, max_sequence_length=10, *, verbose=True):
    """Turn event sequences into one padded, fixed-size feature tensor.

    Each event becomes one feature row made of, in order: the scaled start
    position (2 values), the scaled end position (2 values), a one-hot
    encoding of the event ``type``, and the event's ``time_delta``.

    Args:
        sequences: list of event lists. Each event is a dict with a ``type``
            key and a ``data`` dict that may hold ``start_position`` and
            ``end_position`` (two-element lists, assumed to be [x, y];
            missing ones default to ``[0, 0]``). A top-level ``time_delta``
            is optional; missing or NaN values are treated as 0.
        max_sequence_length: number of time steps per output sample. Shorter
            sequences are zero-padded at the end; longer ones keep only
            their last ``max_sequence_length`` events (the ``pad_sequences``
            default ``truncating='pre'``).
        verbose: when true (default), print the intermediate array shapes.

    Returns:
        float32 array of shape
        ``(len(sequences), max_sequence_length, 4 + n_types + 1)``. Row ``i``
        always corresponds to ``sequences[i]``; an empty sequence yields an
        all-zero sample so the result stays aligned with the label list.

    Raises:
        ValueError: if ``sequences`` contains no events at all, because no
            encoder can be fitted.
    """
    # Sorted so the one-hot column order is identical on every run; plain
    # set iteration order varies with string hash randomisation.
    all_types = sorted({item['type'] for sequence in sequences for item in sequence})
    if not all_types:
        raise ValueError("prepare_features requires at least one event across all sequences")

    enc = OneHotEncoder(categories=[all_types])
    enc.fit(np.array(all_types).reshape(-1, 1))  # Fit encoder to all types

    scaler = MinMaxScaler()
    n_features = 4 + len(all_types) + 1
    all_features = []

    for sequence in sequences:
        if not sequence:
            # Keep a zero-length placeholder instead of skipping: skipping
            # would shift every later sample against its label.
            all_features.append(np.zeros((0, n_features), dtype='float32'))
            continue

        df = pd.DataFrame(sequence)
        n_events = len(df)
        starts = df['data'].apply(lambda x: x.get('start_position', [0, 0])).tolist()
        ends = df['data'].apply(lambda x: x.get('end_position', [0, 0])).tolist()

        # Scale starts and ends together so both share one min/max per axis.
        # NOTE(review): the scaler is refitted per sequence, so positions are
        # normalised relative to each sequence only - confirm this is intended.
        scaled = scaler.fit_transform(np.vstack(starts + ends))
        # Put each event's own start and end side by side. Reshaping the
        # stacked (2n, 2) array to (n, 4) would instead merge two different
        # events into one row.
        positions = np.hstack((scaled[:n_events], scaled[n_events:]))

        # Encode all types within the dataframe using the pre-fitted encoder
        type_encoded = enc.transform(df[['type']].values).toarray()

        if 'time_delta' in df.columns:
            time_deltas = df['time_delta'].fillna(0).values.reshape(-1, 1)
        else:
            # No event in this sequence carries a time_delta.
            time_deltas = np.zeros((n_events, 1))

        if verbose:
            print("Positions shape:", positions.shape)
            print("Type encoded shape:", type_encoded.shape)
            print("Time deltas shape:", time_deltas.shape)

        features = np.hstack((positions, type_encoded, time_deltas))

        if verbose:
            print("Features shape:", features.shape)  # Debugging output
        all_features.append(features)

    all_features_padded = pad_sequences(all_features, maxlen=max_sequence_length, dtype='float32', padding='post')

    if verbose:
        # Debugging output to check the shape after padding
        for f in all_features_padded:
            print("Padded Features shape:", f.shape)

    return all_features_padded


def create_model_input(all_features, labels, window_size=10):
    """Pair feature samples with labels and clip each sample to a window.

    Samples and labels are matched by position; if the two inputs differ in
    length, the surplus items of the longer one are ignored.

    Args:
        all_features: iterable of per-sequence feature arrays.
        labels: iterable of labels, one per sequence.
        window_size: maximum number of leading time steps kept per sample.

    Returns:
        ``(X, y)`` as NumPy arrays: the clipped samples and their labels.
    """
    pairs = list(zip(all_features, labels))
    windows = [sample[:window_size] for sample, _ in pairs]
    targets = [target for _, target in pairs]
    return np.array(windows), np.array(targets)

def build_and_train_model(sequences, labels, epochs=20, batch_size=32):
    """Build a two-layer LSTM classifier and fit it on the given data.

    Architecture: LSTM(50, returning the full sequence) -> LSTM(50) ->
    Dense(10, relu) -> Dense(1, sigmoid), compiled with binary cross-entropy
    and the Adam optimizer, tracking accuracy.

    Args:
        sequences: 3-D array indexed as (samples, time steps, features); the
            last two dimensions define the network's input shape.
        labels: one target per sample.
            NOTE(review): the sigmoid output and binary cross-entropy loss
            assume targets in {0, 1} - confirm the focus levels are binary.
        epochs: number of passes over the training data.
        batch_size: number of samples per gradient update.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_timesteps, n_features = sequences.shape[1], sequences.shape[2]

    model = Sequential()
    # The first LSTM must emit the whole sequence so the second one can read it.
    model.add(LSTM(50, return_sequences=True, input_shape=(n_timesteps, n_features)))
    model.add(LSTM(50))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(sequences, labels, epochs=epochs, batch_size=batch_size)
    return model

# Main execution flow: load the raw events, cut them into labelled sequences,
# build the padded feature tensor, then train the model.
# NOTE: these statements sit at top level, so they run every time this
# cell/module is executed.
data = fetchData()
sequences, labels = split_data_into_sequences(data)
all_features = prepare_features(sequences)
# `sequences` is rebound here: it changes from lists of raw event dicts to the
# numeric array fed to the model. Features and labels are paired by position.
sequences, labels = create_model_input(all_features, labels)

# Training the model; sequences.shape[0] is the number of training samples.
print("Training model with", sequences.shape[0], "data points.")
model = build_and_train_model(sequences, labels)
print("Model training complete.")


Positions shape: (1626, 4)
Type encoded shape: (1626, 5)
Time deltas shape: (1626, 1)
Features shape: (1626, 10)
Positions shape: (1583, 4)
Type encoded shape: (1583, 5)
Time deltas shape: (1583, 1)
Features shape: (1583, 10)
Positions shape: (1562, 4)
Type encoded shape: (1562, 5)
Time deltas shape: (1562, 1)
Features shape: (1562, 10)
Positions shape: (407, 4)
Type encoded shape: (407, 5)
Time deltas shape: (407, 1)
Features shape: (407, 10)
Positions shape: (47, 4)
Type encoded shape: (47, 5)
Time deltas shape: (47, 1)
Features shape: (47, 10)
Positions shape: (88, 4)
Type encoded shape: (88, 5)
Time deltas shape: (88, 1)
Features shape: (88, 10)
Positions shape: (3221, 4)
Type encoded shape: (3221, 5)
Time deltas shape: (3221, 1)
Features shape: (3221, 10)
Positions shape: (29, 4)
Type encoded shape: (29, 5)
Time deltas shape: (29, 1)
Features shape: (29, 10)
Positions shape: (3, 4)
Type encoded shape: (3, 5)
Time deltas shape: (3, 1)
Features shape: (3, 10)
Positions shape: (3, 4)