In [1]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from google.colab import files, drive
import pandas as pd
import numpy as np

# Make Google Drive available under /content/gdrive; the per-speaker prediction
# files produced further down in this cell are written there.
drive.mount('/content/gdrive')

# Colab upload step; the run log shows 'FixedTrainData.csv' being saved into the
# working directory, which is where read_csv below looks for it.
uploaded = files.upload()

# Load the training table from the uploaded 'FixedTrainData.csv'.
LSTMData = pd.read_csv('FixedTrainData.csv')

# Earlier versions filtered out rows whose sequence contained 'NA' and rows with a
# missing target; with the current file format that cleanup is no longer required.


def _parse_sequence(raw):
    """Turn a bracketed, comma-separated string such as "[1, 2, 3]" into a list of ints."""
    tokens = raw.strip("[]").split(",")
    return [int(token) for token in tokens]


# The 'sequence' column is read as text; replace each entry with its list of ints.
LSTMData['sequence'] = LSTMData['sequence'].apply(_parse_sequence)

def create_model(timesteps=10, num_classes=6, lstm_units=100):
    """Build and compile a single-layer LSTM classifier.

    The network is one LSTM layer followed by a softmax Dense layer, compiled
    with categorical cross-entropy loss, the Adam optimizer and an accuracy
    metric. Calling it with no arguments yields the original 10-step,
    6-class, 100-unit configuration.

    Args:
        timesteps: Length of each input sequence; the model expects inputs of
            shape (timesteps, 1) per sample. Defaults to 10.
        num_classes: Width of the softmax output layer, i.e. the number of
            one-hot target classes. Defaults to 6.
        lstm_units: Number of units in the LSTM layer. Defaults to 100.

    Returns:
        The compiled Sequential model, ready for fit/evaluate/predict.
    """
    model = Sequential()
    # One scalar feature per time step, hence the trailing dimension of 1.
    model.add(LSTM(lstm_units, input_shape=(timesteps, 1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Leave-one-speaker-out evaluation: each distinct speaker is held out once as the
# test set while a freshly built model is trained on all remaining speakers.

# Imported here so this block is self-contained; used to reset Keras between folds.
from tensorflow.keras.backend import clear_session

# The speaker column in this dataset is 'speaker_id'.
speaker_ids = LSTMData['speaker_id'].unique()

results = []  # one {'speaker_id', 'score'} record per held-out speaker
counter = 1   # 1-based number of the current training session (for logging only)
for speaker_id in speaker_ids:
    # A new model is created on every iteration. Without this reset Keras keeps
    # the global state of all earlier models alive, so memory use grows with
    # each fold.
    clear_session()

    train_data = LSTMData[LSTMData['speaker_id'] != speaker_id]
    test_data = LSTMData[LSTMData['speaker_id'] == speaker_id]

    # Sequences become (samples, 10, 1) arrays; targets are shifted to 0-based
    # indices and one-hot encoded over 6 classes.
    X_train = np.array(train_data['sequence'].tolist()).reshape(-1, 10, 1)
    y_train = to_categorical(train_data['target'] - 1, num_classes=6)

    X_test = np.array(test_data['sequence'].tolist()).reshape(-1, 10, 1)
    y_test = to_categorical(test_data['target'] - 1, num_classes=6)

    model = create_model()

    print(f"Running Training Session {counter} for Speaker ID {speaker_id}")
    model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

    # evaluate() returns [loss, accuracy] because the model is compiled with a
    # single 'accuracy' metric, so score[1] is the held-out accuracy.
    score = model.evaluate(X_test, y_test, verbose=1)
    print(f'Accuracy for speaker {speaker_id}: {score[1]*100}%')

    results.append({'speaker_id': speaker_id, 'score': score[1]})

    predictions = model.predict(X_test)
    pred_df = pd.DataFrame({
        'input_sequence': test_data['sequence'].tolist(),
        # argmax gives the 0-based class index; +1 undoes the earlier "- 1" shift.
        'predicted_cluster': np.argmax(predictions, axis=1) + 1,
        'actual_cluster': np.argmax(y_test, axis=1) + 1,
        'seg_id': test_data['seg_id'].tolist(),
        'speaker_id': speaker_id
    })

    # One predictions file per held-out speaker, written to the mounted Drive.
    pred_df.to_csv(f'/content/gdrive/My Drive/FixedLSTMPredictions{speaker_id}.csv', index=False)

    counter += 1


Mounted at /content/gdrive


Saving FixedTrainData.csv to FixedTrainData.csv
Running Training Session 1 for Speaker ID B-01
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-01: 73.52746725082397%
Running Training Session 2 for Speaker ID B-02
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-02: 74.22360181808472%
Running Training Session 3 for Speaker ID B-03
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-03: 67.93892979621887%
Running Training Session 4 for Speaker ID B-04
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-04: 63.63636255264282%
Running Training Session 5 for Speaker ID B-06
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
E