In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from google.colab import files, drive
import pandas as pd
import numpy as np

# Mount Google Drive; the per-speaker prediction CSVs are written under it later.
drive.mount('/content/gdrive')

# Open the Colab upload prompt. The return value is kept in `uploaded`, but the
# table is read below by its fixed file name.
uploaded = files.upload()

# Read the training table from the working directory.
LSTMData = pd.read_csv('FixedTrainData.csv')

# The NA filtering / dropna cleanup used with an earlier file layout
# ('prev_cluster_seq', 'next_cluster') is not needed for this file.

# 'sequence' is stored as text such as "[1,2,3]": strip the brackets, split on
# commas and convert every item to int so each cell holds a list of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda raw: [int(item) for item in raw.strip("[]").split(",")]
)

def create_model():
    """Build and compile a fresh, untrained LSTM classifier.

    Returns:
        A ``Sequential`` model made of an ``LSTM(100)`` layer declared with
        ``input_shape=(10, 1)`` and a 6-unit softmax ``Dense`` layer, compiled
        with categorical cross-entropy loss, the Adam optimizer and an
        accuracy metric.
    """
    stack = [
        LSTM(100, input_shape=(10, 1)),
        Dense(6, activation='softmax'),
    ]
    network = Sequential(stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Leave-one-speaker-out evaluation: every distinct 'speaker_id' takes one turn
# as the held-out test set while a fresh model is trained on all other rows.
speaker_ids = LSTMData['speaker_id'].unique()

results = []   # one {'speaker_id', 'score'} record per held-out speaker
counter = 1    # 1-based number of the current training session
for speaker_id in speaker_ids:
    held_out = LSTMData['speaker_id'] == speaker_id
    train_data = LSTMData[~held_out]
    test_data = LSTMData[held_out]

    # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
    # (presumably 1-based in the file) and one-hot encoded over 6 classes.
    X_train = np.reshape(np.array(train_data['sequence'].tolist()), (-1, 10, 1))
    y_train = to_categorical(train_data['target'] - 1, num_classes=6)

    X_test = np.reshape(np.array(test_data['sequence'].tolist()), (-1, 10, 1))
    y_test = to_categorical(test_data['target'] - 1, num_classes=6)

    # A new model per speaker, so no weights carry over between sessions.
    model = create_model()

    print(f"Running Training Session {counter} for Speaker ID {speaker_id}")
    model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

    # evaluate() returns [loss, accuracy] for the compiled metrics; index 1 is
    # the accuracy on the held-out speaker.
    score = model.evaluate(X_test, y_test, verbose=1)
    print(f'Accuracy for speaker {speaker_id}: {score[1]*100}%')

    results.append({'speaker_id': speaker_id, 'score': score[1]})

    # Per-row predictions; the +1 undoes the -1 shift applied to the labels.
    predictions = model.predict(X_test)
    predicted = predictions.argmax(axis=1) + 1
    actual = y_test.argmax(axis=1) + 1
    pred_df = pd.DataFrame({
        'input_sequence': test_data['sequence'].tolist(),
        'predicted_cluster': predicted,
        'actual_cluster': actual,
        'seg_id': test_data['seg_id'].tolist(),
        'speaker_id': speaker_id
    })

    # One CSV per held-out speaker on the mounted Drive.
    pred_df.to_csv(f'/content/gdrive/My Drive/FixedLSTMPredictions{speaker_id}.csv', index=False)

    counter += 1


Mounted at /content/gdrive


Saving FixedTrainData.csv to FixedTrainData.csv
Running Training Session 1 for Speaker ID B-01
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-01: 73.52746725082397%
Running Training Session 2 for Speaker ID B-02
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-02: 74.22360181808472%
Running Training Session 3 for Speaker ID B-03
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-03: 67.93892979621887%
Running Training Session 4 for Speaker ID B-04
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for speaker B-04: 63.63636255264282%
Running Training Session 5 for Speaker ID B-06
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
E

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import pandas as pd
import numpy as np
from google.colab import files, drive

# Mount Google Drive (the prediction CSV is written there) and open the Colab
# upload prompt.
drive.mount('/content/gdrive')
uploaded = files.upload()

# NOTE(review): the table is read by its fixed name, not from `uploaded`. If a
# file with this name already exists, the upload may be stored under a suffixed
# name and this read would then open the older copy -- verify.
LSTMData = pd.read_csv('FixedTrainData.csv')

# Convert the text form of each sequence ("[1,2,3]") into a list of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda text: [int(piece) for piece in text.strip("[]").split(",")]
)

def create_model():
    """Return a newly compiled LSTM classifier with untrained weights.

    The network is an ``LSTM(100)`` layer declared with ``input_shape=(10, 1)``
    followed by a 6-unit softmax ``Dense`` layer. It is compiled with
    categorical cross-entropy, Adam, and accuracy as the reported metric.
    """
    classifier = Sequential([
        LSTM(100, input_shape=(10, 1)),
        Dense(6, activation='softmax'),
    ])
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Hold out 20% of the rows as the test set, stratified on 'speech_register' so
# the train and test partitions keep the same register proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

# 10-fold stratified cross-validation on the training partition. Every fold
# trains a fresh model; the validation metrics only appear in the fit() log.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_no = 1
for train_index, val_index in kf.split(strat_train_set, strat_train_set['speech_register']):
    fold_train = strat_train_set.iloc[train_index]
    fold_val = strat_train_set.iloc[val_index]

    # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
    # (presumably 1-based in the file) and one-hot encoded over 6 classes.
    X_train = np.array(fold_train['sequence'].tolist()).reshape(-1, 10, 1)
    y_train = to_categorical(fold_train['target'] - 1, num_classes=6)
    X_val = np.array(fold_val['sequence'].tolist()).reshape(-1, 10, 1)
    y_val = to_categorical(fold_val['target'] - 1, num_classes=6)

    model = create_model()

    print(f"Training on Fold {fold_no}")
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)
    fold_no += 1

# Final model: retrain from scratch on the whole training partition; the fold
# models above are discarded.
X_train_full = np.array(strat_train_set['sequence'].tolist()).reshape(-1, 10, 1)
y_train_full = to_categorical(strat_train_set['target'] - 1, num_classes=6)
model = create_model()
model.fit(X_train_full, y_train_full, epochs=10, batch_size=64, verbose=1)

# Predict the held-out test rows; the +1 maps the arg-max class position back
# to the label numbering used by 'target'.
X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
predictions = model.predict(X_test)
pred_df = pd.DataFrame({
    'input_sequence': strat_test_set['sequence'].tolist(),
    'predicted_cluster': np.argmax(predictions, axis=1) + 1,
    'actual_cluster': strat_test_set['target'].tolist(),
    'seg_id': strat_test_set['seg_id'].tolist(),
    'speaker_id': strat_test_set['speaker_id'].tolist()
})
pred_df.to_csv('/content/gdrive/My Drive/FixedLSTMPredictionsTrainonEntireDataSet.csv', index=False)



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Saving FixedTrainData.csv to FixedTrainData (2).csv
Training on Fold 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 6
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 7
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 8
Epoch 1/10
Epoch 2/

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

# Load the training table. This cell neither uploads the file nor mounts Drive,
# so 'FixedTrainData.csv' must already be in the working directory.
LSTMData = pd.read_csv('FixedTrainData.csv')

# Each 'sequence' cell is text like "[1,2,3]"; turn it into a list of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda cell: [int(part) for part in cell.strip("[]").split(",")]
)

# Define the model
def create_model():
    """Create the LSTM classifier used for every training run in this cell.

    Returns:
        A compiled ``Sequential`` model: ``LSTM(100)`` declared with
        ``input_shape=(10, 1)``, then a 6-unit softmax ``Dense`` layer. Loss is
        categorical cross-entropy, the optimizer is Adam, and accuracy is tracked.
    """
    net = Sequential([
        LSTM(100, input_shape=(10, 1)),
        Dense(6, activation='softmax'),
    ])
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net

# Stratified 80/20 train/test split on 'speech_register', so both partitions
# keep the same register proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

# 10-fold stratified cross-validation on the training partition; a fresh model
# is trained for every fold.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train_index, val_index in kf.split(strat_train_set, strat_train_set["speech_register"]):
    train_data = strat_train_set.iloc[train_index]
    val_data = strat_train_set.iloc[val_index]

    # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
    # (presumably 1-based in the file) and one-hot encoded over 6 classes.
    X_train = np.array(train_data['sequence'].tolist()).reshape(-1, 10, 1)
    y_train = to_categorical(train_data['target'] - 1, num_classes=6)

    X_val = np.array(val_data['sequence'].tolist()).reshape(-1, 10, 1)
    y_val = to_categorical(val_data['target'] - 1, num_classes=6)

    model = create_model()
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)

# Predict on the test set with the model left over from the LAST fold only;
# the models from the other nine folds are discarded.
X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
predictions = model.predict(X_test)
# +1 maps the arg-max class position back to the label numbering of 'target'.
predicted_clusters = np.argmax(predictions, axis=1) + 1

# Collect predictions next to the identifying columns of each test row.
pred_df = pd.DataFrame({
    'input_sequence': strat_test_set['sequence'].tolist(),
    'predicted_cluster': predicted_clusters,
    'actual_cluster': strat_test_set['target'].tolist(),
    'seg_id': strat_test_set['seg_id'].tolist(),
    'speaker_id': strat_test_set['speaker_id'].tolist(),
    'speech_register': strat_test_set['speech_register'].tolist()
})

# NOTE(review): this cell writes to the Drive mount point but does not mount
# Drive itself; it relies on a mount made elsewhere in the session.
pred_df.to_csv('/content/gdrive/My Drive/FixedLSTMPredictionsNotTrainOnEntireDataset.csv', index=False)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split, StratifiedKFold
import pandas as pd
import numpy as np
from google.colab import files, drive

# Mount Google Drive for the output CSV, then open the Colab upload prompt.
drive.mount('/content/gdrive')
uploaded = files.upload()

# NOTE(review): the file is opened by its fixed name rather than through
# `uploaded`; if the upload is stored under a suffixed name because the file
# already exists, the older copy is the one read here -- verify.
LSTMData = pd.read_csv('FixedTrainData.csv')

# Parse the bracketed, comma-separated text in 'sequence' into lists of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda encoded: [int(value) for value in encoded.strip("[]").split(",")]
)

def create_model():
    """Assemble and compile the sequence classifier.

    Layers: ``LSTM(100)`` declared with ``input_shape=(10, 1)``, then a 6-unit
    softmax ``Dense`` layer. Compiled with categorical cross-entropy, the Adam
    optimizer and the accuracy metric. Each call returns a new model.
    """
    recurrent = LSTM(100, input_shape=(10, 1))
    head = Dense(6, activation='softmax')
    built = Sequential([recurrent, head])
    built.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return built

# Per-speaker split: each speaker's rows are divided 80/20 on their own,
# stratified on 'speech_register', so every speaker appears in both partitions.
train_dataframes = []
test_dataframes = []

unique_speakers = LSTMData['speaker_id'].unique()
for speaker in unique_speakers:
    speaker_data = LSTMData[LSTMData['speaker_id'] == speaker]
    train_speaker, test_speaker = train_test_split(
        speaker_data,
        test_size=0.2,
        stratify=speaker_data['speech_register'],
        random_state=42,
    )
    train_dataframes.append(train_speaker)
    test_dataframes.append(test_speaker)

# Stack the per-speaker pieces into the overall train and test tables.
strat_train_set = pd.concat(train_dataframes)
strat_test_set = pd.concat(test_dataframes)

# 10-fold stratified cross-validation over the combined training table; a new
# model is trained for each fold.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_no = 1
for train_index, val_index in kf.split(strat_train_set, strat_train_set['speech_register']):
    fold_train = strat_train_set.iloc[train_index]
    fold_val = strat_train_set.iloc[val_index]

    # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
    # (presumably 1-based in the file) and one-hot encoded over 6 classes.
    X_train = np.array(fold_train['sequence'].tolist()).reshape(-1, 10, 1)
    y_train = to_categorical(fold_train['target'] - 1, num_classes=6)
    X_val = np.array(fold_val['sequence'].tolist()).reshape(-1, 10, 1)
    y_val = to_categorical(fold_val['target'] - 1, num_classes=6)

    model = create_model()

    print(f"Training on Fold {fold_no}")
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)
    fold_no += 1

# Test-set predictions come from the model trained in the last fold.
X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
predictions = model.predict(X_test)
predicted = predictions.argmax(axis=1) + 1  # back to the numbering of 'target'
pred_df = pd.DataFrame({
    'input_sequence': strat_test_set['sequence'].tolist(),
    'predicted_cluster': predicted,
    'actual_cluster': strat_test_set['target'].tolist(),
    'seg_id': strat_test_set['seg_id'].tolist(),
    'speaker_id': strat_test_set['speaker_id'].tolist()
})
pred_df.to_csv('/content/gdrive/My Drive/EqualNumbersofIDSandADSandEverySpeaker.csv', index=False)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Saving FixedTrainData.csv to FixedTrainData (2).csv
Training on Fold 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 6
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 7
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training on Fold 8
Epoch 1/10
Epoch 2/

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import pandas as pd
import numpy as np
from google.colab import files, drive

# Mount Google Drive (destination of the prediction CSV) and prompt for upload.
drive.mount('/content/gdrive')
uploaded = files.upload()

# The table is read from the working directory by name; `uploaded` is not
# consulted when opening it.
LSTMData = pd.read_csv('FixedTrainData.csv')

# Turn each "[a,b,c]" string in 'sequence' into a list of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda seq_text: [int(number) for number in seq_text.strip("[]").split(",")]
)

def create_model():
    """Return a compiled LSTM model with freshly initialised weights.

    Architecture: ``LSTM(100)`` declared with ``input_shape=(10, 1)`` feeding a
    6-unit softmax ``Dense`` layer. Training setup: categorical cross-entropy
    loss, Adam optimizer, accuracy metric.
    """
    compile_options = {
        'loss': 'categorical_crossentropy',
        'optimizer': 'adam',
        'metrics': ['accuracy'],
    }
    lstm_model = Sequential([
        LSTM(100, input_shape=(10, 1)),
        Dense(6, activation='softmax'),
    ])
    lstm_model.compile(**compile_options)
    return lstm_model

# Stratified 80/20 train/test split on 'speech_register', so both partitions
# keep the same register proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

# 10-fold stratified cross-validation on the training partition. Each fold
# gets its own fresh model; validation metrics only appear in the fit() log.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_no = 1
for train_index, val_index in kf.split(strat_train_set, strat_train_set['speech_register']):
    fold_train = strat_train_set.iloc[train_index]
    fold_val = strat_train_set.iloc[val_index]

    # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
    # (presumably 1-based in the file) and one-hot encoded over 6 classes.
    X_train = np.array(fold_train['sequence'].tolist()).reshape(-1, 10, 1)
    y_train = to_categorical(fold_train['target'] - 1, num_classes=6)
    X_val = np.array(fold_val['sequence'].tolist()).reshape(-1, 10, 1)
    y_val = to_categorical(fold_val['target'] - 1, num_classes=6)

    model = create_model()

    print(f"Training on Fold {fold_no}")
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)
    fold_no += 1

# Final model: a new network trained on the whole training partition. The
# per-fold models are not reused.
X_train_full = np.array(strat_train_set['sequence'].tolist()).reshape(-1, 10, 1)
y_train_full = to_categorical(strat_train_set['target'] - 1, num_classes=6)
model = create_model()
model.fit(X_train_full, y_train_full, epochs=10, batch_size=64, verbose=1)

# Predict the held-out rows; +1 maps the arg-max class position back to the
# label numbering used by 'target'.
X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
predictions = model.predict(X_test)
pred_df = pd.DataFrame({
    'input_sequence': strat_test_set['sequence'].tolist(),
    'predicted_cluster': np.argmax(predictions, axis=1) + 1,
    'actual_cluster': strat_test_set['target'].tolist(),
    'seg_id': strat_test_set['seg_id'].tolist(),
    'speaker_id': strat_test_set['speaker_id'].tolist()
})
pred_df.to_csv('/content/gdrive/My Drive/FixedLSTMPredictions.csv', index=False)


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from google.colab import files, drive

# Mount Google Drive and open the Colab upload prompt.
drive.mount('/content/gdrive')
uploaded = files.upload()

# Load the 6-cluster training table by its fixed file name.
# NOTE(review): `uploaded` is not used for the read; if the upload is stored
# under a suffixed name because the file already exists, an older copy is
# opened here -- verify.
LSTMData = pd.read_csv('TrainData6Clusters.csv')

# Parse the bracketed, comma-separated text in 'sequence' into lists of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda raw_seq: [int(entry) for entry in raw_seq.strip("[]").split(",")]
)

# Define the model
def create_model():
    """Build the 6-class LSTM classifier and compile it.

    Returns:
        A new ``Sequential`` model: ``LSTM(100)`` declared with
        ``input_shape=(10, 1)``, then ``Dense(6)`` with softmax activation.
        Compiled with categorical cross-entropy, Adam and accuracy.
    """
    layer_stack = [
        LSTM(100, input_shape=(10, 1)),
        Dense(6, activation='softmax'),
    ]
    six_way = Sequential(layer_stack)
    six_way.compile(
        metrics=['accuracy'],
        optimizer='adam',
        loss='categorical_crossentropy',
    )
    return six_way

all_predictions = []  # one prediction DataFrame per outer train/test split

# Five independent stratified 80/20 train/test splits on 'speech_register'.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in sss.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

    # 10-fold stratified cross-validation inside this split's training rows.
    # The fold indices get their own names so they do not overwrite the outer
    # loop's train_index.
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for fold_train_index, fold_val_index in kf.split(strat_train_set, strat_train_set["speech_register"]):
        train_data = strat_train_set.iloc[fold_train_index]
        val_data = strat_train_set.iloc[fold_val_index]

        # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
        # (presumably 1-based in the file) and one-hot encoded over 6 classes.
        X_train = np.array(train_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_train = to_categorical(train_data['target'] - 1, num_classes=6)

        X_val = np.array(val_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_val = to_categorical(val_data['target'] - 1, num_classes=6)

        model = create_model()
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)

    # Predict this split's test rows with the model from the LAST fold only.
    X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
    predictions = model.predict(X_test)
    # +1 maps the arg-max class position back to the numbering of 'target'.
    predicted_clusters = np.argmax(predictions, axis=1) + 1

    # Predictions next to the identifying columns of each test row.
    pred_df = pd.DataFrame({
        'input_sequence': strat_test_set['sequence'].tolist(),
        'predicted_cluster': predicted_clusters,
        'actual_cluster': strat_test_set['target'].tolist(),
        'seg_id': strat_test_set['seg_id'].tolist(),
        'speaker_id': strat_test_set['speaker_id'].tolist(),
        'speech_register': strat_test_set['speech_register'].tolist()
    })

    all_predictions.append(pred_df)

# Stack the five per-split tables. No column records which split a row came
# from, and the same row can appear in more than one split's test set.
final_predictions_df = pd.concat(all_predictions)

# Write the combined table locally and trigger a browser download.
final_predictions_df.to_csv('Unbalanced_6_Predictions.csv', index=False)
files.download('Unbalanced_6_Predictions.csv')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Saving TrainData6Clusters.csv to TrainData6Clusters (1).csv
Saving TrainData12Clusters.csv to TrainData12Clusters.csv
Saving TrainData24Clusters.csv to TrainData24Clusters.csv
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epo

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from google.colab import files, drive

# Mounting Drive and uploading are disabled in this cell:
# drive.mount('/content/gdrive')
# uploaded = files.upload()
# The 12-cluster CSV must therefore already be in the working directory.
LSTMData = pd.read_csv('TrainData12Clusters.csv')

# Parse the bracketed, comma-separated text in 'sequence' into lists of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda stored: [int(code) for code in stored.strip("[]").split(",")]
)

# Define the model
def create_model():
    """Build the 12-class LSTM classifier and compile it.

    Returns:
        A new ``Sequential`` model: ``LSTM(100)`` declared with
        ``input_shape=(10, 1)``, then ``Dense(12)`` with softmax activation
        (one output per cluster). Compiled with categorical cross-entropy,
        Adam and accuracy.
    """
    twelve_way = Sequential([
        LSTM(100, input_shape=(10, 1)),
        Dense(12, activation='softmax'),  # output width = 12 clusters
    ])
    twelve_way.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return twelve_way

all_predictions = []  # one prediction DataFrame per outer train/test split

# Five independent stratified 80/20 train/test splits on 'speech_register'.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in sss.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

    # 10-fold stratified cross-validation inside this split's training rows.
    # The fold indices get their own names so they do not overwrite the outer
    # loop's train_index.
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for fold_train_index, fold_val_index in kf.split(strat_train_set, strat_train_set["speech_register"]):
        train_data = strat_train_set.iloc[fold_train_index]
        val_data = strat_train_set.iloc[fold_val_index]

        # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
        # (presumably 1-based in the file) and one-hot encoded over 12 classes
        # to match the Dense(12) output layer.
        X_train = np.array(train_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_train = to_categorical(train_data['target'] - 1, num_classes=12)

        X_val = np.array(val_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_val = to_categorical(val_data['target'] - 1, num_classes=12)

        model = create_model()
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)

    # Predict this split's test rows with the model from the LAST fold only.
    X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
    predictions = model.predict(X_test)
    # +1 maps the arg-max class position back to the numbering of 'target'.
    predicted_clusters = np.argmax(predictions, axis=1) + 1

    # Predictions next to the identifying columns of each test row.
    pred_df = pd.DataFrame({
        'input_sequence': strat_test_set['sequence'].tolist(),
        'predicted_cluster': predicted_clusters,
        'actual_cluster': strat_test_set['target'].tolist(),
        'seg_id': strat_test_set['seg_id'].tolist(),
        'speaker_id': strat_test_set['speaker_id'].tolist(),
        'speech_register': strat_test_set['speech_register'].tolist()
    })

    all_predictions.append(pred_df)

# Stack the five per-split tables. No column records which split a row came
# from, and the same row can appear in more than one split's test set.
final_predictions_df = pd.concat(all_predictions)

# Write the combined table locally and trigger a browser download.
final_predictions_df.to_csv('Unbalanced_12_Predictions.csv', index=False)
files.download('Unbalanced_12_Predictions.csv')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from google.colab import files, drive


# Uploading is disabled in this cell:
# uploaded = files.upload()
# The 24-cluster CSV must therefore already be in the working directory.
LSTMData = pd.read_csv('TrainData24Clusters.csv')

# Parse the bracketed, comma-separated text in 'sequence' into lists of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda serialized: [int(label) for label in serialized.strip("[]").split(",")]
)

# Define the model
def create_model():
    """Build the 24-class LSTM classifier and compile it.

    Returns:
        A new ``Sequential`` model: ``LSTM(100)`` declared with
        ``input_shape=(10, 1)``, then ``Dense(24)`` with softmax activation
        (one output per cluster). Compiled with categorical cross-entropy,
        Adam and accuracy.
    """
    encoder = LSTM(100, input_shape=(10, 1))
    output_layer = Dense(24, activation='softmax')  # output width = 24 clusters
    wide_model = Sequential([encoder, output_layer])
    wide_model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return wide_model

all_predictions = []  # one prediction DataFrame per outer train/test split

# Five independent stratified 80/20 train/test splits on 'speech_register'.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in sss.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

    # 10-fold stratified cross-validation inside this split's training rows.
    # The fold indices get their own names so they do not overwrite the outer
    # loop's train_index.
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for fold_train_index, fold_val_index in kf.split(strat_train_set, strat_train_set["speech_register"]):
        train_data = strat_train_set.iloc[fold_train_index]
        val_data = strat_train_set.iloc[fold_val_index]

        # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
        # (presumably 1-based in the file) and one-hot encoded over 24 classes
        # to match the Dense(24) output layer.
        X_train = np.array(train_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_train = to_categorical(train_data['target'] - 1, num_classes=24)

        X_val = np.array(val_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_val = to_categorical(val_data['target'] - 1, num_classes=24)

        model = create_model()
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)

    # Predict this split's test rows with the model from the LAST fold only.
    X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
    predictions = model.predict(X_test)
    # +1 maps the arg-max class position back to the numbering of 'target'.
    predicted_clusters = np.argmax(predictions, axis=1) + 1

    # Predictions next to the identifying columns of each test row.
    pred_df = pd.DataFrame({
        'input_sequence': strat_test_set['sequence'].tolist(),
        'predicted_cluster': predicted_clusters,
        'actual_cluster': strat_test_set['target'].tolist(),
        'seg_id': strat_test_set['seg_id'].tolist(),
        'speaker_id': strat_test_set['speaker_id'].tolist(),
        'speech_register': strat_test_set['speech_register'].tolist()
    })

    all_predictions.append(pred_df)

# Stack the five per-split tables. No column records which split a row came
# from, and the same row can appear in more than one split's test set.
final_predictions_df = pd.concat(all_predictions)

# Write the combined table locally and trigger a browser download.
final_predictions_df.to_csv('Unbalanced_24_Predictions.csv', index=False)
files.download('Unbalanced_24_Predictions.csv')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Trigger a browser download of a predictions CSV from the working directory.
# NOTE(review): this cell imports nothing and does not create the file; it
# relies on `files` and on the CSV already existing in the session.
files.download('Speech_Register_Balanced_24_Predictions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from google.colab import files, drive


# Open the Colab upload prompt. The return value is kept in `uploaded`, but the
# table is read below by its fixed file name.
uploaded = files.upload()


# Load the register-balanced 6-cluster training table.
LSTMData = pd.read_csv('Speech_RegisterBalancedTrainData6Clusters.csv')

# Parse the bracketed, comma-separated text in 'sequence' into lists of ints.
LSTMData['sequence'] = LSTMData['sequence'].map(
    lambda packed: [int(elem) for elem in packed.strip("[]").split(",")]
)

# Define the model
def create_model():
    """Build the 6-class LSTM classifier and compile it.

    Returns:
        A new ``Sequential`` model: ``LSTM(100)`` declared with
        ``input_shape=(10, 1)``, then ``Dense(6)`` with softmax activation.
        Compiled with categorical cross-entropy, Adam and accuracy.
    """
    balanced_model = Sequential([
        LSTM(100, input_shape=(10, 1)),
        Dense(6, activation='softmax'),  # output width is 6 classes
    ])
    balanced_model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return balanced_model

all_predictions = []  # one prediction DataFrame per outer train/test split

# Five independent stratified 80/20 train/test splits on 'speech_register'.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for train_index, test_index in sss.split(LSTMData, LSTMData["speech_register"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc would treat them as index labels, which only matches for a 0..n-1 index.
    strat_train_set = LSTMData.iloc[train_index]
    strat_test_set = LSTMData.iloc[test_index]

    # 10-fold stratified cross-validation inside this split's training rows.
    # The fold indices get their own names so they do not overwrite the outer
    # loop's train_index.
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for fold_train_index, fold_val_index in kf.split(strat_train_set, strat_train_set["speech_register"]):
        train_data = strat_train_set.iloc[fold_train_index]
        val_data = strat_train_set.iloc[fold_val_index]

        # Inputs are reshaped to (n_samples, 10, 1). 'target' is shifted by -1
        # (presumably 1-based in the file) and one-hot encoded over 6 classes.
        # The width must equal the model's Dense(6) output; the previous value
        # of 24 produced labels whose shape did not match the model output.
        X_train = np.array(train_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_train = to_categorical(train_data['target'] - 1, num_classes=6)

        X_val = np.array(val_data['sequence'].tolist()).reshape(-1, 10, 1)
        y_val = to_categorical(val_data['target'] - 1, num_classes=6)

        model = create_model()
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=1)

    # Predict this split's test rows with the model from the LAST fold only.
    X_test = np.array(strat_test_set['sequence'].tolist()).reshape(-1, 10, 1)
    predictions = model.predict(X_test)
    # +1 maps the arg-max class position back to the numbering of 'target'.
    predicted_clusters = np.argmax(predictions, axis=1) + 1

    # Predictions next to the identifying columns of each test row.
    pred_df = pd.DataFrame({
        'input_sequence': strat_test_set['sequence'].tolist(),
        'predicted_cluster': predicted_clusters,
        'actual_cluster': strat_test_set['target'].tolist(),
        'seg_id': strat_test_set['seg_id'].tolist(),
        'speaker_id': strat_test_set['speaker_id'].tolist(),
        'speech_register': strat_test_set['speech_register'].tolist()
    })

    all_predictions.append(pred_df)

# Stack the five per-split tables. No column records which split a row came
# from, and the same row can appear in more than one split's test set.
final_predictions_df = pd.concat(all_predictions)

# Save under a name that matches the 6-cluster input. The earlier "24" name
# mislabelled this output and would overwrite a 24-cluster result of that name.
final_predictions_df.to_csv('Speech_Register_Balanced_6_Predictions.csv', index=False)