In [None]:
import torch, torchaudio
from google.colab import drive
import os, pathlib, glob
import opensmile
import pandas as pd
from tqdm import tqdm
import warnings
# Hide UserWarnings whose message mentions TorchCodec or a deprecated
# StreamReader; every other warning still surfaces.
for _noisy_message_pattern in (".*TorchCodec.*", ".*StreamReader.*deprecated.*"):
    warnings.filterwarnings(
        "ignore",
        message=_noisy_message_pattern,
        category=UserWarning,
    )

# Every data path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Sequencing dataset


In [None]:
import os

import numpy as np

# --- Inputs ---------------------------------------------------------------
SPLITCLIPS = "/content/drive/MyDrive/WuHaoAllenCentad/stress_id_wav_filtered_split"
FEATURESCSV = "/content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features_split.csv"
fdf = pd.read_csv(FEATURESCSV)
stressidtestp = "/content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv"
stressidtest = pd.read_csv(stressidtestp)
stressidtrainp = "/content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv"
stressidtrain = pd.read_csv(stressidtrainp)

# Every column except the identifiers is treated as a model feature. The
# names are sorted so the feature order is fixed and can be stored alongside
# the sequences.
id_and_categorical_cols = ["file", "basename"]
opensmile_feature_cols = [col for col in fdf.columns
                          if col not in id_and_categorical_cols]
sorted_opensmile_features = sorted(opensmile_feature_cols)

# Index the feature table by clip path once, instead of scanning the whole
# frame for every file. keep="first" mirrors the previous `.iloc[0]` choice
# when a path appears more than once.
features_by_path = fdf.drop_duplicates(subset="file", keep="first").set_index("file")

# --- Collect one record per split clip ------------------------------------
# File names are parsed as "<id part 1>_<id part 2>_s<start>...": the first
# two underscore-separated tokens form the ID matched against the
# 'subject/task' column, and the third token (with "s" stripped) is an
# integer start time. Dividing it by 10 gives the segment index
# (presumably 10-second segments -- TODO confirm against the splitting step).
all_segments_data = []
skipped_files = []
print(f"Iterating through files in: {SPLITCLIPS}")
# Sorted listing keeps the order of the resulting sequences deterministic.
for filename in sorted(os.listdir(SPLITCLIPS)):
    filepath = os.path.join(SPLITCLIPS, filename)
    if not os.path.isfile(filepath):
        continue

    name_parts = filename.split("_")
    try:
        start_time = int(name_parts[2].strip("s"))
    except (IndexError, ValueError):
        # Not a clip produced by the splitter (or an unexpected name): skip
        # it rather than abort the whole cell.
        skipped_files.append((filename, "unexpected file name"))
        continue

    if filepath not in features_by_path.index:
        # Previously this raised IndexError from `.iloc[0]` on an empty frame.
        skipped_files.append((filename, "no row in features CSV"))
        continue

    all_segments_data.append({
        "original_file_id": "_".join(name_parts[:2]),
        "segment_index": start_time // 10,
        "features": features_by_path.loc[filepath],
    })

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that could not be used:")
    for skipped_name, reason in skipped_files[:10]:
        print(f"  {skipped_name}: {reason}")

# --- Group clips by recording, keep labelled ones, order by time ----------
grouped_by_original_file = {}
for segment in all_segments_data:
    grouped_by_original_file.setdefault(segment["original_file_id"], []).append(segment)

combined_stress_df = pd.concat([stressidtrain, stressidtest])
# If an ID occurs on several rows, the last row's label is kept.
stress_labels_map = combined_stress_df.set_index('subject/task')['binary-stress'].to_dict()

grouped_by_original_file = {
    orig_id: sorted(segments, key=lambda seg: seg["segment_index"])
    for orig_id, segments in grouped_by_original_file.items()
    if orig_id in stress_labels_map
}

# --- Build one (num_segments, num_features) array per recording -----------
X_sequences = []
y_sequences = []
original_file_ids_list = []

for orig_id, segments in grouped_by_original_file.items():
    current_X = [
        [segment["features"][col] for col in sorted_opensmile_features]
        for segment in segments
    ]
    if current_X:
        X_sequences.append(np.array(current_X))
        y_sequences.append(np.array([stress_labels_map[orig_id]], dtype=np.float32))
        original_file_ids_list.append(orig_id)

print(f"Total number of sequences (original audio files): {len(X_sequences)}")
if X_sequences:
    print(f"Shape of X[0] sequence (features): {X_sequences[0].shape} (sequence_length, num_opensmile_features)")
    print(f"Shape of y[0] sequence (label): {y_sequences[0].shape} (1,)")

# Plain dict (not a DataFrame) bundling the sequences with their metadata.
newdf = {
    "X_sequences": X_sequences,
    "y_sequences": y_sequences,
    "original_file_ids": original_file_ids_list,
    "feature_names": sorted_opensmile_features,
    "label_names": ["binary_stress"]
}

print(f"# of original audio files processed: {len(newdf['X_sequences'])}")
if newdf['X_sequences']:
    print(f"# of features per segment: {len(newdf['feature_names'])}")
    print(f"# of labels per sequence: {len(newdf['label_names'])}")
    print(f"Shape of features for the first file (sequence): {newdf['X_sequences'][0].shape}")
    print(f"Shape of label for the first file: {newdf['y_sequences'][0].shape}")

# Split into train and test based on stressidtrainbalanced2.csv and stressidtest2.csv

In [None]:
# Route every built sequence to the split whose CSV lists its ID. Each
# sequence lands in at most one split; an ID present in both CSVs goes to
# train because that branch is checked first.
train_ids = set(stressidtrain['subject/task'].values)
test_ids = set(stressidtest['subject/task'].values)

X_train, y_train, original_file_ids_train = [], [], []
X_test, y_test, original_file_ids_test = [], [], []

for file_id, x_seq, y_seq in zip(newdf['original_file_ids'],
                                 newdf['X_sequences'],
                                 newdf['y_sequences']):
    if file_id in train_ids:
        destinations = (X_train, y_train, original_file_ids_train)
    elif file_id in test_ids:
        destinations = (X_test, y_test, original_file_ids_test)
    else:
        print(f"{file_id} not found, skipping.")
        continue

    for destination, item in zip(destinations, (x_seq, y_seq, file_id)):
        destination.append(item)

# Both splits share the feature/label metadata of the full set.
traindf = dict(
    X_sequences=X_train,
    y_sequences=y_train,
    original_file_ids=original_file_ids_train,
    feature_names=newdf['feature_names'],
    label_names=newdf['label_names'],
)

testdf = dict(
    X_sequences=X_test,
    y_sequences=y_test,
    original_file_ids=original_file_ids_test,
    feature_names=newdf['feature_names'],
    label_names=newdf['label_names'],
)

print(f"Number of sequences in traindf: {len(traindf['X_sequences'])}")
print(f"Number of sequences in testdf: {len(testdf['X_sequences'])}")

# Rebuild train/test splits in CSV row order, then create Dataset loader / batcher

In [None]:
import numpy as np

# ID -> (features, label) for every sequence that was built.
newdf_lookup = dict(
    zip(newdf['original_file_ids'],
        zip(newdf['X_sequences'], newdf['y_sequences']))
)


def _collect_split(file_ids, split_label):
    """Gather sequences for `file_ids` in the given order.

    IDs missing from `newdf_lookup` are reported and skipped. `split_label`
    only affects the wording of that warning.
    """
    features, labels, kept_ids = [], [], []
    for file_id in file_ids:
        entry = newdf_lookup.get(file_id)
        if entry is None:
            print(f"Warning: {split_label} file_id '{file_id}' not found in newdf_lookup. Skipping.")
            continue
        x_seq, y_seq = entry
        features.append(x_seq)
        labels.append(y_seq)
        kept_ids.append(file_id)
    return features, labels, kept_ids


# Training IDs are taken row by row, so an ID repeated in the CSV contributes
# one sequence per row; test IDs are de-duplicated first.
X_train, y_train, original_file_ids_train = _collect_split(
    stressidtrain['subject/task'], "Training")
X_test, y_test, original_file_ids_test = _collect_split(
    stressidtest['subject/task'].unique(), "Test")

traindf = dict(
    X_sequences=X_train,
    y_sequences=y_train,
    original_file_ids=original_file_ids_train,
    feature_names=newdf['feature_names'],
    label_names=newdf['label_names'],
)

testdf = dict(
    X_sequences=X_test,
    y_sequences=y_test,
    original_file_ids=original_file_ids_test,
    feature_names=newdf['feature_names'],
    label_names=newdf['label_names'],
)

print(f"Number of sequences in traindf: {len(traindf['X_sequences'])}")
print(f"Number of sequences in testdf: {len(testdf['X_sequences'])}")

# Leakage guard: no ID may be listed in both CSVs.
unique_train_original_ids = set(stressidtrain['subject/task'].unique())
unique_test_original_ids = set(stressidtest['subject/task'].unique())

shared_ids = unique_train_original_ids & unique_test_original_ids
assert len(shared_ids) == 0, "Train and Test sets have overlapping original_file_ids!"
print("Train and Test sets have no overlapping original_file_ids based on unique subject/task identifiers.")

Number of sequences in traindf: 440
Number of sequences in testdf: 70
Train and Test sets have no overlapping original_file_ids based on unique subject/task identifiers.


In [None]:
import joblib
import os

# Persist both splits so later cells can start from these files.
TRAIN_DATA_FILE = "/content/drive/MyDrive/WuHaoAllenCentad/traindf_sequenced_data.joblib"
TEST_DATA_FILE = "/content/drive/MyDrive/WuHaoAllenCentad/testdf_sequenced_data.joblib"

_split_exports = ((traindf, TRAIN_DATA_FILE), (testdf, TEST_DATA_FILE))

# Make sure the target folders exist before anything is written.
for _, export_path in _split_exports:
    os.makedirs(os.path.dirname(export_path), exist_ok=True)

for split_payload, export_path in _split_exports:
    joblib.dump(split_payload, export_path)

print(f"Training data saved to: {TRAIN_DATA_FILE}")
print(f"Testing data saved to: {TEST_DATA_FILE}")

Training data saved to: /content/drive/MyDrive/WuHaoAllenCentad/traindf_sequenced_data.joblib
Testing data saved to: /content/drive/MyDrive/WuHaoAllenCentad/testdf_sequenced_data.joblib


In [1]:
import joblib
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from google.colab import drive
# Entry point for a fresh session: mount Drive and reload the saved splits.
drive.mount('/content/drive')
TRAIN_DATA_FILE = "/content/drive/MyDrive/WuHaoAllenCentad/traindf_sequenced_data.joblib"
TEST_DATA_FILE = "/content/drive/MyDrive/WuHaoAllenCentad/testdf_sequenced_data.joblib"

# joblib files are pickle-based; only load files written by this notebook.
traindf, testdf = (
    joblib.load(saved_path) for saved_path in (TRAIN_DATA_FILE, TEST_DATA_FILE)
)

print(f"Training data loaded from: {TRAIN_DATA_FILE}")
print(f"Testing data loaded from: {TEST_DATA_FILE}")

class StressDataset(Dataset):
    """Pairs each feature sequence with its label and serves them as float tensors.

    Args:
        X_sequences: indexable collection of NumPy arrays, one per sample.
        y_sequences: indexable collection of NumPy arrays, aligned with
            ``X_sequences`` by position.
    """

    def __init__(self, X_sequences, y_sequences):
        self.X_sequences = X_sequences
        self.y_sequences = y_sequences

    @staticmethod
    def _as_float_tensor(array):
        """Wrap a NumPy array as a float32 tensor."""
        return torch.from_numpy(array).float()

    def __len__(self):
        # The number of samples is defined by the feature sequences.
        return len(self.X_sequences)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx`` as float32 tensors."""
        features = self._as_float_tensor(self.X_sequences[idx])
        label = self._as_float_tensor(self.y_sequences[idx])
        return features, label

def custom_collate_fn(batch):
    """Collate ``(features, label)`` pairs of differing lengths into one batch.

    Feature tensors are right-padded with 0.0 up to the longest sequence in
    the batch (batch dimension first); labels are stacked along a new first
    dimension.

    Returns:
        ``(padded_features, stacked_labels)``
    """
    feature_tensors = [features for features, _ in batch]
    label_tensors = [label for _, label in batch]

    padded_features = pad_sequence(feature_tensors, batch_first=True, padding_value=0.0)
    stacked_labels = torch.stack(label_tensors, dim=0)
    return padded_features, stacked_labels

batch_size = 32

train_dataset = StressDataset(traindf['X_sequences'], traindf['y_sequences'])
test_dataset = StressDataset(testdf['X_sequences'], testdf['y_sequences'])

# Both loaders share the batch size and the padding collate function; only
# the training loader reshuffles its samples.
_loader_options = {"batch_size": batch_size, "collate_fn": custom_collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

print(f"\nNumber of sequences in train_dataset: {len(train_dataset)}")
print(f"Number of sequences in test_dataset: {len(test_dataset)}")
print(f"Train DataLoader created with {len(train_loader)} batches.")
print(f"Test DataLoader created with {len(test_loader)} batches.")

Mounted at /content/drive
Training data loaded from: /content/drive/MyDrive/WuHaoAllenCentad/traindf_sequenced_data.joblib
Testing data loaded from: /content/drive/MyDrive/WuHaoAllenCentad/testdf_sequenced_data.joblib

Number of sequences in train_dataset: 440
Number of sequences in test_dataset: 70
Train DataLoader created with 14 batches.
Test DataLoader created with 3 batches.


# Model definition

In [2]:
import torch.nn as nn

class StressLSTM(nn.Module):
    """LSTM sequence classifier that emits one logit per input sequence.

    The final hidden state of the top LSTM layer is passed through dropout
    and a linear layer. The output is a raw logit: apply ``torch.sigmoid``
    to get a probability, or train with ``BCEWithLogitsLoss``.

    Args:
        input_dim: number of features per time step.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when True.
        dropout_prob: dropout before the linear layer; also applied between
            LSTM layers when ``num_layers > 1``.
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=1, bidirectional=False, dropout_prob=0.2):
        super().__init__()
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # nn.LSTM only applies dropout between layers, so it is switched
            # off for a single-layer network.
            dropout=dropout_prob if num_layers > 1 else 0
        )
        # Both directions are concatenated when bidirectional.
        self.output_dim = hidden_dim * (2 if bidirectional else 1)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(self.output_dim, 1)

    def forward(self, x, lengths=None):
        """Compute one logit per sequence.

        Args:
            x: float tensor of shape ``(batch, seq_len, input_dim)``.
            lengths: optional true length of every sequence in ``x`` (list
                or 1-D integer tensor, any order). When given, the batch is
                packed so the hidden state is read at each sequence's real
                last step instead of after its padding. When omitted, every
                time step of ``x`` -- padding included -- is fed to the LSTM.

        Returns:
            Tensor of shape ``(batch, 1)`` holding raw logits.
        """
        if lengths is None:
            _, (h_n, _) = self.lstm(x)
        else:
            # pack_padded_sequence requires the lengths on the CPU.
            lengths_cpu = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                x, lengths_cpu, batch_first=True, enforce_sorted=False
            )
            _, (h_n, _) = self.lstm(packed)

        if self.bidirectional:
            # The last two entries of h_n are the top layer's forward and
            # backward final states.
            last_hidden = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        else:
            last_hidden = h_n[-1, :, :]

        last_hidden = self.dropout(last_hidden)

        logits = self.fc(last_hidden)
        return logits

# Training model

In [17]:
import torch.optim as optim
import torch.nn as nn
import numpy as np

# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling
# and dropout masks repeat from run to run.
SEED = 42
torch.manual_seed(SEED)

input_dim = len(traindf['feature_names'])

model = StressLSTM(input_dim=input_dim, hidden_dim=128, num_layers=3, bidirectional=False, dropout_prob=0.2)

# Class counts over the training labels, used to weight the positive class.
y_train_flat = np.concatenate(traindf['y_sequences'])
num_pos = np.sum(y_train_flat == 1)
num_neg = np.sum(y_train_flat == 0)

print(f"# stress: {num_pos}")
print(f"# non stressed {num_neg}")

# pos_weight = negatives / positives; falls back to 1.0 when there are no
# positive samples to avoid dividing by zero.
pos_weight_value = torch.tensor([num_neg / num_pos], dtype=torch.float32) if num_pos > 0 else torch.tensor([1.0])

print(f"Calculated positive class weight: {pos_weight_value.item():.4f}")

# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_value)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 50
print(f"Starting training for {num_epochs} epochs...")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        preds = model(batch_X)

        loss = criterion(preds, batch_y)

        loss.backward()
        optimizer.step()
        # Weight the batch mean by batch size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        epoch_loss += loss.item() * batch_X.size(0)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_dataset):.4f}")

print("Training complete.")

# stress: 220
# non stressed 220
Calculated positive class weight: 1.0000
Starting training for 50 epochs...
Epoch 1/50, Loss: 0.6968
Epoch 2/50, Loss: 0.6799
Epoch 3/50, Loss: 0.6297
Epoch 4/50, Loss: 0.5871
Epoch 5/50, Loss: 0.5610
Epoch 6/50, Loss: 0.4952
Epoch 7/50, Loss: 0.4384
Epoch 8/50, Loss: 0.4042
Epoch 9/50, Loss: 0.3408
Epoch 10/50, Loss: 0.3179
Epoch 11/50, Loss: 0.4014
Epoch 12/50, Loss: 0.2893
Epoch 13/50, Loss: 0.2007
Epoch 14/50, Loss: 0.2340
Epoch 15/50, Loss: 0.1966
Epoch 16/50, Loss: 0.1372
Epoch 17/50, Loss: 0.1391
Epoch 18/50, Loss: 0.2490
Epoch 19/50, Loss: 0.1516
Epoch 20/50, Loss: 0.1241
Epoch 21/50, Loss: 0.1045
Epoch 22/50, Loss: 0.1028
Epoch 23/50, Loss: 0.1139
Epoch 24/50, Loss: 0.1711
Epoch 25/50, Loss: 0.1683
Epoch 26/50, Loss: 0.1023
Epoch 27/50, Loss: 0.0847
Epoch 28/50, Loss: 0.0994
Epoch 29/50, Loss: 0.1181
Epoch 30/50, Loss: 0.0999
Epoch 31/50, Loss: 0.0519
Epoch 32/50, Loss: 0.0434
Epoch 33/50, Loss: 0.0882
Epoch 34/50, Loss: 0.0776
Epoch 35/50, Los

## Evaluate LSTM Model

In [10]:
import torch
import os

# Only the weights (state dict) are stored; the architecture has to be
# rebuilt in code before they can be loaded again.
MODEL_SAVE_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_modellayer3.pth"

_checkpoint_dir = os.path.dirname(MODEL_SAVE_PATH)
os.makedirs(_checkpoint_dir, exist_ok=True)

_trained_weights = model.state_dict()
torch.save(_trained_weights, MODEL_SAVE_PATH)
print(f"Trained model saved to: {MODEL_SAVE_PATH}")

Trained model saved to: /content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_modellayer3.pth


In [19]:
import torch
import os
import joblib

MODEL_SAVE_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_modellayer3.pth"

# The constructor arguments must match the ones used for training, otherwise
# the saved weights will not fit the layers.
input_dim = len(traindf['feature_names'])
model = StressLSTM(input_dim=input_dim, hidden_dim=128, num_layers=3, bidirectional=False, dropout_prob=0.2)

_saved_weights = torch.load(MODEL_SAVE_PATH)
model.load_state_dict(_saved_weights)
model.eval()  # inference mode: dropout disabled
print(f"model loaded from: {MODEL_SAVE_PATH}")


model loaded from: /content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_modellayer3.pth


In [20]:
from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import numpy as np

# Score the held-out split: gather per-sequence probabilities and labels,
# threshold the probabilities, then report the metrics.
model.eval()
all_probs, all_labels = [], []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_probs = torch.sigmoid(model(batch_X))
        all_probs.extend(batch_probs.cpu().numpy().ravel())
        all_labels.extend(batch_y.cpu().numpy().ravel())
all_probs = np.array(all_probs)
all_labels = np.array(all_labels)

# Probabilities at or above the threshold count as class 1.
final_threshold = 0.5
all_preds = (all_probs >= final_threshold).astype(int)

bal_acc = balanced_accuracy_score(all_labels, all_preds)
f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)
# ROC-AUC is threshold-free, so it is computed on the probabilities.
auc_roc = roc_auc_score(all_labels, all_probs)

precision_class1, precision_class0 = (
    precision_score(all_labels, all_preds, pos_label=class_label, zero_division=0)
    for class_label in (1, 0)
)
recall_class1, recall_class0 = (
    recall_score(all_labels, all_preds, pos_label=class_label, zero_division=0)
    for class_label in (1, 0)
)

_report_lines = [
    "--- Comprehensive Evaluation Metrics ---",
    f"Balanced Accuracy: {bal_acc:.4f}",
    f"F1 (macro): {f1_macro:.4f}",
    f"AUC (ROC-AUC): {auc_roc:.4f}",
    f"\nPrecision (Class 1 - Stressed): {precision_class1:.4f}",
    f"Recall (Class 1 - Stressed): {recall_class1:.4f}",
    f"Precision (Class 0 - Non-Stressed): {precision_class0:.4f}",
    f"Recall (Class 0 - Non-Stressed): {recall_class0:.4f}",
]
for _report_line in _report_lines:
    print(_report_line)

--- Comprehensive Evaluation Metrics ---
Balanced Accuracy: 0.6560
F1 (macro): 0.6578
AUC (ROC-AUC): 0.7317

Precision (Class 1 - Stressed): 0.6939
Recall (Class 1 - Stressed): 0.8293
Precision (Class 0 - Non-Stressed): 0.6667
Recall (Class 0 - Non-Stressed): 0.4828


# Getting predicted probabilities for ensemble

In [None]:
import torch
import os

# Writes the current `model` weights to stress_lstm_model.pth, a different
# file from the stress_lstm_modellayer3.pth checkpoint loaded elsewhere in
# this notebook. `model` must already exist in the kernel.
MODEL_SAVE_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_model.pth"

_alt_checkpoint_dir = os.path.dirname(MODEL_SAVE_PATH)
os.makedirs(_alt_checkpoint_dir, exist_ok=True)

torch.save(model.state_dict(), MODEL_SAVE_PATH)

print(f"Trained model saved to: {MODEL_SAVE_PATH}")

NameError: name 'model' is not defined

In [21]:
import torch
import numpy as np

MODEL_SAVE_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_modellayer3.pth"

# Rebuild the network with the training-time arguments and restore its weights.
input_dim = len(traindf['feature_names'])
model = StressLSTM(input_dim=input_dim, hidden_dim=128, num_layers=3, bidirectional=False, dropout_prob=0.2)

_saved_weights = torch.load(MODEL_SAVE_PATH)
model.load_state_dict(_saved_weights)
model.eval()
print(f"model loaded from: {MODEL_SAVE_PATH}")

train_ids = []
train_probs = []

print(f"Predicting probabilities for {len(traindf['X_sequences'])} training sequences...")
# Sequences are scored one at a time (batch of size 1), so no padding is involved.
with torch.no_grad():
    for original_file_id, X_sequence in zip(traindf['original_file_ids'],
                                            traindf['X_sequences']):
        feature_tensor = torch.from_numpy(X_sequence).float().unsqueeze(0)
        probability = torch.sigmoid(model(feature_tensor)).item()

        train_ids.append(original_file_id)
        train_probs.append(probability)

print("Probability prediction for training data complete.")
print(f"Collected {len(train_ids)} train IDs and {len(train_probs)} train probabilities.")

print("\nFirst 5 predicted training probabilities:")
for shown_id, shown_prob in zip(train_ids[:5], train_probs[:5]):
    print(f"ID: {shown_id}, Probability: {shown_prob:.4f}")

model loaded from: /content/drive/MyDrive/WuHaoAllenCentad/stress_lstm_modellayer3.pth
Predicting probabilities for 440 training sequences...
Probability prediction for training data complete.
Collected 440 train IDs and 440 train probabilities.

First 5 predicted training probabilities:
ID: tmvd_Stroop, Probability: 0.0048
ID: 9t6n_Counting2, Probability: 0.9987
ID: kycf_Counting3, Probability: 0.0039
ID: t6v9_Math, Probability: 0.9993
ID: y8c3_Speaking, Probability: 0.0603


In [22]:
import torch
import numpy as np

# Same per-sequence scoring as for the training split, applied to the test split.
model.eval()

test_ids = []
test_probs = []

with torch.no_grad():
    for original_file_id, X_sequence in zip(testdf['original_file_ids'],
                                            testdf['X_sequences']):
        # Add a leading batch dimension of size 1.
        feature_tensor = torch.from_numpy(X_sequence).float().unsqueeze(0)
        probability = torch.sigmoid(model(feature_tensor)).item()

        test_ids.append(original_file_id)
        test_probs.append(probability)


In [23]:
import pandas as pd
import os

# One row per scored sequence, tagged with the split it came from.
train_predictions_df = pd.DataFrame(
    {'subject/task': train_ids, 'stress_probability': train_probs}
).assign(dataset='train')

test_predictions_df = pd.DataFrame(
    {'subject/task': test_ids, 'stress_probability': test_probs}
).assign(dataset='test')

# Train rows first, then test rows, with a fresh 0..n-1 index.
all_predictions_df = pd.concat(
    [train_predictions_df, test_predictions_df],
    ignore_index=True,
)

SAVE_CSV_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/audiosegmentationopensmilepredictions.csv"

_csv_dir = os.path.dirname(SAVE_CSV_PATH)
os.makedirs(_csv_dir, exist_ok=True)

all_predictions_df.to_csv(SAVE_CSV_PATH, index=False)

print(f"All stress predictions saved to: {SAVE_CSV_PATH}")
print("First 5 rows of the combined predictions DataFrame:")
print(all_predictions_df.head())
print("Last 5 rows of the combined predictions DataFrame:")
print(all_predictions_df.tail())

All stress predictions saved to: /content/drive/MyDrive/WuHaoAllenCentad/audiosegmentationopensmilepredictions.csv
First 5 rows of the combined predictions DataFrame:
     subject/task  stress_probability dataset
0     tmvd_Stroop            0.004798   train
1  9t6n_Counting2            0.998699   train
2  kycf_Counting3            0.003889   train
3       t6v9_Math            0.999322   train
4   y8c3_Speaking            0.060315   train
Last 5 rows of the combined predictions DataFrame:
       subject/task  stress_probability dataset
505  y9z6_Counting3            0.997287    test
506       y9z6_Math            0.996912    test
507    y9z6_Reading            0.996053    test
508   y9z6_Speaking            0.999095    test
509     y9z6_Stroop            0.992305    test


# One item prediction

In [None]:
import pandas as pd
import shutil

def analyze_stress_segments(main_mp3_path, chunk_duration_sec=30):
    """Score consecutive chunks of an MP3 and report the most stressed ones.

    The recording is cut into back-to-back chunks of ``chunk_duration_sec``
    seconds. Each chunk is exported to a private temporary folder and passed
    to ``predict_stress_sliding_window``, which must already be defined in
    the kernel. Chunks shorter than 10 s are skipped because that function
    needs at least one 10 s window.

    Args:
        main_mp3_path: path to the MP3 file to analyse.
        chunk_duration_sec: chunk length in seconds; must be positive.
            Fractional values are truncated to whole milliseconds.

    Returns:
        A DataFrame with one row per scored chunk (``window_index``,
        ``start_time_s``, ``end_time_s``, ``predicted_class``,
        ``stress_probability``) in chronological order, or ``None`` when the
        file is missing or no chunk produced a prediction.

    Raises:
        ValueError: if ``chunk_duration_sec`` is not positive.
    """
    import tempfile

    # range() needs a positive integer step.
    chunk_len_ms = int(chunk_duration_sec * 1000)
    if chunk_len_ms <= 0:
        raise ValueError("chunk_duration_sec must be positive")

    if not os.path.exists(main_mp3_path):
        print(f"File not found: {main_mp3_path}")
        return None

    # Imported here so the function does not depend on another cell having
    # imported pydub first.
    from pydub import AudioSegment

    min_chunk_ms = 10000

    print(f"Loading {main_mp3_path}...")
    audio = AudioSegment.from_mp3(main_mp3_path)
    total_duration_sec = len(audio) / 1000
    print(f"Total duration: {total_duration_sec:.2f} seconds")

    results = []

    # A unique temporary folder avoids deleting an unrelated directory that
    # happens to share a fixed name; it is removed on exit even on error.
    with tempfile.TemporaryDirectory(prefix="stress_chunks_") as chunk_temp_dir:
        for i, start_ms in enumerate(range(0, len(audio), chunk_len_ms)):
            end_ms = min(start_ms + chunk_len_ms, len(audio))

            if end_ms - start_ms < min_chunk_ms:
                print(f"Skipping tail segment {start_ms/1000:.1f}s-{end_ms/1000:.1f}s (too short)")
                continue

            chunk = audio[start_ms:end_ms]

            chunk_filename = f"chunk_{i}_{start_ms//1000}-{end_ms//1000}.mp3"
            chunk_path = os.path.join(chunk_temp_dir, chunk_filename)
            chunk.export(chunk_path, format="mp3")

            print(f"Analyzing Window {i+1}: {start_ms/1000:.1f}s - {end_ms/1000:.1f}s")

            result = predict_stress_sliding_window(chunk_path)

            if result is not None:
                pred, prob = result
                results.append({
                    "window_index": i + 1,
                    "start_time_s": start_ms / 1000,
                    "end_time_s": end_ms / 1000,
                    "predicted_class": pred,
                    "stress_probability": prob
                })

            # Drop each chunk as soon as it is scored to keep disk use low.
            os.remove(chunk_path)

    if not results:
        print("No results generated.")
        return None

    df_res = pd.DataFrame(results)

    # The ranking is for display only; the returned frame stays chronological.
    df_sorted = df_res.sort_values(by="stress_probability", ascending=False)

    print("\n--- Top 5 Most Stressed Windows ---")
    display(df_sorted.head(5))

    most_stressed = df_sorted.iloc[0]
    print(f"\nMost stressed window found: {most_stressed['start_time_s']}s - {most_stressed['end_time_s']}s")
    print(f"Probability: {most_stressed['stress_probability']:.4f}")

    return df_res

# NOTE(review): analyze_stress_segments calls predict_stress_sliding_window,
# which is defined in the following cell -- run that cell before this one.
target_file = "/content/Job Interview.mp3"
stress_df = analyze_stress_segments(target_file)

In [None]:
from pydub import AudioSegment
import os
import shutil
import numpy as np
import torch
def predict_stress_from_mp3(mp3_filepath):
    """Classify a whole audio file as one single-step sequence.

    Features are extracted once for the full file and fed to the model as a
    sequence of length 1.

    NOTE(review): relies on the notebook globals ``smile``, ``model`` and
    ``traindf``. ``smile`` is not created anywhere in the visible notebook --
    presumably an ``opensmile.Smile`` configured like the extractor that
    produced the training features; confirm before use.

    Args:
        mp3_filepath: path to the audio file.

    Returns:
        1 when the predicted probability is above 0.5, otherwise 0.
    """
    df_opensmile = smile.process_file(mp3_filepath)

    # Use the feature order stored with the training data. `traindf` is
    # available both right after building the splits and after reloading the
    # saved joblib files, whereas `newdf` only exists in the former case.
    sorted_opensmile_features = traindf['feature_names']
    feature_vector = [df_opensmile[col].iloc[0] for col in sorted_opensmile_features]
    feature_array = np.array(feature_vector)

    # (num_features,) -> (batch=1, seq_len=1, num_features)
    feature_tensor = torch.from_numpy(feature_array).float().unsqueeze(0).unsqueeze(0)

    model.eval()

    with torch.no_grad():
        logits = model(feature_tensor)
        probs = torch.sigmoid(logits)

    prediction = (probs > 0.5).int().item()

    return prediction

def predict_stress_sliding_window(mp3_filepath, window_ms=10000, step_ms=5000):
    """Classify an MP3 from a sequence of overlapping windows.

    The audio is cut into ``window_ms``-long windows starting every
    ``step_ms`` milliseconds. Each window is exported to a private temporary
    folder, features are extracted per window, and the resulting sequence is
    scored by the model in a single forward pass.

    NOTE(review): relies on the notebook globals ``smile``, ``model`` and
    ``traindf``. ``smile`` is not created anywhere in the visible notebook --
    presumably an ``opensmile.Smile`` configured like the extractor that
    produced the training features; confirm before use.

    Args:
        mp3_filepath: path to the MP3 file.
        window_ms: window length in milliseconds (default 10 s).
        step_ms: hop between window starts in milliseconds (default 5 s).

    Returns:
        ``(prediction, probability)`` where ``prediction`` is 1 when the
        probability is above 0.5 and 0 otherwise, or ``None`` when the audio
        is shorter than one window.

    Raises:
        ValueError: if ``window_ms`` or ``step_ms`` is not positive (a
            non-positive step would never advance through the audio).
    """
    import tempfile

    if window_ms <= 0 or step_ms <= 0:
        raise ValueError("window_ms and step_ms must be positive")

    # Use the feature order stored with the training data. `traindf` is
    # available both right after building the splits and after reloading the
    # saved joblib files, whereas `newdf` only exists in the former case.
    sorted_opensmile_features = traindf['feature_names']

    audio = AudioSegment.from_mp3(mp3_filepath)
    name, _ = os.path.splitext(os.path.basename(mp3_filepath))

    # A unique temporary folder avoids deleting an unrelated directory that
    # happens to share a fixed name; it is removed on exit even on error.
    with tempfile.TemporaryDirectory(prefix="stress_segments_") as temp_dir:
        clip_paths = []
        start = 0
        while start + window_ms <= len(audio):
            end = start + window_ms

            out_name = f"{name}_s{start//1000}_e{end//1000}.mp3"
            out_path = os.path.join(temp_dir, out_name)

            audio[start:end].export(out_path, format="mp3")
            clip_paths.append(out_path)

            start += step_ms

        if not clip_paths:
            print(f"Audio file {mp3_filepath} is too short (< {window_ms / 1000:g}s) to extract segments.")
            return None

        print(f"Processing {len(clip_paths)} segments...")
        features_list = []
        for cp in clip_paths:
            df_opensmile = smile.process_file(cp)
            features_list.append(
                [df_opensmile[col].iloc[0] for col in sorted_opensmile_features]
            )

    # (num_windows, num_features) -> (batch=1, num_windows, num_features)
    X_sequence = np.array(features_list)
    X_tensor = torch.from_numpy(X_sequence).float().unsqueeze(0)

    model.eval()
    with torch.no_grad():
        logits = model(X_tensor)
        probs = torch.sigmoid(logits)
        prediction = (probs > 0.5).int().item()

    return prediction, probs.item()

onesample = "/content/drive/MyDrive/stressedjob3.mp3"
if os.path.exists(onesample):
    sample_result = predict_stress_sliding_window(onesample)
    if sample_result is None:
        # The function returns None when the audio is shorter than one
        # window; unpacking that directly would raise a TypeError.
        print(f"No prediction produced for: {onesample}")
    else:
        pred, prob = sample_result
        print(f"\nFile: {onesample}")
        print(f"Prediction: {pred} (Stressed)" if pred == 1 else f"Prediction: {pred} (Not Stressed)")
        print(f"Probability: {prob:.4f}")
else:
    print(f"Sample file not found: {onesample}")