In [7]:
import cv2
import mediapipe as mp #face detector
import math
import numpy as np
import time
import datetime
from collections import Counter

import warnings
warnings.simplefilter("ignore", UserWarning)

import torch
import torch.nn as  nn
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms

# Import tkinter for GUI
import tkinter as tk
from tkinter import ttk, scrolledtext, filedialog
import PIL.Image, PIL.ImageTk
from threading import Thread, Event
import os

# Additional imports for audio processing
import pyaudio
import wave
import threading
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import queue
import soundfile as sf
import librosa

# Import transformers for audio processing
from transformers import pipeline

#### Model architectures

In [8]:
class Bottleneck(nn.Module):
    """Residual block made of a 1x1, a 3x3 and a 1x1 convolution.

    The block outputs ``out_channels * expansion`` channels. The stride is
    applied by the first 1x1 convolution; the 3x3 convolution keeps the
    spatial size (``padding='same'``).

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels used inside the block (before expansion).
        i_downsample: Optional module applied to the skip connection so that
            it matches the shape of the main path.
        stride: Stride of the first convolution.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
        super().__init__()
        expanded_channels = out_channels * self.expansion
        bn_options = {"eps": 0.001, "momentum": 0.99}

        # NOTE: attribute names double as state_dict keys, so they must not change.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(out_channels, **bn_options)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding='same', bias=False)
        self.batch_norm2 = nn.BatchNorm2d(out_channels, **bn_options)

        self.conv3 = nn.Conv2d(out_channels, expanded_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.batch_norm3 = nn.BatchNorm2d(expanded_channels, **bn_options)

        self.i_downsample = i_downsample
        self.stride = stride
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the three convolutions and add the (optionally projected) input."""
        shortcut = x.clone()

        out = self.conv1(x)
        out = self.batch_norm1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.batch_norm2(out)
        out = self.relu(out)

        out = self.batch_norm3(self.conv3(out))

        # Project the skip connection when its shape differs from the main path.
        if self.i_downsample is not None:
            shortcut = self.i_downsample(shortcut)

        out += shortcut
        return self.relu(out)

class Conv2dSame(torch.nn.Conv2d):
    """``Conv2d`` that pads its input on the fly before convolving.

    The padding is computed per call from the input size, so that (when any
    padding is required) the output spatial size equals ``ceil(input / stride)``.
    Odd padding amounts put the extra row/column at the bottom/right.
    """

    def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
        """Return the total padding along one axis.

        Args:
            i: Input size along the axis.
            k: Kernel size.
            s: Stride.
            d: Dilation.
        """
        target_out = math.ceil(i / s)
        required = (target_out - 1) * s + (k - 1) * d + 1 - i
        return max(required, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pad ``x`` as needed, then apply the convolution."""
        in_h, in_w = x.size()[-2:]
        kernel_h, kernel_w = self.kernel_size[0], self.kernel_size[1]
        stride_h, stride_w = self.stride[0], self.stride[1]
        dilation_h, dilation_w = self.dilation[0], self.dilation[1]

        total_h = self.calc_same_pad(i=in_h, k=kernel_h, s=stride_h, d=dilation_h)
        total_w = self.calc_same_pad(i=in_w, k=kernel_w, s=stride_w, d=dilation_w)

        if total_h > 0 or total_w > 0:
            left, top = total_w // 2, total_h // 2
            # F.pad expects (left, right, top, bottom) for the last two dims.
            x = F.pad(x, [left, total_w - left, top, total_h - top])

        return F.conv2d(
            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
        )

class ResNet(nn.Module):
    """ResNet backbone with a two-layer fully connected head.

    Layout: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 max-pool,
    four residual stages, global average pooling, ``fc1`` (to 512 features),
    ReLU and ``fc2`` (to ``num_classes``).

    Args:
        ResBlock: Block class; must expose an ``expansion`` attribute and accept
            ``(in_channels, out_channels, i_downsample=..., stride=...)``.
        layer_list: Number of blocks in each of the four stages.
        num_classes: Size of the final output.
        num_channels: Channels of the input image.
    """

    def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
        super().__init__()
        # Running channel count, updated by _make_layer as stages are built.
        self.in_channels = 64

        # Stem (attribute names are state_dict keys and must stay as they are).
        self.conv_layer_s2_same = Conv2dSame(num_channels, 64, 7, stride=2, groups=1, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(64, eps=0.001, momentum=0.99)
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2)

        # Residual stages
        depth1, depth2, depth3, depth4 = layer_list[0], layer_list[1], layer_list[2], layer_list[3]
        self.layer1 = self._make_layer(ResBlock, depth1, planes=64, stride=1)
        self.layer2 = self._make_layer(ResBlock, depth2, planes=128, stride=2)
        self.layer3 = self._make_layer(ResBlock, depth3, planes=256, stride=2)
        self.layer4 = self._make_layer(ResBlock, depth4, planes=512, stride=2)

        # Head
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(512 * ResBlock.expansion, 512)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def extract_features(self, x):
        """Return the ``fc1`` output (before ``relu1``) for a batch of images."""
        stem = self.conv_layer_s2_same(x)
        stem = self.relu(self.batch_norm1(stem))
        feats = self.max_pool(stem)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)

        pooled = self.avgpool(feats)
        flat = pooled.reshape(pooled.shape[0], -1)
        return self.fc1(flat)

    def forward(self, x):
        """Return the ``fc2`` output for a batch of images."""
        embedding = self.extract_features(x)
        return self.fc2(self.relu1(embedding))

    def _make_layer(self, ResBlock, blocks, planes, stride=1):
        """Build one stage of ``blocks`` residual blocks and advance ``in_channels``."""
        stage_out_channels = planes * ResBlock.expansion

        # The skip path needs a projection whenever the first block changes
        # the resolution or the number of channels.
        projection = None
        if stride != 1 or self.in_channels != stage_out_channels:
            projection = nn.Sequential(
                nn.Conv2d(self.in_channels, stage_out_channels, kernel_size=1, stride=stride, bias=False, padding=0),
                nn.BatchNorm2d(stage_out_channels, eps=0.001, momentum=0.99)
            )

        stage = [ResBlock(self.in_channels, planes, i_downsample=projection, stride=stride)]
        self.in_channels = stage_out_channels

        remaining = blocks - 1
        while remaining > 0:
            stage.append(ResBlock(self.in_channels, planes))
            remaining -= 1

        return nn.Sequential(*stage)
        
def ResNet50(num_classes, channels=3):
    """Build a ``ResNet`` of ``Bottleneck`` blocks with 3, 4, 6 and 3 blocks per stage."""
    blocks_per_stage = [3, 4, 6, 3]
    return ResNet(Bottleneck, blocks_per_stage, num_classes, num_channels=channels)


class LSTMPyTorch(nn.Module):
    """Two stacked unidirectional LSTMs followed by a 7-way softmax classifier.

    Input is ``(batch, sequence, 512)`` (the LSTMs use ``batch_first=True``);
    only the output of the last time step is classified.
    """

    def __init__(self):
        super().__init__()

        self.lstm1 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True, bidirectional=False)
        self.lstm2 = nn.LSTM(input_size=512, hidden_size=256, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(256, 7)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-sample probabilities over the 7 outputs."""
        sequence, _ = self.lstm1(x)
        sequence, _ = self.lstm2(sequence)
        last_step = sequence[:, -1, :]
        logits = self.fc(last_step)
        return self.softmax(logits)

#### Helper functions

In [9]:
def pth_processing(fp):
    """Convert a PIL image into the tensor expected by the PyTorch backbone.

    The image is resized to 224x224 (nearest neighbour), converted to a
    channel-first float32 tensor, its channel order is reversed and a fixed
    per-channel offset is subtracted.

    Args:
        fp: A PIL image with at least three channels (the caller passes an
            RGB face crop).

    Returns:
        A float32 tensor of shape ``(1, C, 224, 224)``.
    """
    class PreprocessInput(torch.nn.Module):
        """Channel flip plus per-channel mean subtraction."""

        # Fixed: the constructor was spelled ``init``, so it was never run as a
        # constructor and the class silently relied on the inherited one.
        def __init__(self):
            super().__init__()

        def forward(self, x):
            x = x.to(torch.float32)
            # Reverse the channel axis (dim 0 of a CHW tensor).
            x = torch.flip(x, dims=(0,))
            x[0, :, :] -= 91.4953
            x[1, :, :] -= 103.8827
            x[2, :, :] -= 131.0912
            return x

    def get_img_torch(img):
        ttransform = transforms.Compose([
            transforms.PILToTensor(),
            PreprocessInput()
        ])
        img = img.resize((224, 224), Image.Resampling.NEAREST)
        img = ttransform(img)
        # Add the batch dimension.
        img = torch.unsqueeze(img, 0)
        return img

    return get_img_torch(fp)

def tf_processing(fp):
    """Convert an image array into a channels-last float32 batch of one.

    The image is resized to 224x224 (nearest neighbour), cast to float32, its
    channel order is reversed and a fixed per-channel offset is subtracted.

    The original implementation called ``tf.keras.utils.img_to_array`` although
    ``tf`` is not imported in this notebook, so every call raised ``NameError``.
    The conversion is now done with numpy (float32 cast, plus a trailing
    channel axis for 2-D input), so TensorFlow is no longer needed.

    Args:
        fp: Image as a numpy array accepted by ``cv2.resize``; it must have
            three channels for the mean subtraction to succeed.

    Returns:
        A float32 numpy array of shape ``(1, 224, 224, 3)``.
    """
    def preprocess_input(x):
        x_temp = np.copy(x)
        # Reverse the channel axis (last axis of an HWC array).
        x_temp = x_temp[..., ::-1]
        x_temp[..., 0] -= 91.4953
        x_temp[..., 1] -= 103.8827
        x_temp[..., 2] -= 131.0912
        return x_temp

    def get_img_tf(img):
        img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_NEAREST)
        img = np.asarray(img, dtype=np.float32)
        if img.ndim == 2:
            # Keep an explicit channel axis for single-channel input.
            img = img[..., np.newaxis]
        img = preprocess_input(img)
        img = np.array([img])
        return img

    return get_img_tf(fp)

def norm_coordinates(normalized_x, normalized_y, image_width, image_height):
    """Scale normalized coordinates to pixel indices.

    Each coordinate is multiplied by the matching image extent, floored, and
    capped at the last valid index (``extent - 1``). Values below zero are
    not clamped here.

    Returns:
        ``(x_px, y_px)``.
    """
    def to_pixel(fraction, extent):
        pixel = math.floor(fraction * extent)
        last_index = extent - 1
        return pixel if pixel <= last_index else last_index

    return to_pixel(normalized_x, image_width), to_pixel(normalized_y, image_height)

def get_box(fl, w, h):
    """Return the pixel bounding box of a set of face landmarks.

    Args:
        fl: Object whose ``landmark`` attribute is an iterable of points with
            normalized ``x`` and ``y`` attributes.
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        ``(startX, startY, endX, endY)`` as Python ints, clamped to
        ``[0, w - 1]`` horizontally and ``[0, h - 1]`` vertically.

    Raises:
        ValueError: If ``fl`` contains no landmarks.
    """
    # Convert every landmark once; the array has one (x, y) row per landmark.
    pixel_coords = np.asarray(
        [norm_coordinates(landmark.x, landmark.y, w, h) for landmark in fl.landmark]
    )
    if pixel_coords.size == 0:
        raise ValueError("get_box() requires at least one landmark")

    x_min, y_min = pixel_coords.min(axis=0)
    x_max, y_max = pixel_coords.max(axis=0)

    # Landmarks may fall outside the frame, so clamp to valid pixel indices.
    startX, startY = max(0, int(x_min)), max(0, int(y_min))
    endX, endY = min(w - 1, int(x_max)), min(h - 1, int(y_max))

    return startX, startY, endX, endY

In [10]:
class AudioVisualizer:
    """Small matplotlib line plot, embedded in a Tk frame, showing recent audio.

    Args:
        frame: Tk container the plot canvas is packed into.
        height: Figure height in pixels (at 100 dpi).
        width: Figure width in pixels (at 100 dpi).
    """

    def __init__(self, frame, height=100, width=200):
        self.frame = frame
        self.height = height
        self.width = width

        # Create matplotlib figure
        self.fig = Figure(figsize=(width/100, height/100), dpi=100)
        self.ax = self.fig.add_subplot(111)

        # Initial empty plot with fixed axes and no decorations
        self.line, = self.ax.plot([], [], color='#3498db', linewidth=2)
        self.ax.set_ylim(-0.5, 0.5)
        self.ax.set_xlim(0, 100)
        self.ax.axis('off')
        self.fig.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)

        # Create canvas
        self.canvas = FigureCanvasTkAgg(self.fig, master=self.frame)
        self.canvas.get_tk_widget().pack(fill=tk.BOTH, expand=True)

        # Fixed-size window of the most recent normalized samples
        self.buffer = [0] * 100

    def update(self, audio_data):
        """Append a chunk of 16-bit PCM audio to the window and redraw.

        Args:
            audio_data: Raw bytes of int16 samples; ``None`` or empty input is
                ignored.

        The window keeps its current length and always holds the most recent
        samples. Previously a chunk longer than the window replaced it with
        the chunk's *first* samples rather than its latest ones.
        """
        if audio_data is None or len(audio_data) == 0:
            return

        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
        samples = samples / 32768.0  # Normalize to [-1, 1)

        # Slide the window: only the newest `window` samples can survive.
        window = len(self.buffer)
        newest = samples[-window:].tolist()
        self.buffer = (self.buffer + newest)[-window:]

        # Update plot
        self.line.set_data(range(len(self.buffer)), self.buffer)
        self.canvas.draw_idle()

class AudioProcessor:
    """Record microphone audio with PyAudio and derive simple voice metrics.

    Args:
        visualizer: Optional object with an ``update(bytes)`` method that is
            fed every recorded chunk.
        callback: Optional callable ``callback(speech_metrics, voice_emotions)``
            invoked after each periodic analysis.

    NOTE(review): the stream callback, and therefore ``visualizer.update`` and
    ``callback``, run on the thread PyAudio uses for stream callbacks, not on
    the GUI thread. The analysis also runs inside that callback.
    """

    def __init__(self, visualizer=None, callback=None):
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000  # 16kHz for model
        self.recording = False
        self.frames = []
        self.visualizer = visualizer
        self.callback = callback
        self.audio_queue = queue.Queue()
        self.audio = pyaudio.PyAudio()
        self.stream = None

        # Speech characteristics and the emotions derived from them
        self.speech_metrics = self._default_speech_metrics()
        self.voice_emotions = self._default_voice_emotions()

        # Periodic processing
        self.last_process_time = 0
        self.processing_interval = 5  # Process every 5 seconds

    @staticmethod
    def _default_speech_metrics():
        """Return a fresh, zeroed speech-metrics dict."""
        return {
            "avg_pitch": 0,
            "pitch_variation": 0,
            "speaking_rate": 0,
            "volume": 0,
            "clarity": 0,
            "samples_processed": 0
        }

    @staticmethod
    def _default_voice_emotions():
        """Return a fresh, zeroed voice-emotion dict."""
        return {
            "confidence": 0,
            "engagement": 0,
            "hesitation": 0,
            "enthusiasm": 0
        }

    def start_recording(self):
        """Reset all state and open the input stream; no-op if already recording."""
        if self.recording:
            return
        self.recording = True
        self.frames = []
        self.last_process_time = time.time()

        # Reset metrics
        self.speech_metrics = self._default_speech_metrics()
        self.voice_emotions = self._default_voice_emotions()

        def _on_audio(in_data, frame_count, time_info, status):
            if self.recording:
                self.frames.append(in_data)
                self.audio_queue.put(in_data)

                # Update visualizer if available
                if self.visualizer:
                    self.visualizer.update(in_data)

                # Process audio periodically
                current_time = time.time()
                if current_time - self.last_process_time >= self.processing_interval:
                    self.process_audio_chunk()
                    self.last_process_time = current_time
            return (in_data, pyaudio.paContinue)

        try:
            self.stream = self.audio.open(
                format=self.FORMAT,
                channels=self.CHANNELS,
                rate=self.RATE,
                input=True,
                frames_per_buffer=self.CHUNK,
                stream_callback=_on_audio
            )
            print("Audio recording started")
        except Exception as e:
            print(f"Error starting audio stream: {e}")
            self.recording = False

    def stop_recording(self):
        """Stop recording and release the stream; no-op if not recording.

        The stream reference is always dropped and ``close()`` is always
        attempted, even when ``stop_stream()`` raises. Previously such a
        failure left ``self.stream`` set and the stream unclosed.
        """
        if not self.recording:
            return
        self.recording = False

        stream, self.stream = self.stream, None
        if not stream:
            return

        try:
            try:
                stream.stop_stream()
            finally:
                stream.close()
            print("Audio recording stopped")
        except Exception as e:
            print(f"Error stopping audio stream: {e}")

    def process_audio_chunk(self):
        """Analyse the most recent audio and report metrics through ``callback``.

        Uses up to the last 50 recorded chunks (about 3 seconds at the
        configured chunk size and rate). The callback is invoked on every
        call; when there is no audio, or no pitch could be detected, all
        voice emotions are reported as 0.
        """
        # Slicing copes with fewer than 50 chunks, so no length check is needed.
        recent_frames = self.frames[-50:]
        try:
            if recent_frames:
                audio_data = np.frombuffer(b''.join(recent_frames), dtype=np.int16).astype(np.float32)
                audio_data = audio_data / 32768.0  # Normalize to [-1, 1]
            else:
                audio_data = np.array([])

            # Emotions are derived from direct speech metrics only (no model)
            direct_metrics_emotions = self._default_voice_emotions()

            if len(audio_data) > 0:
                # Volume as RMS amplitude, plus its decibel value
                volume = np.sqrt(np.mean(audio_data**2))
                volume_db = 20 * np.log10(volume) if volume > 0 else -100.0
                self.speech_metrics["volume_db"] = volume_db
                try:
                    # Per analysis frame, take the pitch of the strongest bin
                    # and keep only frames where a pitch was found.
                    pitches, magnitudes = librosa.piptrack(y=audio_data, sr=self.RATE)
                    strongest_bins = magnitudes.argmax(axis=0)
                    frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
                    pitch_values = frame_pitches[frame_pitches > 0]

                    if pitch_values.size > 0:
                        # Calculate basic speech metrics
                        avg_pitch = np.mean(pitch_values)
                        pitch_variation = np.std(pitch_values)
                        alpha = 0.7  # Smoothing factor (weight of the new value)

                        # Exponentially smoothed running metrics
                        self.speech_metrics["avg_pitch"] = alpha * avg_pitch + (1 - alpha) * self.speech_metrics["avg_pitch"]
                        self.speech_metrics["pitch_variation"] = alpha * pitch_variation + (1 - alpha) * self.speech_metrics["pitch_variation"]
                        self.speech_metrics["volume"] = alpha * volume + (1 - alpha) * self.speech_metrics["volume"]
                        self.speech_metrics["samples_processed"] += 1

                        # Speaking-rate proxy from the zero-crossing rate
                        zero_crossings = librosa.zero_crossings(audio_data)
                        speaking_rate = np.count_nonzero(zero_crossings) / len(audio_data) * self.RATE / 100
                        self.speech_metrics["speaking_rate"] = alpha * speaking_rate + (1 - alpha) * self.speech_metrics["speaking_rate"]

                        # Clarity (spectral centroid scaled by 3000 Hz, capped at 1)
                        spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=self.RATE)[0]
                        self.speech_metrics["clarity"] = min(1.0, np.mean(spectral_centroids) / 3000)

                        # 1. Confidence: mostly volume, plus a steady pitch
                        norm_pitch_var = min(1.0, pitch_variation / 200)  # Cap at 200Hz variation
                        steady_factor = 1.0 - norm_pitch_var * 0.5
                        volume_factor = min(1.0, volume * 12)
                        direct_metrics_emotions["confidence"] = 0.8 * volume_factor + 0.2 * steady_factor

                        # 2. Enthusiasm: pitch variation and speaking rate
                        norm_speaking_rate = min(1.0, speaking_rate / 15)
                        direct_metrics_emotions["enthusiasm"] = 0.6 * norm_pitch_var + 0.4 * norm_speaking_rate

                        # 3. Hesitation: slow speech and low volume
                        slow_factor = 1.0 - min(1.0, speaking_rate / 8)
                        direct_metrics_emotions["hesitation"] = 0.6 * slow_factor + 0.4 * (1.0 - volume_factor)

                        # 4. Engagement: clarity, pitch variation, balanced rate
                        rate_balance = 1.0 - abs(norm_speaking_rate - 0.5) * 2  # Penalize too fast/slow
                        direct_metrics_emotions["engagement"] = 0.4 * self.speech_metrics["clarity"] + 0.3 * norm_pitch_var + 0.3 * rate_balance

                except Exception as e:
                    print(f"Error in audio metrics: {e}")

            # Clamp every emotion to [0, 1]
            for emotion in self.voice_emotions:
                self.voice_emotions[emotion] = min(1.0, max(0.0, direct_metrics_emotions[emotion]))

            # Notify callback if available (always log an interval)
            if self.callback:
                self.callback(self.speech_metrics, self.voice_emotions)
        except Exception as e:
            print(f"Error processing audio chunk: {e}")

#### Emotion Logging GUI Application

In [11]:
class EmotionLoggingApp:
    def __init__(self, window, window_title):
        """Set up models, widgets and the webcam, then run the Tk main loop.

        Because ``window.mainloop()`` is called as the last step, this
        constructor only returns once the main loop exits.

        Args:
            window: The Tk root (or toplevel) window hosting the application.
            window_title: Text shown in the window's title bar.
        """
        self.window = window
        window.title(window_title)
        window.geometry("1200x800")  # Increased height for audio components

        # Models first, then the widgets
        self.init_models()
        self.create_ui()

        # Session state
        self.is_running, self.stopwatch_active = False, False
        self.stop_event = Event()
        self.emotion_logs, self.current_emotions, self.speech_logs = [], [], []
        self.logging_start_time = self.last_log_time = self.transcript_file_path = None
        self.elapsed_time = 0

        # Open the default webcam and start the periodic frame refresh
        self.cap = cv2.VideoCapture(0)
        self.update()

        window.protocol("WM_DELETE_WINDOW", self.on_closing)
        window.mainloop()
    
    def init_models(self):
        """Load the face-mesh module handle and both emotion models.

        The checkpoints are read from the current working directory and are
        mapped onto the CPU while loading: inference in this class converts
        model outputs with ``.numpy()``, which needs CPU tensors, and without
        ``map_location`` a checkpoint saved from a GPU cannot be loaded on a
        machine without CUDA.

        NOTE(review): ``torch.load`` unpickles the file; only load checkpoints
        from a trusted source.
        """
        # MediaPipe setup
        self.mp_face_mesh = mp.solutions.face_mesh

        # Model settings
        name_backbone_model = 'FER_static_ResNet50_AffectNet.pt'
        name_LSTM_model = 'Aff-Wild2'

        # Load ResNet model (per-frame feature extractor)
        self.pth_backbone_model = ResNet50(7, channels=3)
        self.pth_backbone_model.load_state_dict(torch.load(name_backbone_model, map_location="cpu"))
        self.pth_backbone_model.eval()

        # Load LSTM model (classifier over a window of frame features)
        self.pth_LSTM_model = LSTMPyTorch()
        self.pth_LSTM_model.load_state_dict(torch.load(f'FER_dinamic_LSTM_{name_LSTM_model}.pt', map_location="cpu"))
        self.pth_LSTM_model.eval()

        # Emotion dictionary: class index -> label
        self.DICT_EMO = {0: 'Neutral', 1: 'Happiness', 2: 'Sadness', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 6: 'Anger'}
        self.NEGATIVE_EMOTIONS = ['Sadness', 'Fear', 'Disgust', 'Anger']

        # Sliding window of frame features fed to the LSTM
        self.lstm_features = []
    
    def create_ui(self):
        """Build all widgets of the main window and create the audio processor.

        Layout: a left column (transcript viewer, webcam canvas, audio level
        plot) and a fixed-width right column (controls, voice analysis and
        analysis summary). Widgets that other methods need are stored on
        ``self``; ``self.voice_labels`` maps ``"<metric>"`` to the value label
        and ``"<metric>_bar"`` to the progress bar, with lower-cased metric
        names as keys.
        """
        # Main frame
        main_frame = ttk.Frame(self.window)
        main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

        # Left frame for video and transcript
        left_frame = ttk.Frame(main_frame)
        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5)

        # Transcript frame at the top of left frame
        transcript_frame = ttk.LabelFrame(left_frame, text="Presentation Transcript")
        transcript_frame.pack(fill=tk.X, padx=5, pady=5)

        # Transcript buttons frame
        transcript_buttons_frame = ttk.Frame(transcript_frame)
        transcript_buttons_frame.pack(fill=tk.X, padx=5, pady=5)

        # Choose file button
        self.choose_file_button = ttk.Button(transcript_buttons_frame, text="Choose Transcript File", command=self.choose_transcript_file)
        self.choose_file_button.pack(side=tk.LEFT, padx=5)

        # File name label
        self.file_label = ttk.Label(transcript_buttons_frame, text="No file selected")
        self.file_label.pack(side=tk.LEFT, padx=5)

        # Transcript text area
        self.transcript_text = scrolledtext.ScrolledText(transcript_frame, wrap=tk.WORD, height=8)
        self.transcript_text.pack(fill=tk.X, padx=5, pady=5)
        self.transcript_text.insert(tk.END, "Load a transcript file to view your presentation text here.")

        # Video frame
        self.video_frame = ttk.LabelFrame(left_frame, text="Webcam Feed")
        self.video_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)

        # Create canvas for video
        self.canvas = tk.Canvas(self.video_frame)
        self.canvas.pack(fill=tk.BOTH, expand=True)

        # Audio visualizer frame below video
        audio_viz_frame = ttk.LabelFrame(left_frame, text="Audio Input Level")
        # pack() takes no height option, so the height is fixed by an inner frame instead
        audio_viz_frame.pack(fill=tk.X, padx=5, pady=5)
        # Create a fixed-height frame inside
        viz_content_frame = ttk.Frame(audio_viz_frame, height=120)
        viz_content_frame.pack(fill=tk.X)
        viz_content_frame.pack_propagate(False)  # Keep the requested height regardless of contents

        # Create audio visualizer in the fixed-height frame
        self.audio_visualizer = AudioVisualizer(viz_content_frame, height=100, width=800)

        # Right frame for controls and summary
        right_frame = ttk.Frame(main_frame, width=300)
        right_frame.pack(side=tk.RIGHT, fill=tk.BOTH, padx=5, pady=5, expand=False)
        right_frame.pack_propagate(False)  # Prevent the frame from shrinking to fit its contents

        # Control frame
        control_frame = ttk.LabelFrame(right_frame, text="Controls")
        control_frame.pack(fill=tk.X, padx=5, pady=5)

        # Stopwatch display
        stopwatch_frame = ttk.Frame(control_frame)
        stopwatch_frame.pack(fill=tk.X, padx=5, pady=5)

        ttk.Label(stopwatch_frame, text="Elapsed Time: ").pack(side=tk.LEFT)
        self.time_label = ttk.Label(stopwatch_frame, text="00:00:00", font=("Arial", 14, "bold"))
        self.time_label.pack(side=tk.LEFT, padx=5)

        # Start button
        self.start_button = ttk.Button(control_frame, text="Start Logging", command=self.start_logging)
        self.start_button.pack(fill=tk.X, padx=5, pady=5)

        # Stop button (disabled until logging has been started)
        self.stop_button = ttk.Button(control_frame, text="Stop Logging", command=self.stop_logging, state=tk.DISABLED)
        self.stop_button.pack(fill=tk.X, padx=5, pady=5)

        # Reset button (initially disabled)
        self.reset_button = ttk.Button(control_frame, text="Reset", command=self.reset_logging, state=tk.DISABLED)
        self.reset_button.pack(fill=tk.X, padx=5, pady=5)

        # Status indicator
        status_frame = ttk.Frame(control_frame)
        status_frame.pack(fill=tk.X, padx=5, pady=5)

        ttk.Label(status_frame, text="Status: ").pack(side=tk.LEFT)
        self.status_label = ttk.Label(status_frame, text="Ready")
        self.status_label.pack(side=tk.LEFT)

        # Voice metrics frame
        voice_frame = ttk.LabelFrame(right_frame, text="Voice Analysis")
        voice_frame.pack(fill=tk.X, padx=5, pady=5)

        # Voice metrics display: one row (caption, value label, progress bar) per metric
        self.voice_labels = {}
        metrics = ["Confidence", "Enthusiasm", "Clarity", "Speaking Rate"]

        for metric in metrics:
            metric_frame = ttk.Frame(voice_frame)
            metric_frame.pack(fill=tk.X, padx=5, pady=2)

            ttk.Label(metric_frame, text=f"{metric}: ").pack(side=tk.LEFT)
            # Keys are the lower-cased names, e.g. "speaking rate"
            self.voice_labels[metric.lower()] = ttk.Label(metric_frame, text="N/A")
            self.voice_labels[metric.lower()].pack(side=tk.LEFT)

            # Progress bar for visual representation
            progress = ttk.Progressbar(metric_frame, length=150)
            progress.pack(side=tk.RIGHT, padx=5)
            self.voice_labels[f"{metric.lower()}_bar"] = progress

        # Summary frame
        summary_frame = ttk.LabelFrame(right_frame, text="Analysis Summary")
        summary_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)

        # Create scrolled text for summary (read-only until re-enabled by code)
        self.summary_text = scrolledtext.ScrolledText(summary_frame, wrap=tk.WORD, width=30, height=20)
        self.summary_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        self.summary_text.config(state=tk.DISABLED)

        # Initialize audio processor: chunks go to the visualizer, metrics to update_voice_metrics
        self.audio_processor = AudioProcessor(self.audio_visualizer, self.update_voice_metrics)
    
    def update_voice_metrics(self, speech_metrics, voice_emotions, final=True):
        # Update the voice metrics display with new data
        if not self.is_running and not final:
            return
        
        # Update confidence
        confidence = voice_emotions["confidence"] * 100
        self.voice_labels["confidence"].config(text=f"{confidence:.1f}%")
        self.voice_labels["confidence_bar"]["value"] = confidence
        
        # Update enthusiasm
        enthusiasm = voice_emotions["enthusiasm"] * 100
        self.voice_labels["enthusiasm"].config(text=f"{enthusiasm:.1f}%")
        self.voice_labels["enthusiasm_bar"]["value"] = enthusiasm
        
        # Update clarity
        clarity = speech_metrics["clarity"] * 100
        self.voice_labels["clarity"].config(text=f"{clarity:.1f}%")
        self.voice_labels["clarity_bar"]["value"] = clarity

        # --- Add this block ---
        volume = speech_metrics["volume"]
        scaled_volume = min(100, volume * 100)  # Scale for UI (0-100)
        self.voice_labels["volume"] = self.voice_labels.get("volume") or ttk.Label()  # If not present
        self.voice_labels["volume"].config(text=f"{scaled_volume:.1f}")
        if "volume_bar" in self.voice_labels:
            self.voice_labels["volume_bar"]["value"] = scaled_volume
        # --- End block ---
        
        # Update speaking rate
        speaking_rate = speech_metrics["speaking_rate"]
        rate_percent = min(100, speaking_rate * 5)  # Scale for progress bar
        self.voice_labels["speaking rate"].config(text=f"{speaking_rate:.1f}")
        self.voice_labels["speaking rate_bar"]["value"] = rate_percent
        
        # Log speech data for every interval (not just final)
        self.speech_logs.append({
            "metrics": speech_metrics.copy(),
            "emotions": voice_emotions.copy()
        })

    
    def choose_transcript_file(self):
        """Ask for a transcript file and show its text in the transcript box.

        Nothing changes if the dialog is cancelled. The file is read as UTF-8
        with undecodable bytes replaced, so the result no longer depends on
        the platform's default encoding. If the file cannot be read, the
        error message is shown in the transcript box instead.
        """
        file_path = filedialog.askopenfilename(
            title="Select Transcript File",
            filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
        )
        if not file_path:
            return

        self.transcript_file_path = file_path
        filename = os.path.basename(file_path)
        self.file_label.config(text=f"Selected: {filename}")

        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                content = file.read()
        except Exception as e:
            content = f"Error reading file: {str(e)}"

        # Replace whatever the box currently shows with the file text or the error.
        self.transcript_text.delete(1.0, tk.END)
        self.transcript_text.insert(tk.END, content)
    
    def update_stopwatch(self):
        if self.stopwatch_active:
            current_time = time.time()
            self.elapsed_time = current_time - self.logging_start_time
            
            # Format time as HH:MM:SS
            hours, remainder = divmod(int(self.elapsed_time), 3600)
            minutes, seconds = divmod(remainder, 60)
            time_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
            self.time_label.config(text=time_str)
            
            # Update every 1 second
            self.window.after(1000, self.update_stopwatch)
    
    def update(self):
        ret, frame = self.cap.read()
        
        if ret:
            # Process frame if logging is active
            if self.is_running:
                self.process_frame(frame)
            
            # Convert to RGB for display
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = PIL.Image.fromarray(frame_rgb)
            
            # Resize to fit canvas
            canvas_width = self.canvas.winfo_width()
            canvas_height = self.canvas.winfo_height()
            
            if canvas_width > 1 and canvas_height > 1:
                ratio = min(canvas_width/img.width, canvas_height/img.height)
                new_width = int(img.width * ratio)
                new_height = int(img.height * ratio)
                img = img.resize((new_width, new_height), PIL.Image.Resampling.LANCZOS)
            
            self.photo = PIL.ImageTk.PhotoImage(image=img)
            self.canvas.create_image(canvas_width//2, canvas_height//2, image=self.photo, anchor=tk.CENTER)
        
        if not self.stop_event.is_set():
            self.window.after(10, self.update)
    
    def process_frame(self, frame):
        """Detect the face in one BGR frame, classify its emotion and log periodically.

        For each detected face the crop is passed through the backbone, the
        resulting feature vector is pushed into a 10-frame window, and the
        LSTM's top class is appended to ``current_emotions``. Every 5 seconds
        (if anything was collected) ``log_emotions`` is called.

        Changes from the previous version:
        * The FaceMesh detector is created once and reused. It used to be
          constructed and torn down for every single frame.
        * Model inference runs under ``torch.no_grad()`` so no autograd graph
          is built for each frame.

        NOTE(review): the cached detector is stored in ``self._face_mesh`` and
        is not closed here; close it when the application shuts down.
        """
        current_time = time.time()

        # Initialize the 5-second logging interval
        if self.last_log_time is None:
            self.last_log_time = current_time

        # Face mesh expects RGB input; marking the copy read-only mirrors the
        # original pre-processing.
        frame_copy = frame.copy()
        frame_copy.flags.writeable = False
        frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)

        # Lazily create the detector on first use and keep it for later frames.
        face_mesh = getattr(self, "_face_mesh", None)
        if face_mesh is None:
            face_mesh = self.mp_face_mesh.FaceMesh(
                max_num_faces=1,
                refine_landmarks=False,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5)
            self._face_mesh = face_mesh

        results = face_mesh.process(frame_copy)

        if results.multi_face_landmarks:
            h, w, _ = frame_copy.shape
            for fl in results.multi_face_landmarks:
                startX, startY, endX, endY = get_box(fl, w, h)

                # Extract face
                cur_face = frame_copy[startY:endY, startX:endX]
                if cur_face.size == 0:  # Skip if face not properly detected
                    continue

                try:
                    with torch.no_grad():
                        # Per-frame features from the backbone
                        cur_face = pth_processing(Image.fromarray(cur_face))
                        features = torch.nn.functional.relu(
                            self.pth_backbone_model.extract_features(cur_face)
                        ).numpy()

                        # Sliding window of 10 feature vectors; the first
                        # frame fills the whole window.
                        if len(self.lstm_features) == 0:
                            self.lstm_features = [features] * 10
                        else:
                            self.lstm_features = self.lstm_features[1:] + [features]

                        lstm_f = torch.from_numpy(np.vstack(self.lstm_features))
                        lstm_f = torch.unsqueeze(lstm_f, 0)
                        output = self.pth_LSTM_model(lstm_f).numpy()

                    # Get emotion label of the most probable class
                    cl = np.argmax(output)
                    emotion = self.DICT_EMO[cl]

                    # Add to current emotions
                    self.current_emotions.append(emotion)
                except Exception as e:
                    print(f"Error processing face: {e}")

        # Log emotions every 5 seconds
        if current_time - self.last_log_time >= 5 and self.current_emotions:
            self.log_emotions()
            self.last_log_time = current_time
    
    def log_emotions(self):
        if not self.current_emotions:
            return
        
        # Count occurrences of each emotion
        emotion_counts = Counter(self.current_emotions)
        dominant_emotion = emotion_counts.most_common(1)[0][0]
        
        # Check for negative emotions
        negative_emotions = [emotion for emotion in self.current_emotions if emotion in self.NEGATIVE_EMOTIONS]
        has_negative = len(negative_emotions) > 0
        
        # Create log entry
        elapsed_time = int(time.time() - self.logging_start_time)
        timestamp = f"{elapsed_time//60:02d}:{elapsed_time%60:02d}"
        
        log_entry = {
            "timestamp": timestamp,
            "dominant_emotion": dominant_emotion,
            "counts": dict(emotion_counts),
            "has_negative": has_negative
        }
        
        self.emotion_logs.append(log_entry)
        
        # Clear current emotions for next interval
        self.current_emotions = []
    
    def start_logging(self):
        self.is_running = True
        self.logging_start_time = time.time()
        self.last_log_time = None
        self.emotion_logs = []
        self.speech_logs = []
        self.current_emotions = []
        
        # Start audio processing
        self.audio_processor.start_recording()
        
        # Start stopwatch
        self.stopwatch_active = True
        self.update_stopwatch()
        
        # Update UI
        self.status_label.config(text="Logging")
        self.start_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)
        self.reset_button.config(state=tk.DISABLED)
        self.choose_file_button.config(state=tk.DISABLED)  # Disable file selection during recording
        
        # Clear summary
        self.summary_text.config(state=tk.NORMAL)
        self.summary_text.delete(1.0, tk.END)
        self.summary_text.insert(tk.END, "Logging emotions and voice metrics...\n")
        self.summary_text.config(state=tk.DISABLED)
    
    def stop_logging(self):
        if self.is_running:
            self.is_running = False
            self.stopwatch_active = False
            
            # Stop audio recording and process final audio
            self.audio_processor.stop_recording()
            
            # Log any remaining emotions
            if self.current_emotions:
                self.log_emotions()
            
            # Update UI
            self.status_label.config(text="Ready")
            self.start_button.config(state=tk.DISABLED)
            self.stop_button.config(state=tk.DISABLED)
            self.reset_button.config(state=tk.NORMAL)
            self.choose_file_button.config(state=tk.NORMAL)  # Re-enable file selection
            
            # Generate and display summary
            self.display_summary()
            
            # Save video summary to text file
            self.save_video_summary()
            # Save audio summary to text file
            self.save_audio_summary()
    
    def save_video_summary(self):
        """Save the video summary to a text file."""
        try:
            with open("video_logging.txt", "w") as file:
                file.write("== Emotion Logging Summary ==\n")
                
                # Format final time
                hours, remainder = divmod(int(self.elapsed_time), 3600)
                minutes, seconds = divmod(remainder, 60)
                time_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
                file.write(f"Total Duration: {time_str}\n")
                
                # Log 5-second intervals
                file.write("=== 5-second Intervals ===\n")
                for log in self.emotion_logs:
                    file.write(f"[{log['timestamp']}] Dominant: {log['dominant_emotion']}\n")
                    #if log['has_negative']:
                        #file.write("⚠️ Negative emotions detected!\n")
                    for emotion, count in log['counts'].items():
                        percentage = count / sum(log['counts'].values()) * 100
                        file.write(f"* {emotion}: {percentage:.1f}%\n")
                
                # Overall summary
                file.write("=== Overall Summary ===\n")
                if self.emotion_logs:
                    dominant_counts = Counter([log['dominant_emotion'] for log in self.emotion_logs])
                    most_common = dominant_counts.most_common()
                    file.write("Most frequent emotions:\n")
                    for emotion, count in most_common:
                        percentage = count / len(self.emotion_logs) * 100
                        file.write(f"* {emotion}: {percentage:.1f}%\n")
                    
                    negative_intervals = sum(1 for log in self.emotion_logs if log['has_negative'])
                    neg_percentage = negative_intervals / len(self.emotion_logs) * 100
                    #file.write(f"⚠️ Negative emotions detected in {negative_intervals} intervals ({neg_percentage:.1f}%)\n")
        except Exception as e:
            print(f"Error saving video summary: {e}")
    
    def save_audio_summary(self):
        """Save the audio summary to a text file."""
        try:
            with open("audio_logging.txt", "w") as f:
                f.write("== voice attribute Logging Summary ==\n")
                # total duration
                hours, rem = divmod(int(self.elapsed_time), 3600)
                mins, secs = divmod(rem, 60)
                f.write(f"Total Duration: {hours:02d}:{mins:02d}:{secs:02d}\n")
                f.write("=== 5-second Intervals ===\n")
                from collections import Counter
                doms = []
                # iterate through each logged audio entry
                for idx, log in enumerate(self.speech_logs, start=1):
                    dom_emotion = max(log['emotions'], key=log['emotions'].get)
                    doms.append(dom_emotion)
                    f.write(f"[Interval {idx}] Dominant: {dom_emotion}\n\n")
                    # write each emotion and metric
                    # for k, v in {**log['emotions'], **log['metrics']}.items():
                    #     f.write(f"{k}: {v:.1f}\n")
                    # f.write("\n")
                    # Only write relevant metrics/emotions
                    for k, v in {**log['emotions'], **log['metrics']}.items():
                        if k in ["samples_processed", "volume_db"]:
                            continue  # Skip these keys
                        if k == "volume":
                            scaled_volume = v * 100
                            f.write(f"volume: {scaled_volume:.1f}\n")
                        else:
                            f.write(f"{k}: {v:.1f}\n")
                # overall summary
                # f.write("=== Overall Summary ===\n")
                # f.write("Most frequent emotions:\n")
                # counts = Counter(doms)
                # for em, cnt in counts.items():
                #     pct = cnt / len(doms) * 100 if doms else 0
                #     f.write(f"{em}: {pct:.1f}%\n")
        except Exception as e:
            print(f"Error saving audio summary: {e}")
    
    def reset_logging(self):
        # Reset all variables to initial state
        self.emotion_logs = []
        self.speech_logs = []
        self.current_emotions = []
        self.logging_start_time = None
        self.last_log_time = None
        self.elapsed_time = 0
        self.lstm_features = []
        
        # Overwrite the video summary file
        try:
            with open("video_logging.txt", "w") as file:
                file.write("Ready for new logging session.\n")
        except Exception as e:
            print(f"Error resetting video summary file: {e}")
        
        # Reset UI
        self.time_label.config(text="00:00:00")
        self.status_label.config(text="Ready")
        self.start_button.config(state=tk.NORMAL)
        self.stop_button.config(state=tk.DISABLED)
        self.reset_button.config(state=tk.DISABLED)
        
        # Reset voice metrics
        for metric in ["confidence", "enthusiasm", "clarity", "speaking rate"]:
            self.voice_labels[metric].config(text="N/A")
            self.voice_labels[f"{metric}_bar"]["value"] = 0
        
        # Clear summary
        self.summary_text.config(state=tk.NORMAL)
        self.summary_text.delete(1.0, tk.END)
        self.summary_text.insert(tk.END, "Ready to start a new session.\n")
        self.summary_text.config(state=tk.DISABLED)
    
    def display_summary(self):
        self.summary_text.config(state=tk.NORMAL)
        self.summary_text.delete(1.0, tk.END)
        
        if not self.emotion_logs and not self.speech_logs:
            self.summary_text.insert(tk.END, "No data logged.")
            self.summary_text.config(state=tk.DISABLED)
            return
        
        # Format final time
        hours, remainder = divmod(int(self.elapsed_time), 3600)
        minutes, seconds = divmod(remainder, 60)
        time_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        
        self.summary_text.insert(tk.END, f"== Presentation Analysis Summary ==\n")
        self.summary_text.insert(tk.END, f"Total Duration: {time_str}\n")
        self.summary_text.insert(tk.END, f"Facial emotion tracking interval: 5 seconds\n")
        
        # Add transcript file info if available
        if self.transcript_file_path:
            filename = os.path.basename(self.transcript_file_path)
            self.summary_text.insert(tk.END, f"Transcript: {filename}\n")
        
        self.summary_text.insert(tk.END, "\n")
        
        # Visual emotion summary
        if self.emotion_logs:
            self.summary_text.insert(tk.END, "=== Visual Emotion Analysis ===\n")
            
            # Count dominant emotions across all intervals
            dominant_counts = Counter([log['dominant_emotion'] for log in self.emotion_logs])
            most_common = dominant_counts.most_common()
            
            self.summary_text.insert(tk.END, "Most frequent facial emotions:\n")
            for emotion, count in most_common:
                percentage = count / len(self.emotion_logs) * 100
                self.summary_text.insert(tk.END, f"  - {emotion}: {percentage:.1f}%\n")
            
            # Check for negative emotions
            negative_intervals = sum(1 for log in self.emotion_logs if log['has_negative'])
            if negative_intervals > 0:
                neg_percentage = negative_intervals / len(self.emotion_logs) * 100
                self.summary_text.insert(tk.END, f"\n⚠️ Negative facial expressions detected in {negative_intervals} intervals ({neg_percentage:.1f}%)\n")
            else:
                self.summary_text.insert(tk.END, "\n✅ No negative facial expressions detected\n")
        
        # Voice analysis summary
        if self.speech_logs:
            self.summary_text.insert(tk.END, "\n=== Voice Analysis ===\n")
            
            # Get the latest speech metrics (most comprehensive)
            latest_speech = self.speech_logs[-1]
            
            # Confidence
            confidence = latest_speech["emotions"]["confidence"] * 100
            self.summary_text.insert(tk.END, f"Voice Confidence: {confidence:.1f}%\n")
            
            # Enthusiasm
            enthusiasm = latest_speech["emotions"]["enthusiasm"] * 100
            self.summary_text.insert(tk.END, f"Voice Enthusiasm: {enthusiasm:.1f}%\n")
            
            # Speaking rate
            speaking_rate = latest_speech["metrics"]["speaking_rate"]
            rate_description = "Too slow" if speaking_rate < 5 else "Good" if speaking_rate < 15 else "Too fast"
            self.summary_text.insert(tk.END, f"Speaking Rate: {speaking_rate:.1f} ({rate_description})\n")
            
            # Clarity
            clarity = latest_speech["metrics"]["clarity"] * 100
            self.summary_text.insert(tk.END, f"Voice Clarity: {clarity:.1f}%\n")
            
            # Hesitation
            hesitation = latest_speech["emotions"]["hesitation"] * 100
            if hesitation > 40:
                self.summary_text.insert(tk.END, f"⚠️ High hesitation detected ({hesitation:.1f}%)\n")
        
        # Combined analysis and advice
        self.summary_text.insert(tk.END, "\n=== Presentation Advice ===\n")
        
        # Generate personalized advice based on both facial and voice analysis
        if self.emotion_logs and self.speech_logs:
            # Get key metrics
            latest_speech = self.speech_logs[-1]
            confidence = latest_speech["emotions"]["confidence"]
            enthusiasm = latest_speech["emotions"]["enthusiasm"]
            hesitation = latest_speech["emotions"]["hesitation"]
            speaking_rate = latest_speech["metrics"]["speaking_rate"]
            
            negative_face_percent = 0
            if self.emotion_logs:
                negative_intervals = sum(1 for log in self.emotion_logs if log['has_negative'])
                negative_face_percent = negative_intervals / len(self.emotion_logs) if self.emotion_logs else 0
            
            # Generate personalized advice
            if negative_face_percent > 0.3 and confidence < 0.5:
                self.summary_text.insert(tk.END, "Your facial expressions show nervousness and your voice lacks confidence. ")
                self.summary_text.insert(tk.END, "Practice more to build confidence. Consider recording yourself and watching the playback. ")
                self.summary_text.insert(tk.END, "Focus on deep breathing exercises before presenting.\n")
            elif confidence < 0.4:
                self.summary_text.insert(tk.END, "Your voice shows low confidence. Try to speak more assertively. ")
                self.summary_text.insert(tk.END, "Practice power posing before your presentation and speak from your diaphragm.\n")
            elif enthusiasm < 0.4:
                self.summary_text.insert(tk.END, "Your presentation lacks vocal enthusiasm. Add more vocal variety and emphasis on key points. ")
                self.summary_text.insert(tk.END, "Try varying your pitch and pace to keep your audience engaged.\n")
            elif hesitation > 0.6:
                self.summary_text.insert(tk.END, "You show significant hesitation in your speech. Practice your presentation more thoroughly ")
                self.summary_text.insert(tk.END, "to reduce pauses and filler words like 'um' and 'uh'.\n")
            elif speaking_rate > 15:
                self.summary_text.insert(tk.END, "You're speaking too quickly. Slow down to improve clarity and give your audience time to process. ")
                self.summary_text.insert(tk.END, "Try marking your script with deliberate pause points.\n")
            elif speaking_rate < 5:
                self.summary_text.insert(tk.END, "Your pace is quite slow. Try to increase your speaking rate slightly to maintain audience engagement. ")
                self.summary_text.insert(tk.END, "Practice with a timer to find a better rhythm.\n")
            elif confidence > 0.7 and enthusiasm > 0.7 and hesitation < 0.3:
                self.summary_text.insert(tk.END, "Excellent job! Your voice projects confidence and enthusiasm, and your facial expressions are positive. ")
                self.summary_text.insert(tk.END, "Maintain this energy level for your actual presentation.\n")
            else:
                self.summary_text.insert(tk.END, "Your presentation shows good elements but has room for improvement. ")
                self.summary_text.insert(tk.END, "Focus on maintaining consistent energy and confidence throughout.\n")
        
        # Additional specific advice
        self.summary_text.insert(tk.END, "\nPractice Tips:\n")
        self.summary_text.insert(tk.END, "1. Record yourself and review your performance\n")
        self.summary_text.insert(tk.END, "2. Practice in front of friends or colleagues\n")
        self.summary_text.insert(tk.END, "3. Time your presentation sections\n")
        self.summary_text.insert(tk.END, "4. Use breathing exercises before presenting\n")
        self.summary_text.insert(tk.END, "5. Remember to pause at key points\n")
        
        self.summary_text.config(state=tk.DISABLED)
    
    def on_closing(self):
        self.stop_event.set()
        if hasattr(self, 'audio_processor'):
            self.audio_processor.stop_recording()
        if self.cap and self.cap.isOpened():
            self.cap.release()
        self.window.destroy()

In [12]:
# Run the application: create the Tk root window and hand it to the app class
# together with the window title.
# NOTE(review): this cell never calls root.mainloop(); presumably
# EmotionLoggingApp enters the Tk event loop itself — confirm in its __init__.
root = tk.Tk()
app = EmotionLoggingApp(root, "Presentation Rehearsal Voice & Emotion Tracker")

invalid command name "13523222784update"
    while executing
"13523222784update"
    ("after" script)


Audio recording started


I0000 00:00:1752451978.580966 5994918 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1752451978.582856 5998252 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1752451978.585849 5998253 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1752451978.710087 5994918 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1752451978.711329 5998263 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1752451978.713161 5998266 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1752451978.847553 5994918 gl

Audio recording stopped
