In [None]:
import requests
import librosa
import json
import numpy as np
from pydantic import BaseModel
from typing import List

SERVICES = {
    'asr': 'http://127.0.0.1:8001',
    'diarization': 'http://127.0.0.1:8002',
    'emotion': 'http://127.0.0.1:8003',
    'nlp': 'http://127.0.0.1:8004',
    'nonverb': 'http://127.0.0.1:8005',
    'robot_data': 'http://127.0.0.1:8006'
}
audio_path = "data/processed/test_audio.wav"
sr = 16000
y, _ = librosa.load("../"+audio_path, sr=sr)

# Load diarization results from file
with open("../data/full_diarization.json", 'r') as f:
    full_diarization = json.load(f)

window_start = 305
window_end = 345
actual_end = window_end

diarization_result = []
for seg in full_diarization['segments']:
    if window_start <= seg['start'] < window_end:
        # Include this speaker turn if it starts within the window
        diarization_result.append(seg)
        # Extend window boundary if turn extends beyond
        if seg['end'] > actual_end:
            actual_end = seg['end']
    elif seg['start'] >= window_end:
        # Stop when we reach turns that start after window
        break
    
diarization_result = {'segments': diarization_result, 'num_speakers': full_diarization['num_speakers']}
    
window_y = y[window_start*sr:window_end*sr]
y_list = window_y.tolist()

In [2]:
import requests
import librosa
import json
import numpy as np
from pydantic import BaseModel
from typing import List

SERVICES = {
    'asr': 'http://127.0.0.1:8001',
    'diarization': 'http://127.0.0.1:8002',
    'emotion': 'http://127.0.0.1:8003',
    'nlp': 'http://127.0.0.1:8004',
    'nonverb': 'http://127.0.0.1:8005',
    'robot_data': 'http://127.0.0.1:8006'
}

robot_interaction_features = requests.post(
    f"{SERVICES['robot_data']}/extract_features",
    json={"timeline_csv": "data/Sessions/20251003/Suzanne/142002/timeline_csv.csv"}
).json()

In [3]:
robot_interaction_features

{'features': {'total_time_s': 8.175948,
  'num_events': 10.0,
  'inter_event_mean_s': 0.9084386666666666,
  'inter_event_std_s': 1.4220854338644906,
  'A_control_frac': 0.537093190905813,
  'A_event_count': 3.0,
  'A_event_rate_per_s': 0.36692992665804625,
  'B_control_frac': 0.21571321148324327,
  'B_event_count': 3.0,
  'B_event_rate_per_s': 0.36692992665804625,
  'C_control_frac': 0.0,
  'C_event_count': 0.0,
  'C_event_rate_per_s': 0.0,
  'D_control_frac': 0.0,
  'D_event_count': 0.0,
  'D_event_rate_per_s': 0.0,
  'action_entropy_bits': 1.9219280948815916,
  'control_balance_index': 1.7974162748864686,
  'reaction_time_mean_s': 0.0}}

In [None]:
asr = requests.post(
    f"{SERVICES['diarization']}/transcribe",
    json=audio_path
).json()

KeyboardInterrupt: 

In [None]:
# Save diarization results to JSON file
output_file = "../data/full_diarization.json"

with open(output_file, 'w') as f:
    json.dump(full_diarization, f, indent=2)

print(f"Saved diarization results to {output_file}")

Saved diarization results to ../data/full_diarization.json


In [7]:
output_file = "../data/asr_diar.json"

with open(output_file, 'w') as f:
    json.dump({'asr': asr, 'diarization': full_diarization}, f, indent=2)

print(f"Saved asr and diarization results to {output_file}")

Saved asr and diarization results to ../data/asr_diar.json


In [16]:
diarization_result

{'segments': [{'start': 305.04659375,
   'end': 307.17284375,
   'speaker': 'speaker_SPEAKER_04'},
  {'start': 309.63659375000003,
   'end': 326.49471875,
   'speaker': 'speaker_SPEAKER_04'},
  {'start': 326.81534375,
   'end': 326.96721875000003,
   'speaker': 'speaker_SPEAKER_04'},
  {'start': 326.96721875000003,
   'end': 327.00096875,
   'speaker': 'speaker_SPEAKER_03'},
  {'start': 327.00096875,
   'end': 327.01784375,
   'speaker': 'speaker_SPEAKER_04'},
  {'start': 327.01784375,
   'end': 333.36284375,
   'speaker': 'speaker_SPEAKER_03'},
  {'start': 333.54846875000004,
   'end': 343.06596875,
   'speaker': 'speaker_SPEAKER_04'},
  {'start': 343.23471875,
   'end': 343.63971875000004,
   'speaker': 'speaker_SPEAKER_03'}],
 'num_speakers': 5}

In [58]:
response = requests.post(
    f"{SERVICES['nonverb']}/loudness_features",
    json={
        'diarization': diarization_result,
        'y': y.tolist(),
        'sr': sr
    }
)

print(response.json())

[{'speaker': 'speaker_SPEAKER_04', 'mean_rms': 0.005004299338907003, 'std_rms': 0.0023956692311912775, 'num_segments': 5}, {'speaker': 'speaker_SPEAKER_03', 'mean_rms': 0.0081566097214818, 'std_rms': 0.00615645619109273, 'num_segments': 3}]


In [26]:
n_fft = 1024
hop_length = 512
i = 3
audio_segment = y[int(diarization_result['segments'][i]['start']*sr):int(diarization_result['segments'][i]['end']*sr)]

spectrogram = librosa.stft(audio_segment, n_fft=n_fft, hop_length=hop_length)
magnitude_spec = np.abs(spectrogram)
rms = librosa.feature.rms(S=magnitude_spec, hop_length=hop_length, frame_length=n_fft)[0]
mean_rms = np.mean(rms)

mean_rms



np.float32(0.0029568342)

In [47]:
class SpeakerSpectralStats(BaseModel):
    speaker: str
    mean_rms: float
    std_rms: float
    num_segments: int

def group_by_speakers(y, sr, diarization_result: dict):
    # Dictionary to store audio segments for each speaker
    speaker_audio_segments = {}

    initial_time = int(diarization_result["segments"][0]["start"] * sr)
    # Iterate through each segment in the diarization result
    for segment in diarization_result["segments"]:
        speaker = segment["speaker"]
        start_time = segment["start"]
        end_time = segment["end"]
        
        # Convert time to sample indices
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        
        # Extract the audio segment
        audio_segment = y[start_sample-initial_time:end_sample-initial_time]
        
        # Initialize speaker list if not exists
        if speaker not in speaker_audio_segments:
            speaker_audio_segments[speaker] = []
        
        # Append segment with metadata
        speaker_audio_segments[speaker].append({
            'audio': audio_segment,
            'start': start_time,
            'end': end_time,
            'duration': end_time - start_time
        })
        
    return speaker_audio_segments

def calculate_spectrogram_features(speaker_audio_segments, n_fft=1024, hop_length=512) -> List[SpeakerSpectralStats]:
    """
    Calculate spectral features (RMS and std) from spectrograms for each speaker's segments.
    
    Returns a list of SpeakerSpectralStats instances (one per speaker).
    """
    results: List[SpeakerSpectralStats] = []
    
    for speaker, segments in speaker_audio_segments.items():
        rms_values = []
        
        for seg in segments:
            audio_segment = seg['audio']
            
            # Compute STFT (spectrogram)
            spectrogram = librosa.stft(audio_segment, n_fft=n_fft, hop_length=hop_length)
            
            # Get magnitude spectrogram
            magnitude_spec = np.abs(spectrogram)
            
            # Calculate RMS energy for this segment
            # RMS is computed across frequency bins for each time frame, then averaged
            rms = librosa.feature.rms(S=magnitude_spec, hop_length=hop_length, frame_length=n_fft)[0]
            mean_rms = np.mean(rms)
            rms_values.append(mean_rms)
        
        # Create model instance with aggregated statistics
        spectral_stats = SpeakerSpectralStats(
            speaker=speaker,
            mean_rms=float(np.mean(rms_values)) if rms_values else 0.0,
            std_rms=float(np.std(rms_values)) if rms_values else 0.0,
            num_segments=len(rms_values)
        )
        results.append(spectral_stats)
    
    return results

In [55]:
diarization_grouped = group_by_speakers(window_y, sr, diarization_result)
loudness_features = calculate_spectrogram_features(diarization_grouped)



In [59]:
loudness_features[1]

SpeakerSpectralStats(speaker='speaker_SPEAKER_03', mean_rms=0.009062014520168304, std_rms=0.005267636850476265, num_segments=3)

In [None]:
[{'speaker': 'speaker_SPEAKER_04', 'mean_rms': 0.005004299338907003, 'std_rms': 0.0023956692311912775, 'num_segments': 5}, {'speaker': 'speaker_SPEAKER_03', 'mean_rms': 0.0081566097214818, 'std_rms': 0.00615645619109273, 'num_segments': 3}]