In [2]:
pip install torchaudio soundfile


Collecting torchaudio
  Downloading torchaudio-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting torch==2.6.0 (from torchaudio)
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 

In [4]:
import torchaudio
import torch
import numpy as np
import json
import boto3

# Name of the SageMaker endpoint that predict() invokes.
ENDPOINT_NAME = "urbansound-496118972"
# SageMaker runtime client used to call the endpoint (region us-east-1).
runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")

# -----------------------
# MODEL PARAMETERS
# -----------------------
# NOTE(review): these presumably have to match the values used when the model
# was trained -- confirm against the training pipeline.
# Sample rate (Hz) every clip is resampled to before feature extraction.
SAMPLE_RATE = 22050
# Number of mel bands in the spectrogram.
N_MELS = 128
# Number of time frames the spectrogram is padded / truncated to.
TARGET_FRAMES = 160


# -----------------------
# PREPROCESSING
# -----------------------
def preprocess_audio(path):
    """Turn an audio file into a fixed-size, standardized log-mel spectrogram.

    Pipeline: load -> resample to ``SAMPLE_RATE`` -> downmix to mono ->
    mel spectrogram (``N_MELS`` bands, ``f_max=8000``) -> dB scale ->
    standardize (zero mean, unit variance over the whole spectrogram) ->
    pad / truncate the time axis to ``TARGET_FRAMES`` -> add a trailing
    channel axis.

    Args:
        path: Audio file location, passed straight to ``torchaudio.load``.

    Returns:
        ``np.ndarray`` of dtype float32 with shape
        ``(N_MELS, TARGET_FRAMES, 1)``.

    Raises:
        ValueError: If the decoded file contains no samples.
    """
    # Below this spread the spectrogram is treated as constant.
    min_std = 1e-8

    waveform, sr = torchaudio.load(path)

    if waveform.numel() == 0:
        raise ValueError(f"Audio file contains no samples: {path!r}")

    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
        waveform = resampler(waveform)

    # Downmix multi-channel audio to a single channel by averaging.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Mel power spectrogram.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_mels=N_MELS,
        f_max=8000
    )
    mel = mel_spectrogram(waveform)

    # Convert to decibels.
    mel_db = torchaudio.transforms.AmplitudeToDB()(mel)

    # Standardize over the whole spectrogram. A constant spectrogram (e.g.
    # digital silence) has zero spread; dividing by it would yield NaN, which
    # in turn serializes to invalid JSON. Emit all zeros in that case instead.
    centered = mel_db - mel_db.mean()
    spread = mel_db.std()
    if spread > min_std:
        mel_db = centered / spread
    else:
        mel_db = torch.zeros_like(centered)

    # Drop the audio-channel axis -> (N_MELS, frames)
    mel_db = mel_db.squeeze(0)

    # Force the time axis to exactly TARGET_FRAMES.
    frames = mel_db.shape[1]

    if frames < TARGET_FRAMES:
        # Right-pad with zeros, i.e. with the post-standardization mean value.
        pad = TARGET_FRAMES - frames
        mel_db = torch.nn.functional.pad(mel_db, (0, pad))

    elif frames > TARGET_FRAMES:
        # Keep only the first TARGET_FRAMES frames.
        mel_db = mel_db[:, :TARGET_FRAMES]

    # Append the channel axis -> (N_MELS, TARGET_FRAMES, 1)
    mel_db = mel_db.unsqueeze(-1)

    return mel_db.numpy().astype(np.float32)


# -----------------------
# PREDICTION
# -----------------------
def predict(path, threshold=0.6, verbose=True):
    """Classify an audio file by calling the SageMaker endpoint.

    The file is converted with ``preprocess_audio``, wrapped in a batch of
    one, serialized as JSON and sent to ``ENDPOINT_NAME``. The first row of
    the ``"predictions"`` field in the response is interpreted as one score
    per class.

    Args:
        path: Audio file location, forwarded to ``preprocess_audio``.
        threshold: Minimum top score required to report a class. Below it
            (or when the score is NaN) the label ``"indeterminado"`` is
            returned. Defaults to 0.6.
        verbose: When True (default), print the raw endpoint response.
            Pass False to silence this diagnostic output.

    Returns:
        Tuple ``(label, prob)`` where ``label`` is a class name or
        ``"indeterminado"`` and ``prob`` is the top score as a float.

    Raises:
        ValueError: If the endpoint does not return exactly one score per
            known class.
    """
    # Index in this tuple == index in the model's output vector.
    class_names = (
        "air_conditioner", "car_horn", "children_playing", "dog_bark", "drilling",
        "engine_idling", "gun_shot", "jackhammer", "siren", "street_music",
    )

    mel = preprocess_audio(path)
    # Add the leading batch axis expected by the endpoint payload.
    mel = mel[np.newaxis, ...]

    payload = json.dumps(mel.tolist())

    response = runtime.invoke_endpoint(
        EndpointName=ENDPOINT_NAME,
        ContentType="application/json",
        Body=payload
    )

    raw = response["Body"].read()
    if verbose:
        print("RAW RESPONSE:", raw)

    resp = json.loads(raw)
    preds = np.array(resp["predictions"][0])

    # A score vector of the wrong length would otherwise be silently
    # mislabeled (too short) or fail with an opaque IndexError (too long).
    if preds.ndim != 1 or preds.shape[0] != len(class_names):
        raise ValueError(
            f"Expected {len(class_names)} class scores, got array of shape {preds.shape}"
        )

    idx = int(np.argmax(preds))
    prob = float(np.max(preds))

    # Written as "not >=" so that a NaN score is also treated as undetermined.
    if not prob >= threshold:
        return "indeterminado", prob

    return class_names[idx], prob





# -----------------------
# TEST
# -----------------------
# Smoke test: classify one local clip and print the predicted label and score.
# NOTE(review): the recorded output for this gunshot clip is "dog_bark" with a
# score of ~1.0. That may indicate the preprocessing here does not match what
# the model was trained with -- confirm against the training pipeline.
label, prob = predict("Desert Eagle Single Shot Gunshot Sound Effect.mp3")
print("Predicción:", label, " | Prob:", prob)


RAW RESPONSE: b'{\n    "predictions": [[4.72714934e-09, 4.06661076e-12, 4.47562e-06, 0.99999547, 7.70248362e-12, 7.05242342e-13, 7.05440914e-11, 1.57621039e-16, 6.49218179e-12, 8.40275405e-09]\n    ]\n}'
Predicción: dog_bark  | Prob: 0.99999547


In [5]:
import sys
# Display the path of the Python interpreter this kernel is running on.
sys.executable



'/home/ec2-user/anaconda3/envs/python3/bin/python'