# Urban Sound Narrative

A Project that turns the day to day sound to a beautiful narration 

### Installing Dependencies

In [5]:
# For google collab:

# !pip install --quiet librosa torch torchaudio gtts ipywidgets tensorflow tensorflow_hub

# Local Jupyter Notebook

%pip install --quiet librosa torch torchaudio gtts ipywidgets

Note: you may need to restart the kernel to use updated packages.


## Importing Libraries

In [14]:
import librosa
import numpy as np
import requests
import json
from pathlib import Path
import os

import soundfile as sf
from sklearn.preprocessing import MultiLabelBinarizer

In [11]:
raw_audio_path = "../data/raw/city_sounds.mp3"
preprocessed_path = "../data/processed"

In [17]:
# Preprocessing the Audio using librosa 
# This steps involve the following:
"""Converting the audio to Mono channel, and making sure the sample rate is 16Hz"""

def preprocess_audio(audio_path, target_sr=16000):
    audio, sr = librosa.load(audio_path, sr=None)
    print(f"Audio: {audio}, SR: {sr}")

    print(f"Shape: {audio.shape}, Len: {len(audio)}")
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)
    print(f"Audio len after mono: {audio}")
    audio = audio / np.max(np.abs(audio))
    output_path = os.path.join(preprocessed_path, "preprocessed_audio.wav")
    sf.write(output_path, audio, target_sr)
    return output_path, sr

Testing

In [18]:
preprocess_audio(raw_audio_path)

Audio: [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 4.16944118e-07
 1.46511340e-07 1.02268004e-07], SR: 44100
Shape: (884736,), Len: 884736
Audio len after mono: [9.4589547e-10 7.4039741e-10 8.5627172e-11 ... 3.8616520e-07 5.0218404e-07
 2.0078872e-07]


('../data/processed/preprocessed_audio.wav', 44100)

In [19]:
# Feature extraction for the model

import librosa
import numpy as np

def extract_features(audio_path, target_sr=16000, n_mfcc=40, n_mels=128):
    # 1. Load the audio
    audio, sr = librosa.load(audio_path, sr=target_sr)
    
    # 2. Mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        fmax=sr//2
    )
    
    # Convert power to decibel scale (more perceptually meaningful)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # 3. MFCCs (summarized version of the mel-spectrogram)
    mfcc = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=n_mfcc
    )
    
    # 4. Normalize
    mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
    
    return mel_spec_db, mfcc
