In [None]:
audio_path = "/content/city_sounds.mp3"

In [None]:
!pip install panns-inference

Collecting panns-inference
  Downloading panns_inference-0.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting torchlibrosa (from panns-inference)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Downloading panns_inference-0.1.1-py3-none-any.whl (8.3 kB)
Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Installing collected packages: torchlibrosa, panns-inference
Successfully installed panns-inference-0.1.1 torchlibrosa-0.1.0


In [11]:
# Feature extraction using PANNs (CNN14) pretrained model
import torch
import torchaudio
import numpy as np
from panns_inference import AudioTagging, SoundEventDetection, labels

# Initialize the pretrained model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AudioTagging(checkpoint_path=None, device=device)  # CNN14 pretrained on AudioSet

def extract_panns_features(audio_path, target_sr=32000):
    """
    Extracts powerful embeddings from audio using PANNs CNN14.
    Args:
        audio_path (str): path to audio file
        target_sr (int): sampling rate (CNN14 expects 32kHz)
    Returns:
        embeddings (np.ndarray): 2048-dim feature vector representing the clip
        predicted_tags (list): top predicted sound tags (optional)
    """
    # 1. Load audio and resample to 32kHz
    waveform, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)

    # 2. Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    waveform = waveform.to(device)

    clipwise_output, embedding = model.inference(waveform)

# Pool embeddings across time
    pooled_embedding = embedding.mean(axis=0)  # already a NumPy array

# Make clipwise_output 1D
    clipwise_output = clipwise_output.squeeze()  # shape -> [num_classes]

# Get top predicted tags
    top_indices = clipwise_output.argsort()[-5:][::-1]  # top 5 indices
    predicted_tags = [labels[int(i)] for i in top_indices]  # cast to int

    return pooled_embedding, predicted_tags




Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1


In [12]:
extract_panns_features(audio_path)

(array([0.        , 0.        , 0.        , ..., 0.        , 0.18207598,
        0.        ], dtype=float32),
 ['Speech',
  'Vehicle',
  'Car',
  'Outside, urban or manmade',
  'Traffic noise, roadway noise'])

In [28]:
# Test what's already installed
import torch
print(f"✓ PyTorch: {torch.__version__}")
print(f"✓ CUDA: {torch.cuda.is_available()}")

import whisper
print(f"✓ Whisper installed")

from panns_inference import AudioTagging
print(f"✓ PANNs installed")

print("\n✅ Core packages are ready!")

✓ PyTorch: 2.8.0+cu126
✓ CUDA: True
✓ Whisper installed
✓ PANNs installed

✅ Core packages are ready!


In [None]:
!pip un

In [32]:
# First, run this in a cell:
!pip uninstall whisper -y
!pip uninstall openai-whisper -y
!pip install openai-whisper

# Then RESTART the runtime (Runtime > Restart runtime in menu)
# This is important - the wrong whisper module is cached

[0mFound existing installation: openai-whisper 20250625
Uninstalling openai-whisper-20250625:
  Successfully uninstalled openai-whisper-20250625
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=64026f5a9ad08281c0bb3dbc09b488ba6cf4eaa4dc61b18454d8b3b150323fb3
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b15898010b6cc28578d8afdde5869
Successfully built openai-whisper
Installing collected packages: opena

In [1]:
import torch
import torchaudio
import whisper
from panns_inference import AudioTagging, labels

# Initialize models
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using: {device}\n")

panns_model = AudioTagging(checkpoint_path=None, device=device)
whisper_model = whisper.load_model("base", device=device)

# Load and prepare audio
audio_path = "/content/city_sounds.mp3"
waveform, sr = torchaudio.load(audio_path)

# Resample to 32kHz for PANNs
if sr != 32000:
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=32000)(waveform)

# Convert to mono
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

waveform = waveform.to(device)

# Get sound tags
clipwise_output, embedding = panns_model.inference(waveform)
clipwise_output = clipwise_output.squeeze()
top_indices = clipwise_output.argsort()[-5:][::-1]

print("🔊 Detected sounds:")
for i in top_indices:
    print(f"  • {labels[int(i)]}: {clipwise_output[int(i)]:.2%}")

# Get speech transcript
print("\n🎤 Transcribing speech...")
result = whisper_model.transcribe(audio_path)
transcript = result["text"].strip()
print(f"  💬 \"{transcript}\"" if transcript else "  (no speech)")

Using: cuda

Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1


100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 41.1MiB/s]
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


🔊 Detected sounds:
  • Speech: 70.71%
  • Vehicle: 61.40%
  • Car: 25.56%
  • Outside, urban or manmade: 13.38%
  • Traffic noise, roadway noise: 13.34%

🎤 Transcribing speech...
  💬 "1.5% 1.5% 1.5% 1.5% 1.5%"


In [2]:
!pip install groq --quiet



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import torch
import torchaudio
import whisper
from panns_inference import AudioTagging, labels
import librosa
import numpy as np
import requests
import json
from groq import Groq
from pydub import AudioSegment

# Initialize models
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using: {device}\n")
panns_model = AudioTagging(checkpoint_path=None, device=device)
whisper_model = whisper.load_model("base", device=device)

# Load and prepare audio
audio_path = "/content/city_sounds.mp3"
waveform, sr = torchaudio.load(audio_path)
if sr != 32000:
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=32000)(waveform)
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)
waveform = waveform.to(device)

# Get sound tags
clipwise_output, embedding = panns_model.inference(waveform)
clipwise_output = clipwise_output.squeeze()
top_indices = clipwise_output.argsort()[-5:][::-1]
sound_types = [labels[int(i)] for i in top_indices]
print("🔊 Detected sounds:")
for i in top_indices:
    print(f"  • {labels[int(i)]}: {clipwise_output[int(i)]:.2%}")

# Get speech transcript
print("\n🎤 Transcribing speech...")
result = whisper_model.transcribe(audio_path)
transcript = result["text"].strip()
print(f"  💬 \"{transcript}\"" if transcript else "  (no speech)")

# Extract audio metadata
audio, sr = librosa.load(audio_path)
duration = librosa.get_duration(y=audio, sr=sr)
metadata = {
    "duration": duration,
    "sample_rate": sr,
    "channels": 1 if len(audio.shape) == 1 else audio.shape[1]
}
print(f"\n📊 Audio metadata: {metadata}")

# Generate narration with Groq
client = Groq(api_key=GROQ_API_KEY)  # Replace with your Groq API key
sound_str = ", ".join(sound_types)
prompt = (
    f"Write a lively, conversational 2-3 sentence narration of a bustling city scene based on sounds: {sound_str}. "
    f"Weave in someone casually mentioning '{transcript}' as part of the urban vibe, like a street vendor or passerby. "
    f"Use vivid sound effect cues in square brackets (e.g., [honking], [chattering voices]) for each sound, avoiding generic terms like 'traffic noise'. "
    f"Keep it natural, immersive, under 80 words, using metadata {metadata} for context."
)
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="llama-3.3-70b-versatile",
    temperature=0.4,  # Lower for natural, focused tone
    max_tokens=100  # Ensure no cutoff
)
narration = response.choices[0].message.content
with open("narration.txt", "w") as f:
    f.write(narration)
print(f"\n✨ Narration: {narration}")

# Convert to speech with ElevenLabs
url = "https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM"  # Bella voice for warmth
headers = {
    "xi-api-key": ElevenLabs,  # Replace with your ElevenLabs API key
    "Content-Type": "application/json"
}
payload = {
    "text": narration,
    "model_id": "eleven_multilingual_v2",
    "voice_settings": {"stability": 0.6, "similarity_boost": 0.9, "style": 0.2}  # More expressive
}
response = requests.post(url, headers=headers, json=payload)
if response.status_code == 200:
    with open("temp_output.mp3", "wb") as f:
        f.write(response.content)
    # Amplify audio with pydub
    audio = AudioSegment.from_mp3("temp_output.mp3")
    audio = audio + 10  # Boost volume by 10dB
    audio.export("output.mp3", format="mp3")
    print("✅ Audio saved as output.mp3")
else:
    print(f"Error: {response.status_code} - {response.text}")

Using: cuda

Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


🔊 Detected sounds:
  • Speech: 70.71%
  • Vehicle: 61.40%
  • Car: 25.56%
  • Outside, urban or manmade: 13.38%
  • Traffic noise, roadway noise: 13.34%

🎤 Transcribing speech...
  💬 "1.5% 1.5% 1.5% 1.5% 1.5%"

📊 Audio metadata: {'duration': 20.06204081632653, 'sample_rate': 22050, 'channels': 1}

✨ Narration: As [car horns blaring] and [chattering voices] fill the air, a passerby yells "1.5% 1.5% 1.5% 1.5% 1.5%!" amidst [screeching tires] and [rumbling engines]. [Sirens wailing] in the distance add to the urban chaos. [Revving motorcycles] speed by, weaving through the crowded streets.
✅ Audio saved as output.mp3


In [6]:
from groq import Groq

client = Groq(api_key=GROQ_API_KEY)

# Use the sounds we detected
sounds = "Speech, Vehicle, Car, Traffic noise"
transcript = "1.5% 1.5% 1.5%"

prompt = f"""Write a casual, natural 2 sentence description of this scene. Be conversational and human, not dramatic.

Sounds: {sounds}
Someone said: "{transcript}"

Describe it simply:"""

response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="llama-3.3-70b-versatile",
    temperature=0.7,  # Lower = more natural
    max_tokens=80  # Shorter = less flowery
)

narration = response.choices[0].message.content
print("\n✨ NARRATION:")
print(narration)


✨ NARRATION:
It sounds like someone's having a conversation in a car, probably stuck in traffic judging by the background noise. One of them just kept repeating "1.5% 1.5% 1.5%", I'm not sure what that's about, but it seems like they're trying to make a point.


In [7]:
!pip install edge-tts

Collecting edge-tts
  Downloading edge_tts-7.2.3-py3-none-any.whl.metadata (5.5 kB)
Downloading edge_tts-7.2.3-py3-none-any.whl (30 kB)
Installing collected packages: edge-tts
Successfully installed edge-tts-7.2.3


In [8]:
# Save narration to file
with open("narration.txt", "w") as f:
    f.write(narration)

# Convert to speech with Edge TTS
!edge-tts --voice en-US-GuyNeural --text "$(cat narration.txt)" --write-media output.mp3

print("✅ Audio saved as output.mp3")

✅ Audio saved as output.mp3


In [None]:
client = Groq(api_key=GROQ_API_KEY)  # Replace with your Groq API key
sound_str = ", ".join(sound_types)
transcript = "1.5% 1.5% 1.5%"
prompt = (
    f"Write a natural, conversational 2-3 sentence narration of a vivid urban scene based on these sounds: {sound_str}. "
    f"Include metadata: {metadata}. Mention someone saying '{transcript}' naturally in the scene. "
    f"Add dramatic sound effect cues in square brackets (e.g., [traffic noise]) for each sound. "
    f"Keep it immersive but human-like, under 80 words, for text-to-speech."
)
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="llama-3.3-70b-versatile",
    temperature=0.6,
    max_tokens=80
)
narration = response.choices[0].message.content
with open("narration.txt", "w") as f:
    f.write(narration)
print(f"\n✨ Narration: {narration}")

# Convert to speech with ElevenLabs
url = "https://api.elevenlabs.io/v1/text-to-speech/pNInz6obpgDQGcFmaJgB"  # Adam voice
headers = {
    "xi-api-key": "your_elevenlabs_api_key",  # Replace with your ElevenLabs API key
    "Content-Type": "application/json"
}
payload = {
    "text": narration,
    "model_id": "eleven_monolingual_v1",
    "voice_settings": {"stability": 0.5, "similarity_boost": 0.5}
}
response = requests.post(url, headers=headers, json=payload)
if response.status_code == 200:
    with open("output.mp3", "wb") as f:
        f.write(response.content)
    print("✅ Audio saved as output.mp3")
else:
    print(f"Error: {response.status_code} - {response.text}")