In [1]:
import sys
import pandas as pd
sys.path.append("../")
from pathlib import Path

### Retrieve City Council meetings CSV created using meetings.ipynb

In [9]:
# Read the meetings CSV file produced by meetings.ipynb
meeting_data = "../data/meetings.csv"
meetings_df = pd.read_csv(meeting_data)

# Convert date column to datetime so it sorts as dates and supports
# strftime() when the dropdown labels are built
meetings_df['date'] = pd.to_datetime(meetings_df['date'])

# Sort by date in descending order (latest first)
meetings_df = meetings_df.sort_values('date', ascending=False)

# Keep only the regular council meetings.
# na=False: a row with a missing meeting name is treated as "no match";
#   without it the mask contains NaN and boolean indexing raises ValueError.
# regex=False: the pattern is a plain substring, not a regular expression.
city_council_meetings = meetings_df[
    meetings_df["meeting"].str.contains(
        "Regular Council Meeting", na=False, regex=False
    )
]

city_council_meetings.head()

  meetings_df['date'] = pd.to_datetime(meetings_df['date'])


Unnamed: 0,meeting,date,duration,agenda,video,duration_minutes
0,Regular Council Meeting,2025-03-05 17:02:00,00h 39m,https://tulsa-ok.granicus.com/AgendaViewer.php...,https://tulsa-ok.granicus.com/MediaPlayer.php?...,39
1,Regular Council Meeting,2025-02-26 17:00:00,00h 38m,https://tulsa-ok.granicus.com/AgendaViewer.php...,https://tulsa-ok.granicus.com/MediaPlayer.php?...,38
2,Regular Council Meeting,2025-02-12 17:00:00,00h 31m,https://tulsa-ok.granicus.com/AgendaViewer.php...,https://tulsa-ok.granicus.com/MediaPlayer.php?...,31
3,Regular Council Meeting,2025-02-05 17:00:00,00h 29m,https://tulsa-ok.granicus.com/AgendaViewer.php...,https://tulsa-ok.granicus.com/MediaPlayer.php?...,29
4,Regular Council Meeting,2025-01-29 17:03:00,01h 13m,https://tulsa-ok.granicus.com/AgendaViewer.php...,https://tulsa-ok.granicus.com/MediaPlayer.php?...,73


### Select a meeting to work on

In [10]:
import ipywidgets as wdgt

# Build the lookup used by the dropdown and by update_video_url:
#   "<meeting> - <YYYY-MM-DD>" -> {'url': <video URL>, 'label': <same label>}
# Rows that produce an identical label overwrite earlier ones.
options = {
    meeting_label: {'url': meeting_url, 'label': meeting_label}
    for meeting_label, meeting_url in (
        (f"{row['meeting']} - {row['date'].strftime('%Y-%m-%d')}", row['video'])
        for _, row in city_council_meetings.iterrows()
    )
}

# The dropdown displays the labels; its value is the selected meeting's video URL
video_select = wdgt.Dropdown(
    description='Select Video:',
    options={meeting_label: entry['url'] for meeting_label, entry in options.items()},
    layout=wdgt.Layout(width='75%'),
    style={'description_width': 'initial'},
)

# Keep the video_url and file_name globals in sync with the dropdown selection
def update_video_url(change):
    """Handle a dropdown change event.

    Stores the newly selected URL (``change.new``) in the global ``video_url``
    and derives the global ``file_name`` from the label of the first entry in
    the global ``options`` whose ``'url'`` equals that URL.

    The file name is the lower-cased label with spaces and hyphens turned into
    underscores and every remaining character that is neither alphanumeric nor
    an underscore removed.

    Raises:
        IndexError: if no entry in ``options`` has the selected URL.
    """
    global video_url, file_name
    video_url = change.new

    # Labels whose entry points at the selected URL; the first one wins
    matching_labels = [
        label for label, entry in options.items() if entry['url'] == change.new
    ]
    slug = matching_labels[0].lower()

    # Spaces and hyphens both become underscores
    for separator in (' ', '-'):
        slug = slug.replace(separator, '_')

    # Drop anything that is not alphanumeric or an underscore
    file_name = ''.join(filter(lambda ch: ch.isalnum() or ch == '_', slug))

# Re-run update_video_url every time the dropdown's value changes
video_select.observe(update_video_url, names='value')

# observe() callbacks fire only on *changes*. Without this step the globals
# video_url / file_name are never set when the user keeps the preselected
# meeting, and the download cell fails with NameError. Seed them from the
# dropdown's initial value (None when there are no meetings to choose from).
if video_select.value is not None:
    from types import SimpleNamespace  # minimal stand-in for a change event

    update_video_url(SimpleNamespace(new=video_select.value))

display(video_select)


Dropdown(description='Select Video:', layout=Layout(width='75%'), options={'Regular Council Meeting - 2025-03-…

### Download the mp4 file from the meeting player page
The meeting list includes a URL to the granicus.com video player.
The player page includes a video stream URL, which is not practical to use for downloading. The function `get_video_player` returns a `GranicusPlayerPage` object that includes a `download_url`, which is created from the streaming URL.


In [11]:
import os
import sys
from pathlib import Path
import requests
from src.models.meeting import GranicusPlayerPage
from src.granicus import get_video_player
from src.videos import download_file
# You may need to turn off VPN
sys.path.append("../")  # Make sure we can import from scripts

# Create output directory if it doesn't exist
VIDEO_DIRECTORY = Path("../data/video")
VIDEO_DIRECTORY.mkdir(parents=True, exist_ok=True)

# Define output path for the video
output_path = VIDEO_DIRECTORY / f"{file_name}.mp4"

# Get video player page info
player_page: GranicusPlayerPage = await get_video_player(video_url)

# Run the download
video_file = download_file(player_page.download_url, output_path)

# Display the result
if video_file:
    print(f"Video saved to: {video_file}")

Downloading video from: http://archive-video.granicus.com/tulsa-ok/tulsa-ok_d4578b1d-786e-49f9-9230-87b182dfffaf.mp4
Saving to: ../data/video/regular_council_meeting___2025_02_26.mp4
Downloaded 5% (30.0 MB / 588.8 MB)
Downloaded 5% (31.0 MB / 588.8 MB)
Downloaded 5% (32.0 MB / 588.8 MB)
Downloaded 5% (33.0 MB / 588.8 MB)
Downloaded 5% (34.0 MB / 588.8 MB)
Downloaded 5% (35.0 MB / 588.8 MB)
Downloaded 10% (59.0 MB / 588.8 MB)
Downloaded 10% (60.0 MB / 588.8 MB)
Downloaded 10% (61.0 MB / 588.8 MB)
Downloaded 10% (62.0 MB / 588.8 MB)
Downloaded 10% (63.0 MB / 588.8 MB)
Downloaded 10% (64.0 MB / 588.8 MB)
Downloaded 15% (89.0 MB / 588.8 MB)
Downloaded 15% (90.0 MB / 588.8 MB)
Downloaded 15% (91.0 MB / 588.8 MB)
Downloaded 15% (92.0 MB / 588.8 MB)
Downloaded 15% (93.0 MB / 588.8 MB)
Downloaded 15% (94.0 MB / 588.8 MB)
Downloaded 20% (118.0 MB / 588.8 MB)
Downloaded 20% (119.0 MB / 588.8 MB)
Downloaded 20% (120.0 MB / 588.8 MB)
Downloaded 20% (121.0 MB / 588.8 MB)
Downloaded 20% (122.0 MB / 

### Use the downloaded mp4 file to extract a wav file

In [12]:
import asyncio
import os
from pathlib import Path
from src.videos import save_audio


# Create output directory for audio files if it doesn't exist
audio_dir = Path("../data/audio")
audio_dir.mkdir(parents=True, exist_ok=True)

# Assuming video_file contains the path to the downloaded video
# This would be the result from the previous download_video call
if not os.path.exists(video_file):
    print(f"Error: Video file not found at {video_file}")
else:
    # Define output path for the audio
    video_filename = os.path.basename(video_file)
    base_filename = os.path.splitext(video_filename)[0]
    audio_path = audio_dir / f"{base_filename}.wav"

    print(f"Extracting audio from: {video_file}")
    print(f"Saving audio to: {audio_path}")

    audio_file = await save_audio(
                video_path=video_file,
                output_path=str(audio_path),
            )

    if audio_file:
        print(f"Audio saved to: {audio_file}")
        print(f"Audio file size: {os.path.getsize(audio_file) / (1024 * 1024):.2f} MB")

Extracting audio from: ../data/video/regular_council_meeting___2025_02_26.mp4
Saving audio to: ../data/audio/regular_council_meeting___2025_02_26.wav
Audio saved to: ../data/audio/regular_council_meeting___2025_02_26.wav
Audio file size: 70.34 MB


### Convert the video file into a transcript
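This step requires a Hugging Face login and API token.
You will also need to agree to the terms of each of the following models:
- guillaumekln/faster-whisper
- TODO: name the remaining model(s) whose terms must be accepted (the diarization cell below loads a pyannote pipeline)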


In [2]:
from src.videos import transcribe_video

video_file = "../data/video/regular_council_meeting___2025_02_26.mp4"

transcription_dir = Path("../data/transcripts")

transcription = await transcribe_video(video_file, transcription_dir)

2025-03-11 17:02:18,669 - src.huggingface - INFO - Successfully loaded Whisper model: tiny
2025-03-11 17:02:18,671 - src.videos - INFO - Transcribing video: ../data/video/regular_council_meeting___2025_02_26.mp4
2025-03-11 17:02:18,671 - src.videos - INFO - transcription will be saved to: ../data/transcripts/regular_council_meeting___2025_02_26.json
2025-03-11 17:02:21,501 - faster_whisper - INFO - Processing audio with duration 38:25.014
2025-03-11 17:02:22,394 - faster_whisper - INFO - Detected language 'en' with probability 0.38
2025-03-11 17:02:22,398 - src.videos - INFO - Processing transcription segments...
2025-03-11 17:04:23,388 - src.videos - INFO - Transcription completed in 124.72 seconds
2025-03-11 17:04:23,388 - src.videos - INFO - Detailed JSON saved to: ../data/transcripts/regular_council_meeting___2025_02_26.json


In [None]:
from src.videos import transcribe_video_with_diarization

video_file = "../data/video/regular_council_meeting___2025_02_26.mp4"

transcription_dir = Path("../data/transcripts")

transcription = await transcribe_video_with_diarization(video_file, transcription_dir)

INFO:src.videos:Transcribing video with speaker diarization: ../data/video/regular_council_meeting___2025_02_26.mp4
INFO:src.videos:Output will be saved to: ../data/transcripts/regular_council_meeting___2025_02_26.diarized.json
INFO:src.huggingface:Auto-detected device: cpu
INFO:src.huggingface:Auto-selected compute_type: int8
INFO:src.huggingface:Loading WhisperX model: medium on cpu with int8 precision
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.10/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`
INFO:src.huggingface:Loading diarization pipeline


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.


INFO:src.huggingface:WhisperX model loaded in 2.23 seconds
INFO:src.videos:Running initial transcription with batch size 8...


Detected language: en (0.60) in first 30s of audio...


INFO:src.videos:Detected language: en
INFO:src.videos:Loading alignment model for detected language: en
INFO:src.videos:Aligning transcription with audio...
INFO:src.videos:Running speaker diarization...
  std = sequences.std(dim=-1, correction=1)
