In [62]:
!pip install pytube moviepy openai-whisper pyannote.audio torch transformers librosa --quiet

import os, torch
import whisper
from pytube import YouTube
from moviepy.editor import AudioFileClip
from transformers import pipeline
from pyannote.audio import Pipeline
from collections import defaultdict



In [63]:
# Download & Preprocess Audio
!pip install yt-dlp --quiet

import yt_dlp

def download_youtube_audio(url, out_file="call.wav"):
    """
    Downloads audio from a YouTube video and converts it to WAV format.
    Uses yt-dlp for reliability.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'call_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Rename to consistent output
    if os.path.exists("call_audio.wav"):
        os.rename("call_audio.wav", out_file)

    return out_file

# Example usage
yt_url = "https://www.youtube.com/watch?v=4ostqJD3Psc"
audio_file = download_youtube_audio(yt_url)
print("✅ Audio saved as:", audio_file)

[youtube] Extracting URL: https://www.youtube.com/watch?v=4ostqJD3Psc
[youtube] 4ostqJD3Psc: Downloading webpage
[youtube] 4ostqJD3Psc: Downloading tv simply player API JSON
[youtube] 4ostqJD3Psc: Downloading tv client config
[youtube] 4ostqJD3Psc: Downloading tv player API JSON
[info] 4ostqJD3Psc: Downloading 1 format(s): 251
[download] Destination: call_audio.webm
[download] 100% of    1.99MiB in 00:00:00 at 11.46MiB/s  
[ExtractAudio] Destination: call_audio.wav
Deleting original file call_audio.webm (pass -k to keep)
✅ Audio saved as: call.wav


In [64]:
# GROUP 3: Speech-to-Text (Whisper)
# --------------------------
def transcribe_audio(audio_path, model_size="small"):
    model = whisper.load_model(model_size)
    result = model.transcribe(audio_path)
    return result["text"], result["segments"]

transcript, segments = transcribe_audio(audio_file)





### Combine Transcription and Diarization

Now that we have the transcription segments and the speaker diarization segments, we can combine them to create a final transcript with speaker labels.

In [65]:
def combine_transcription_diarization(segments, speaker_segments):
    """
    Combines transcription segments with speaker diarization segments.

    Parameters:
        segments (list): List of transcription segments from Whisper.
        speaker_segments (list): List of speaker segments from diarization.

    Returns:
        list: List of combined segments with speaker labels and text.
    """
    combined_segments = []
    speaker_index = 0

    for segment in segments:
        segment_start = segment['start']
        segment_end = segment['end']
        segment_text = segment['text'].strip()

        # Find the speaker for this transcription segment
        current_speaker = "Unknown Speaker"
        for i in range(speaker_index, len(speaker_segments)):
            diarization_start = speaker_segments[i]['start']
            diarization_end = speaker_segments[i]['end']

            # Check for overlap between transcription segment and diarization segment
            if max(segment_start, diarization_start) < min(segment_end, diarization_end):
                current_speaker = speaker_segments[i]['speaker']
                speaker_index = i  # Start searching from this speaker segment next time
                break

        combined_segments.append({
            "speaker": current_speaker,
            "start": segment_start,
            "end": segment_end,
            "text": segment_text
        })

    return combined_segments

# Assuming 'segments' is the output from the transcribe_audio function
# and 'speaker_segments' is the output from the diarize_speakers function
# (using the corrected code from the previous turn if needed)

final_transcript = combine_transcription_diarization(segments, speaker_segments)

# Print the final transcript
for entry in final_transcript:
    print(f"[{entry['start']:.2f}s - {entry['end']:.2f}s] {entry['speaker']}: {entry['text']}")

[0.00s - 9.36s] Speaker_1: Thank you for calling Nissan.
[9.36s - 10.36s] Speaker_2: My name is Lauren.
[10.36s - 11.36s] Speaker_2: Can I have your name?
[11.36s - 14.16s] Speaker_2: Yeah, my name is John Smith.
[14.16s - 15.16s] Speaker_2: Thank you, John.
[15.16s - 16.16s] Speaker_1: How can I help you?
[16.16s - 20.60s] Speaker_2: I was just calling about to see how much it would cost to update the map in my car.
[20.60s - 22.48s] Speaker_2: I'd be happy to help you with that today.
[22.48s - 24.00s] Speaker_1: Did you receive a mail or from us?
[24.00s - 25.00s] Speaker_2: I did.
[25.00s - 26.48s] Speaker_1: Do you need the customer number?
[26.48s - 27.48s] Speaker_1: Yes, please.
[27.48s - 28.48s] Speaker_1: Okay.
[28.48s - 31.00s] Speaker_1: I have a 15243.
[31.00s - 32.00s] Speaker_1: Thank you.
[32.00s - 33.32s] Speaker_1: And the year-making model of your vehicle?
[33.32s - 36.88s] Speaker_1: Yeah, I have a 2009 Nissan Altima.
[36.88s - 38.36s] Speaker_2: Oh, nice car.
[38.3

In [66]:
# GROUP 5: Metrics Extraction
# --------------------------
def compute_metrics(transcript, speaker_segments):
    # Talk-time ratio
    talk_time = defaultdict(float)
    for seg in speaker_segments:
        talk_time[seg["speaker"]] += seg["duration"]
    total_time = sum(talk_time.values()) or 1
    talk_ratio = {sp: round((dur/total_time)*100, 2) for sp, dur in talk_time.items()}

    # Longest monologue
    longest_mono = max(speaker_segments, key=lambda x: x["duration"])

    # Questions count
    question_count = transcript.count("?")

    return talk_ratio, longest_mono, question_count

talk_ratio, longest_mono, question_count = compute_metrics(transcript, speaker_segments)


In [67]:
 # GROUP 6: Sentiment Analysis
# --------------------------
def analyze_sentiment(text, sample_size=500):
    sentiment_analyzer = pipeline("sentiment-analysis")
    sample_text = text[:sample_size]
    return sentiment_analyzer(sample_text)[0]["label"]

sentiment = analyze_sentiment(transcript)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [68]:
# GROUP 7: Actionable Insight + Bonus
# --------------------------
def generate_insight(talk_ratio, question_count):
    if not talk_ratio:
        return "Insufficient data."

    dominant = max(talk_ratio, key=talk_ratio.get)
    if talk_ratio[dominant] > 70:
        return f"{dominant} dominated the call. Encourage balanced dialogue."
    elif question_count < 3:
        return "Too few questions. Encourage rep to ask more."
    else:
        return "Good balance of talk and engagement."

insight = generate_insight(talk_ratio, question_count)

# Bonus: Identify Sales Rep vs Customer
sales_rep = max(talk_ratio, key=talk_ratio.get, default="Unknown")
customer = min(talk_ratio, key=talk_ratio.get, default="Unknown")

In [69]:
# GROUP 8: Results
# --------------------------
print("📊 Talk-time Ratio:", talk_ratio)
print("❓ Questions Asked:", question_count)
print("🗣️ Longest Monologue:", round(longest_mono['duration'], 2), "seconds by", longest_mono['speaker'])
print("😊 Call Sentiment:", sentiment)
print("💡 Actionable Insight:", insight)
print("👤 Likely Sales Rep:", sales_rep, "| Customer:", customer)

📊 Talk-time Ratio: {'Speaker_1': np.float64(51.58), 'Speaker_2': np.float64(48.42)}
❓ Questions Asked: 8
🗣️ Longest Monologue: 1.71 seconds by Speaker_2
😊 Call Sentiment: POSITIVE
💡 Actionable Insight: Good balance of talk and engagement.
👤 Likely Sales Rep: Speaker_1 | Customer: Speaker_2


### Finish Task

The task is complete. We have successfully downloaded the audio from the YouTube video, transcribed it, performed speaker diarization, and combined the results to produce a transcript with speaker labels.